[utils] add a chunked shuffle, since concatenated files may grow larger than memory

This commit is contained in:
Al
2016-11-21 14:04:34 -05:00
parent eff0443fcf
commit 7298c895c8

22
src/chunked_shuffle Normal file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
#
# chunked_shuffle: shuffle a file that may be larger than memory.
#
# Scatter pass: each input line is written to one of <parts> chunk files
# chosen uniformly at random. Gather pass: each chunk is shuffled with
# shuf (only one chunk needs to fit in memory at a time) and appended to
# the output. Uniform random bucket assignment followed by a per-bucket
# shuffle yields a uniform random permutation of the input lines.
#
# Usage: chunked_shuffle filename parts outfile
set -euo pipefail

if [ "$#" -lt 3 ]; then
  echo "Usage: chunked_shuffle filename parts outfile" >&2
  exit 1
fi

filename=$1
parts=$2
outfile=$3

# Scatter: write each line to <filename>.<k>, k in [0, parts).
# NOTE(review): awk keeps all chunk files open simultaneously, so parts
# must stay below the process open-file limit (ulimit -n).
awk -v parts="$parts" -v filename="$filename" \
  'BEGIN { srand() } { print > (filename "." int(rand() * parts)) }' \
  "$filename"

# Gather: shuffle each chunk into a temp file, then rename at the end so
# a partially-written run never clobbers an existing outfile.
tmp_outfile=$filename.out
: > "$tmp_outfile"
for ((i = 0; i < parts; i++)); do
  # A chunk may not exist if no line happened to land in it (small input
  # with many parts); the original script aborted here under set -e.
  [ -e "$filename.$i" ] || continue
  shuf "$filename.$i" >> "$tmp_outfile"
  rm -- "$filename.$i"
done
mv -- "$tmp_outfile" "$outfile"