[utils] add a chunked shuffle, since the concatenated files may grow larger than memory
src/chunked_shuffle (new file, 22 lines)
@@ -0,0 +1,22 @@
#!/bin/sh
set -e

if [ "$#" -lt 3 ]; then
    echo "Usage: chunked_shuffle filename parts outfile"
    exit 1
fi

filename=$1
parts=$2
outfile=$3

# Pass 1: scatter each input line into one of $parts chunk files,
# chosen uniformly at random, so no single chunk has to hold the
# whole input.
awk -v parts="$parts" -v filename="$filename" \
    'BEGIN { srand(); } { print > (filename "." int(rand() * parts)) }' "$filename"

# Pass 2: shuffle each chunk in memory and append it to a temporary
# output file, removing chunks as they are consumed.
tmp_outfile=$filename.out
> "$tmp_outfile"

for i in $(seq 0 $((parts - 1))); do
    shuf "$filename.$i" >> "$tmp_outfile"
    rm "$filename.$i"
done

mv "$tmp_outfile" "$outfile"
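A hypothetical invocation (file names invented for illustration), shuffling a large concatenated corpus in 20 chunks:

    sh src/chunked_shuffle corpus.txt 20 corpus.shuffled.txt

Since every line is first scattered into a uniformly random chunk and each chunk is then fully shuffled, the concatenated output is a uniform shuffle of the input (up to the quality of awk's rand()), while shuf only ever holds one chunk, roughly 1/parts of the input, in memory at a time.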