From 7298c895c802d43e27cef609a35c85ea2935a36b Mon Sep 17 00:00:00 2001
From: Al
Date: Mon, 21 Nov 2016 14:04:34 -0500
Subject: [PATCH] [utils] adding a chunked shuffle as the concatenated file sizes may get larger than memory

---
 src/chunked_shuffle | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 src/chunked_shuffle

diff --git a/src/chunked_shuffle b/src/chunked_shuffle
new file mode 100644
index 00000000..56ba75c6
--- /dev/null
+++ b/src/chunked_shuffle
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+#
+# chunked_shuffle: shuffle the lines of a file that may be too large to
+# shuffle in memory in one go.
+#
+# Usage: chunked_shuffle filename parts outfile
+#
+#   filename  input file, read line by line
+#   parts     number of chunks to split the input into (positive integer)
+#   outfile   destination for the shuffled lines
+#
+# Every input line is written to one of `parts` chunk files picked at random,
+# each chunk is then shuffled on its own with shuf, and the shuffled chunks
+# are concatenated into outfile.
+set -euo pipefail
+
+if [ "$#" -lt 3 ]; then
+  echo "Usage: chunked_shuffle filename parts outfile" >&2
+  exit 1
+fi
+
+filename=$1
+parts=$2
+outfile=$3
+
+# parts drives both the awk chunk index and the loop bound below; zero or a
+# non-number would leave lines in chunks that are never collected.
+if ! [[ "$parts" =~ ^[1-9][0-9]*$ ]]; then
+  echo "chunked_shuffle: parts must be a positive integer, got '$parts'" >&2
+  exit 1
+fi
+
+if [[ ! -r "$filename" ]]; then
+  echo "chunked_shuffle: cannot read '$filename'" >&2
+  exit 1
+fi
+
+# Chunks and the partial output live in a private directory created next to
+# the input, so they cannot clobber existing files and are removed on every
+# exit path, including failures.
+work_dir=$(mktemp -d -- "${filename}.chunks.XXXXXX")
+trap 'rm -rf -- "$work_dir"' EXIT
+
+# The chunk prefix is passed through the environment because awk -v would
+# interpret backslash escapes in the path, and the input is fed on stdin so a
+# file name containing '=' is not mistaken for an awk variable assignment.
+CHUNK_PREFIX="$work_dir/chunk." awk -v parts="$parts" '
+  BEGIN { srand(); prefix = ENVIRON["CHUNK_PREFIX"] }
+  { print > (prefix int(rand() * parts)) }
+' < "$filename"
+
+tmp_outfile=$work_dir/out
+: > "$tmp_outfile"
+
+for ((i = 0; i < parts; i++)); do
+  chunk=$work_dir/chunk.$i
+  # awk only creates a chunk once a line lands in it, so a chunk may be
+  # missing when the input is short relative to parts.
+  [[ -e "$chunk" ]] || continue
+  shuf -- "$chunk" >> "$tmp_outfile"
+  rm -f -- "$chunk"
+done
+
+mv -- "$tmp_outfile" "$outfile"