From b76b7b852778d076335ad840e2ebffe8da233aed Mon Sep 17 00:00:00 2001
From: Al
Date: Sun, 5 Mar 2017 02:15:03 -0500
Subject: [PATCH] [parser] adding chunked shuffle as a C function (writes each
 line to one of n random files, runs shuf on each file and concatenates the
 result). Adding a version which allows specifying a specific chunk size, and
 using a 2GB limit for address parser training. Allowing gshuf again for Mac
 as it seems the only problem there was not having enough memory when testing
 on a Mac laptop. The new limited-memory version should be fast enough.

---
 configure.ac               |  2 +
 src/address_parser_train.c |  8 ++--
 src/shuffle.c              | 97 ++++++++++++++++++++++++++++++++++++++
 src/shuffle.h              |  2 +
 4 files changed, 106 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index b790c546..953f29c6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -57,8 +57,10 @@ AC_CONFIG_FILES([Makefile test/Makefile])
 
 AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
+AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
 
 AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf available])])
+AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
 
 # ------------------------------------------------------------------
 # Checks for SSE2 build
 
diff --git a/src/address_parser_train.c b/src/address_parser_train.c
index 70d56c48..c7a76a2a 100644
--- a/src/address_parser_train.c
+++ b/src/address_parser_train.c
@@ -20,6 +20,9 @@ KHASH_MAP_INIT_STR(phrase_stats, phrase_stats_t)
 KHASH_MAP_INIT_STR(postal_code_context_phrases, khash_t(str_set) *)
 KHASH_MAP_INIT_STR(phrase_types, address_parser_types_t)
 
+#define CHUNK_SIZE_MB ((size_t)1024 * 1024)
+#define CHUNK_SIZE_GB ((size_t)1024 * CHUNK_SIZE_MB)
+#define DEFAULT_SHUFFLE_CHUNK_SIZE ((size_t)2 * CHUNK_SIZE_GB)
 
 // Training
 
@@ -1011,10 +1014,10 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
 
         trainer->iterations = iter;
 
-        #if defined(HAVE_SHUF)
+        #if defined(HAVE_SHUF) || defined(HAVE_GSHUF)
         log_info("Shuffling\n");
 
-        if (!shuffle_file(filename)) {
+        if (!shuffle_file_chunked_size(filename, DEFAULT_SHUFFLE_CHUNK_SIZE)) {
             log_error("Error in shuffle\n");
             averaged_perceptron_trainer_destroy(trainer);
             return false;
@@ -1033,7 +1036,6 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
 
     log_debug("Done with training, averaging weights\n");
 
     self->model = averaged_perceptron_trainer_finalize(trainer);
-
     return true;
 }
diff --git a/src/shuffle.c b/src/shuffle.c
index db9979e2..f714d127 100644
--- a/src/shuffle.c
+++ b/src/shuffle.c
@@ -3,11 +3,14 @@
 #include <stdlib.h>
 #include "string_utils.h"
 
+// Run shuf/gshuf on a file in-place if the shuf command is available.
 bool shuffle_file(char *filename) {
     char *shuffle_command = NULL;
 
     #if defined(HAVE_SHUF)
     shuffle_command = "shuf";
+    #elif defined(HAVE_GSHUF)
+    shuffle_command = "gshuf";
     #else
     return false;
     #endif
@@ -22,3 +25,97 @@ bool shuffle_file(char *filename) {
 
     return ret == EXIT_SUCCESS;
 }
+
+// Assign each line of the file randomly to n chunks and shuffle each file sequentially in-memory
+// This approach will produce a random permutation of the lines using limited memory
+bool shuffle_file_chunked(char *filename, size_t parts) {
+    char *shuffle_command = NULL;
+
+    // Linux
+    #if defined(HAVE_SHUF)
+    shuffle_command = "shuf";
+    // Mac
+    #elif defined(HAVE_GSHUF)
+    shuffle_command = "gshuf";
+    #else
+    return false;
+    #endif
+
+    if (filename == NULL) {
+        return false;
+    }
+
+    // Make sure the input file exists
+    FILE *f = fopen(filename, "r");
+    if (f == NULL) {
+        return false;
+    }
+    fclose(f);
+
+    // This is an in-place shuffle to keep the API simple
+    char *outfile = filename;
+
+    char_array *command = char_array_new();
+
+    // Split the file randomly into $parts files
+    // Need to be assigned randomly, not just every nth line or it's not really a random permutation
+    char_array_cat_printf(command, "awk -v parts=%zu -v filename=%s 'BEGIN{srand();} { print > filename\".\"int(rand() * parts) }'", parts, filename);
+
+    int ret = system(char_array_get_string(command));
+    if (ret != EXIT_SUCCESS) {
+        goto exit_char_array_allocated;
+    }
+
+    // Run shuf sequentially on each of the $parts files
+    // This should be sequential, not parallelized as the goal is
+    // to limit memory usage when shuffling large files
+    for (size_t i = 0; i < parts; i++) {
+        char_array_clear(command);
+        // touch first: awk only creates chunk i if at least one line hashed to it,
+        // and shuf/rm on a missing file would abort the whole shuffle
+        char_array_cat_printf(command, "touch %s.%zu && %s %s.%zu %s %s.tmp", filename, i, shuffle_command, filename, i, i > 0 ? ">>" : ">", outfile);
+        ret = system(char_array_get_string(command));
+        if (ret != EXIT_SUCCESS) {
+            goto exit_char_array_allocated;
+        }
+
+        // Delete the temp file
+        char_array_clear(command);
+        char_array_cat_printf(command, "rm %s.%zu", filename, i);
+        ret = system(char_array_get_string(command));
+        if (ret != EXIT_SUCCESS) {
+            goto exit_char_array_allocated;
+        }
+    }
+
+    char_array_clear(command);
+    char_array_cat_printf(command, "mv %s.tmp %s", outfile, outfile);
+    ret = system(char_array_get_string(command));
+    if (ret != EXIT_SUCCESS) {
+        goto exit_char_array_allocated;
+    }
+
+exit_char_array_allocated:
+    char_array_destroy(command);
+    return ret == EXIT_SUCCESS;
+}
+
+// Shuffle a file in-place, specifying a rough upper bound on system memory
+bool shuffle_file_chunked_size(char *filename, size_t chunk_size) {
+    FILE *f = fopen(filename, "r");
+
+    if (f == NULL) return false;
+
+    fseek(f, 0L, SEEK_END);
+    // ftell returns long (-1 on error); on error or chunk_size == 0 fall back to one part
+    long size = ftell(f);
+    fclose(f);
+
+    size_t parts = (size < 0 || chunk_size == 0) ? 1 : ((size_t)size / chunk_size + 1);
+    // If the file is smaller than the chunk size, do a
+    // simple in-memory shuffle of the whole file
+    if (parts == 1) {
+        return shuffle_file(filename);
+    }
+
+    return shuffle_file_chunked(filename, parts);
+}
+
diff --git a/src/shuffle.h b/src/shuffle.h
index c715d08e..4c1e208e 100644
--- a/src/shuffle.h
+++ b/src/shuffle.h
@@ -5,5 +5,7 @@
 #include <stdbool.h>
 
 bool shuffle_file(char *filename);
+bool shuffle_file_chunked(char *filename, size_t parts);
+bool shuffle_file_chunked_size(char *filename, size_t chunk_size);
 
 #endif
\ No newline at end of file