From b76b7b852778d076335ad840e2ebffe8da233aed Mon Sep 17 00:00:00 2001
From: Al
Date: Sun, 5 Mar 2017 02:15:03 -0500
Subject: [PATCH] [parser] adding chunked shuffle as a C function (writes each
 line to one of n random files, runs shuf on each file and concatenates the
 result). Adding a version which allows specifying a specific chunk size, and
 using a 2GB limit for address parser training. Allowing gshuf again for Mac
 as it seems the only problem there was not having enough memory when testing
 on a Mac laptop. The new limited-memory version should be fast enough.

---
 configure.ac               |  2 +
 src/address_parser_train.c |  8 ++--
 src/shuffle.c              | 97 ++++++++++++++++++++++++++++++++++++++
 src/shuffle.h              |  2 +
 4 files changed, 106 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index b790c546..953f29c6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -57,8 +57,10 @@ AC_CONFIG_FILES([Makefile test/Makefile])
 
 AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
+AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
 
 AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf available])])
+AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
 
 # ------------------------------------------------------------------
 # Checks for SSE2 build
 
diff --git a/src/address_parser_train.c b/src/address_parser_train.c
index 70d56c48..c7a76a2a 100644
--- a/src/address_parser_train.c
+++ b/src/address_parser_train.c
@@ -20,6 +20,9 @@ KHASH_MAP_INIT_STR(phrase_stats, phrase_stats_t)
 KHASH_MAP_INIT_STR(postal_code_context_phrases, khash_t(str_set) *)
 KHASH_MAP_INIT_STR(phrase_types, address_parser_types_t)
 
+#define CHUNK_SIZE_MB ((size_t)1024 * 1024)
+#define CHUNK_SIZE_GB ((size_t)1024 * CHUNK_SIZE_MB)
+#define DEFAULT_SHUFFLE_CHUNK_SIZE ((size_t)2 * CHUNK_SIZE_GB)
 
 // Training
 
@@ -1011,10 +1014,10 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
 
         trainer->iterations = iter;
 
-        #if defined(HAVE_SHUF)
+        #if defined(HAVE_SHUF) || defined(HAVE_GSHUF)
         log_info("Shuffling\n");
 
-        if (!shuffle_file(filename)) {
+        if (!shuffle_file_chunked_size(filename, DEFAULT_SHUFFLE_CHUNK_SIZE)) {
             log_error("Error in shuffle\n");
             averaged_perceptron_trainer_destroy(trainer);
             return false;
@@ -1033,7 +1036,6 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
 
     log_debug("Done with training, averaging weights\n");
 
     self->model = averaged_perceptron_trainer_finalize(trainer);
-
     return true;
 }
diff --git a/src/shuffle.c b/src/shuffle.c
index db9979e2..f714d127 100644
--- a/src/shuffle.c
+++ b/src/shuffle.c
@@ -3,11 +3,14 @@
 #include <stdlib.h>
 #include "string_utils.h"
 
+// Run shuf/gshuf on a file in-place if the shuf command is available.
 bool shuffle_file(char *filename) {
     char *shuffle_command = NULL;
 
     #if defined(HAVE_SHUF)
     shuffle_command = "shuf";
+    #elif defined(HAVE_GSHUF)
+    shuffle_command = "gshuf";
     #else
     return false;
     #endif
@@ -22,3 +25,97 @@ bool shuffle_file(char *filename) {
 
     return ret == EXIT_SUCCESS;
 }
+
+// Assign each line of the file randomly to n chunks and shuffle each file sequentially in-memory
+// This approach will produce a random permutation of the lines using limited memory
+bool shuffle_file_chunked(char *filename, size_t parts) {
+    char *shuffle_command = NULL;
+
+    // Linux
+    #if defined(HAVE_SHUF)
+    shuffle_command = "shuf";
+    // Mac
+    #elif defined(HAVE_GSHUF)
+    shuffle_command = "gshuf";
+    #else
+    return false;
+    #endif
+
+    if (filename == NULL) {
+        return false;
+    }
+
+    // Make sure the input file exists
+    FILE *f = fopen(filename, "r");
+    if (f == NULL) {
+        return false;
+    }
+    fclose(f);
+
+    // This is an in-place shuffle to keep the API simple
+    char *outfile = filename;
+
+    char_array *command = char_array_new();
+
+    // Split the file randomly into $parts files
+    // Need to be assigned randomly, not just every nth line or it's not really a random permutation
+    char_array_cat_printf(command, "awk -v parts=%zu -v filename=%s 'BEGIN{srand();} { print > filename\".\"int(rand() * parts) }'", parts, filename);
+
+    int ret = system(char_array_get_string(command));
+    if (ret != EXIT_SUCCESS) {
+        goto exit_char_array_allocated;
+    }
+
+    // Run shuf sequentially on each of the $parts files
+    // This should be sequential, not parallelized as the goal is
+    // to limit memory usage when shuffling large files
+    for (size_t i = 0; i < parts; i++) {
+        char_array_clear(command);
+        // touch first: awk only creates chunk i if at least one line hashed to it,
+        // and shuf/rm on a missing file would abort the whole shuffle
+        char_array_cat_printf(command, "touch %s.%zu && %s %s.%zu %s %s.tmp", filename, i, shuffle_command, filename, i, i > 0 ? ">>" : ">", outfile);
+        ret = system(char_array_get_string(command));
+        if (ret != EXIT_SUCCESS) {
+            goto exit_char_array_allocated;
+        }
+
+        // Delete the temp file
+        char_array_clear(command);
+        char_array_cat_printf(command, "rm %s.%zu", filename, i);
+        ret = system(char_array_get_string(command));
+        if (ret != EXIT_SUCCESS) {
+            goto exit_char_array_allocated;
+        }
+    }
+
+    char_array_clear(command);
+    char_array_cat_printf(command, "mv %s.tmp %s", outfile, outfile);
+    ret = system(char_array_get_string(command));
+    if (ret != EXIT_SUCCESS) {
+        goto exit_char_array_allocated;
+    }
+
+exit_char_array_allocated:
+    char_array_destroy(command);
+    return ret == EXIT_SUCCESS;
+}
+
+// Shuffle a file in-place, specifying a rough upper bound on system memory
+bool shuffle_file_chunked_size(char *filename, size_t chunk_size) {
+    FILE *f = fopen(filename, "r");
+
+    if (f == NULL) return false;
+
+    fseek(f, 0L, SEEK_END);
+    // ftell returns long (-1 on error); on error or chunk_size == 0 fall back to one part
+    long size = ftell(f);
+    fclose(f);
+
+    size_t parts = (size < 0 || chunk_size == 0) ? 1 : ((size_t)size / chunk_size + 1);
+    // If the file is smaller than the chunk size, do a
+    // simple in-memory shuffle of the whole file
+    if (parts == 1) {
+        return shuffle_file(filename);
+    }
+
+    return shuffle_file_chunked(filename, parts);
+}
+
diff --git a/src/shuffle.h b/src/shuffle.h
index c715d08e..4c1e208e 100644
--- a/src/shuffle.h
+++ b/src/shuffle.h
@@ -5,5 +5,7 @@
 #include <stdbool.h>
 
 bool shuffle_file(char *filename);
+bool shuffle_file_chunked(char *filename, size_t parts);
+bool shuffle_file_chunked_size(char *filename, size_t chunk_size);
 
 #endif
\ No newline at end of file