[parser] Adding chunked shuffle as a C function (writes each line to one of n random files, runs shuf on each file, and concatenates the results). Adding a version that accepts an explicit chunk size, and using a 2GB limit for address parser training. Allowing gshuf again on Mac, since the only problem there appears to have been insufficient memory when testing on a Mac laptop; the new limited-memory version should be fast enough.
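For illustration only, a minimal sketch of the chunked shuffle described above (not the implementation added in this commit): scatter each input line to one of n temporary files chosen at random, run shuf on each temp file, and concatenate the shuffled chunks back over the original file. The helper name, the temp-file naming, the fixed-size line buffer, and the use of system(3) are assumptions of the sketch, not details of the libpostal code.

// Sketch of a chunked shuffle: scatter lines across n random temp files,
// shuffle each chunk with shuf, concatenate the results over the input file.
// Error handling is simplified and lines are assumed to fit in BUFSIZ bytes.
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static bool shuffle_file_chunked_sketch(const char *filename, size_t n) {
    FILE *in = fopen(filename, "r");
    FILE **chunks = calloc(n, sizeof(FILE *));
    if (in == NULL || chunks == NULL) return false;

    srand((unsigned)time(NULL));

    // Open n temporary chunk files (hypothetical naming scheme)
    char name[64];
    for (size_t i = 0; i < n; i++) {
        snprintf(name, sizeof(name), "shuffle_chunk_%zu.tmp", i);
        chunks[i] = fopen(name, "w");
        if (chunks[i] == NULL) return false;
    }

    // Scatter: each line goes to a uniformly random chunk file
    char line[BUFSIZ];
    while (fgets(line, sizeof(line), in) != NULL) {
        fputs(line, chunks[rand() % n]);
    }
    fclose(in);
    for (size_t i = 0; i < n; i++) fclose(chunks[i]);
    free(chunks);

    // Shuffle each chunk with shuf and append it to the original file,
    // truncating on the first chunk (gshuf could be substituted on a Mac)
    char cmd[256];
    for (size_t i = 0; i < n; i++) {
        int ret = snprintf(cmd, sizeof(cmd),
                           "shuf shuffle_chunk_%zu.tmp %s \"%s\" && rm shuffle_chunk_%zu.tmp",
                           i, (i == 0) ? ">" : ">>", filename, i);
        if (ret < 0 || (size_t)ret >= sizeof(cmd)) return false;
        if (system(cmd) != 0) return false;
    }
    return true;
}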

Al
2017-03-05 02:15:03 -05:00
parent ba4052c9ba
commit b76b7b8527
4 changed files with 106 additions and 3 deletions

@@ -20,6 +20,9 @@ KHASH_MAP_INIT_STR(phrase_stats, phrase_stats_t)
 KHASH_MAP_INIT_STR(postal_code_context_phrases, khash_t(str_set) *)
 KHASH_MAP_INIT_STR(phrase_types, address_parser_types_t)
+#define CHUNK_SIZE_MB 1024 * 1024
+#define CHUNK_SIZE_GB 1024 * (CHUNK_SIZE_MB)
+#define DEFAULT_SHUFFLE_CHUNK_SIZE 2 * (CHUNK_SIZE_GB)
 // Training
@@ -1011,10 +1014,10 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
 trainer->iterations = iter;
-#if defined(HAVE_SHUF)
+#if defined(HAVE_SHUF) || defined(HAVE_GSHUF)
 log_info("Shuffling\n");
-if (!shuffle_file(filename)) {
+if (!shuffle_file_chunked_size(filename, DEFAULT_SHUFFLE_CHUNK_SIZE)) {
 log_error("Error in shuffle\n");
 averaged_perceptron_trainer_destroy(trainer);
 return false;
@@ -1033,7 +1036,6 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
 log_debug("Done with training, averaging weights\n");
 self->model = averaged_perceptron_trainer_finalize(trainer);
 return true;
 }