[parser] Add a chunked shuffle as a C function: it writes each line to one of n files chosen at random, runs shuf on each file, and concatenates the results. Also add a variant that accepts a chunk size, and use a 2GB limit for address parser training. Allow gshuf again on Mac, since the only problem there seems to have been running out of memory when testing on a Mac laptop; the new limited-memory version should be fast enough.
This commit is contained in:
@@ -57,8 +57,10 @@ AC_CONFIG_FILES([Makefile
|
||||
test/Makefile])
|
||||
|
||||
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
|
||||
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
|
||||
|
||||
AS_IF([test "x$FOUND_SHUF" = xyes], [AC_DEFINE([HAVE_SHUF], [1], [shuf available])])
|
||||
AS_IF([test "x$FOUND_GSHUF" = xyes], [AC_DEFINE([HAVE_GSHUF], [1], [gshuf available])])
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Checks for SSE2 build
|
||||
|
||||
@@ -20,6 +20,9 @@ KHASH_MAP_INIT_STR(phrase_stats, phrase_stats_t)
|
||||
KHASH_MAP_INIT_STR(postal_code_context_phrases, khash_t(str_set) *)
|
||||
KHASH_MAP_INIT_STR(phrase_types, address_parser_types_t)
|
||||
|
||||
// Byte sizes used to bound the memory needed when shuffling the training file.
// The arithmetic is done in size_t: in plain int, 2 * 1024 * 1024 * 1024
// overflows (undefined behavior) and the resulting chunk size is garbage.
// Every expansion is fully parenthesized so the macros compose safely inside
// larger expressions.
#define CHUNK_SIZE_MB ((size_t)1024 * 1024)
#define CHUNK_SIZE_GB ((size_t)1024 * CHUNK_SIZE_MB)
// Rough upper bound, in bytes, on the data shuffled in memory at one time
#define DEFAULT_SHUFFLE_CHUNK_SIZE ((size_t)2 * CHUNK_SIZE_GB)
|
||||
|
||||
// Training
|
||||
|
||||
@@ -1011,10 +1014,10 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
|
||||
|
||||
trainer->iterations = iter;
|
||||
|
||||
#if defined(HAVE_SHUF)
|
||||
#if defined(HAVE_SHUF) || defined(HAVE_GSHUF)
|
||||
log_info("Shuffling\n");
|
||||
|
||||
if (!shuffle_file(filename)) {
|
||||
if (!shuffle_file_chunked_size(filename, DEFAULT_SHUFFLE_CHUNK_SIZE)) {
|
||||
log_error("Error in shuffle\n");
|
||||
averaged_perceptron_trainer_destroy(trainer);
|
||||
return false;
|
||||
@@ -1033,7 +1036,6 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
|
||||
log_debug("Done with training, averaging weights\n");
|
||||
|
||||
self->model = averaged_perceptron_trainer_finalize(trainer);
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -3,11 +3,14 @@
|
||||
#include <config.h>
|
||||
#include "string_utils.h"
|
||||
|
||||
// Run shuf/gshuf on a file in-place if the shuf command is available.
|
||||
bool shuffle_file(char *filename) {
|
||||
char *shuffle_command = NULL;
|
||||
|
||||
#if defined(HAVE_SHUF)
|
||||
shuffle_command = "shuf";
|
||||
#elif defined(HAVE_GSHUF)
|
||||
shuffle_command = "gshuf";
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
@@ -22,3 +25,97 @@ bool shuffle_file(char *filename) {
|
||||
|
||||
return ret == EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
// Assign each line of the file randomly to n chunks and shuffle each file sequentially in-memory
|
||||
// This approach will produce a random permutation of the lines using limited memory
|
||||
bool shuffle_file_chunked(char *filename, size_t parts) {
|
||||
char *shuffle_command = NULL;
|
||||
|
||||
// Linux
|
||||
#if defined(HAVE_SHUF)
|
||||
shuffle_command = "shuf";
|
||||
// Mac
|
||||
#elif defined(HAVE_GSHUF)
|
||||
shuffle_command = "gshuf";
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
|
||||
if (filename == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Make sure the input file exists
|
||||
FILE *f = fopen(filename, "r");
|
||||
if (f == NULL) {
|
||||
return false;
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
// This is an in-place shuffle to keep the API simple
|
||||
char *outfile = filename;
|
||||
|
||||
char_array *command = char_array_new();
|
||||
|
||||
// Split the file randomly into $parts files
|
||||
// Need to be assigned randomly, not just every nth line or it's not really a random permutation
|
||||
char_array_cat_printf(command, "awk -v parts=%zu -v filename=%s 'BEGIN{srand();} { print > filename\".\"int(rand() * parts) }'", parts, filename);
|
||||
|
||||
int ret = system(char_array_get_string(command));
|
||||
if (ret != EXIT_SUCCESS) {
|
||||
goto exit_char_array_allocated;
|
||||
}
|
||||
|
||||
// Run shuf sequentially on each of the $parts files
|
||||
// This should be sequential, not parallelized as the goal is
|
||||
// to limit memory usage when shuffling large files
|
||||
for (size_t i = 0; i < parts; i++) {
|
||||
char_array_clear(command);
|
||||
char_array_cat_printf(command, "%s %s.%zu %s %s.tmp", shuffle_command, filename, i, i > 0 ? ">>" : ">", outfile);
|
||||
ret = system(char_array_get_string(command));
|
||||
if (ret != EXIT_SUCCESS) {
|
||||
goto exit_char_array_allocated;
|
||||
}
|
||||
|
||||
// Delete the file temp file
|
||||
char_array_clear(command);
|
||||
char_array_cat_printf(command, "rm %s.%zu", filename, i);
|
||||
ret = system(char_array_get_string(command));
|
||||
if (ret != EXIT_SUCCESS) {
|
||||
goto exit_char_array_allocated;
|
||||
}
|
||||
}
|
||||
|
||||
char_array_clear(command);
|
||||
char_array_cat_printf(command, "mv %s.tmp %s", outfile, outfile);
|
||||
ret = system(char_array_get_string(command));
|
||||
if (ret != EXIT_SUCCESS) {
|
||||
goto exit_char_array_allocated;
|
||||
}
|
||||
|
||||
exit_char_array_allocated:
|
||||
char_array_destroy(command);
|
||||
return ret == EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
// Shuffle a file in-place, specifying a rough upper bound on system memory
//
// filename:   file to shuffle
// chunk_size: approximate maximum number of bytes to shuffle in memory at
//             once, must be > 0
//
// The file is split into (size / chunk_size + 1) parts. Returns false if the
// arguments are invalid, if the file cannot be opened or measured, or if the
// underlying shuffle fails.
bool shuffle_file_chunked_size(char *filename, size_t chunk_size) {
    // A zero chunk size would divide by zero below
    if (filename == NULL || chunk_size == 0) {
        return false;
    }

    FILE *f = fopen(filename, "r");
    if (f == NULL) {
        return false;
    }

    // Measure the file by seeking to its end. ftell reports errors as -1,
    // which must not be converted to a (huge) size_t.
    long end = -1L;
    if (fseek(f, 0L, SEEK_END) == 0) {
        end = ftell(f);
    }
    fclose(f);

    if (end < 0) {
        return false;
    }

    size_t parts = (size_t)end / chunk_size + 1;

    // If the file is smaller than the chunk size, do a
    // simple in-memory shuffle of the whole file
    if (parts == 1) {
        return shuffle_file(filename);
    }

    return shuffle_file_chunked(filename, parts);
}
|
||||
|
||||
|
||||
|
||||
@@ -5,5 +5,7 @@
|
||||
#include <stdbool.h>
|
||||
|
||||
// Shuffle the lines of filename in place by running shuf (or gshuf) on the
// whole file. Returns false when neither command was found at configure time;
// otherwise returns true only if the shuffle succeeded.
bool shuffle_file(char *filename);
|
||||
// Shuffle filename in place using limited memory: each line is assigned at
// random to one of `parts` chunk files, each chunk is shuffled in turn with
// shuf/gshuf, and the shuffled chunks are concatenated back over the input.
// Returns true on success, false otherwise.
bool shuffle_file_chunked(char *filename, size_t parts);
|
||||
// Shuffle filename in place, with chunk_size (in bytes) as a rough upper bound
// on memory use. The number of chunks is derived from the file size; a file
// smaller than chunk_size is shuffled in one pass with shuffle_file.
// Returns true on success, false otherwise.
bool shuffle_file_chunked_size(char *filename, size_t chunk_size);
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user