From c67678087fa47c661a8a9ae6fbdf9b807a5c4247 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 18 Mar 2017 06:05:28 -0400 Subject: [PATCH] [parser] using a bipartite graph (indptr + indices) to represent postal code<=>admin relationships instead of a set of 64-bit ints. Requires |V(postal codes)| + |E| 32 bit ints instead of |E| 64 bit ints. Saves several hundred MB in file size and even more space in memory because of the hashtable overhead --- src/Makefile.am | 8 +++-- src/address_parser.c | 71 +++++++++----------------------------- src/address_parser.h | 3 +- src/address_parser_train.c | 35 +++++++------------ src/graph.c | 52 +++++++++++++++++++++++++++- src/graph.h | 4 +++ src/graph_builder.c | 7 ++-- 7 files changed, 94 insertions(+), 86 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 35c67f43..5c599f6d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -28,15 +28,17 @@ libscanner_la_SOURCES = scanner.c libscanner_la_CFLAGS = $(CFLAGS_O0) $(CFLAGS_SCANNER_EXTRA) noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_geodb build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test + libpostal_SOURCES = main.c json_encode.c libpostal_LDADD = libpostal.la libpostal_CFLAGS = $(CFLAGS_O3) bench_SOURCES = bench.c bench_LDADD = libpostal.la libscanner.la $(BLAS_LIBS) bench_CFLAGS = $(CFLAGS_O3) -address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c numex.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c 
+address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c numex.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c address_parser_LDADD = libscanner.la $(BLAS_LIBS) address_parser_CFLAGS = $(CFLAGS_O3) + build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c build_address_dictionary_CFLAGS = $(CFLAGS_O3) build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c graph.c graph_builder.c normalize.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c @@ -46,11 +48,11 @@ build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_ut build_numex_table_CFLAGS = $(CFLAGS_O3) build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c build_trans_table_CFLAGS = $(CFLAGS_O3) -address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c +address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c 
crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c address_parser_train_LDADD = libscanner.la $(BLAS_LIBS) address_parser_train_CFLAGS = $(CFLAGS_O3) -address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c +address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c address_parser_test_LDADD = libscanner.la $(BLAS_LIBS) address_parser_test_CFLAGS = $(CFLAGS_O3) diff --git a/src/address_parser.c b/src/address_parser.c index 4be92131..bd825ff5 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -4,6 +4,9 @@ #include "ngrams.h" #include "scanner.h" +#include "graph_builder.h" + +#include "klib/ksort.h" #include "log/log.h" #define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat" @@ -136,18 +139,10 @@ bool address_parser_save(address_parser_t *self, char *output_dir) { return false; } - size_t num_postal_code_contexts = kh_size(self->postal_code_contexts); - if (!file_write_uint64(postal_codes_file, num_postal_code_contexts)) { + if (!graph_write(self->postal_code_contexts, postal_codes_file)) { return false; } - 
uint64_t postal_code_context; - kh_foreach_key(self->postal_code_contexts, postal_code_context, { - if (!file_write_uint64(postal_codes_file, postal_code_context)) { - return false; - } - }) - fclose(postal_codes_file); char_array_destroy(path); @@ -155,6 +150,11 @@ bool address_parser_save(address_parser_t *self, char *output_dir) { return true; } +static bool postal_code_context_exists(address_parser_t *self, uint32_t postal_code_id, uint32_t admin_id) { + graph_t *g = self->postal_code_contexts; + + return graph_has_edge(g, postal_code_id, admin_id); +} bool address_parser_load(char *dir) { if (parser != NULL) return false; @@ -174,7 +174,7 @@ bool address_parser_load(char *dir) { parser->model_type = ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON; parser->model.ap = ap_model; } else { - char_array_destroy(model_path); + char_array_destroy(path); log_error("Averaged perceptron model could not be loaded\n"); return false; } @@ -194,12 +194,12 @@ bool address_parser_load(char *dir) { parser->model_type = ADDRESS_PARSER_TYPE_CRF; parser->model.crf = crf_model; } else { - char_array_destroy(model_path); + char_array_destroy(path); log_error("Averaged perceptron model could not be loaded\n"); return false; } } else { - model_path == NULL; + model_path = NULL; } } @@ -279,41 +279,13 @@ bool address_parser_load(char *dir) { goto exit_address_parser_created; } - uint64_t num_postal_code_contexts; + parser->postal_code_contexts = graph_read(postal_codes_file); - if (!file_read_uint64(postal_codes_file, &num_postal_code_contexts)) { - goto exit_address_parser_created; - } - - log_debug("num_postal_code_contexts = %llu\n", num_postal_code_contexts); - - uint64_array *postal_code_context_values = uint64_array_new_size(num_postal_code_contexts); - if (!file_read_uint64_array(postal_codes_file, postal_code_context_values->a, num_postal_code_contexts)) { - uint64_array_destroy(postal_code_context_values); - goto exit_address_parser_created; - } - postal_code_context_values->n 
= num_postal_code_contexts; - - fclose(postal_codes_file); - - parser->postal_code_contexts = kh_init(int64_set); if (parser->postal_code_contexts == NULL) { goto exit_address_parser_created; } - if (kh_resize(int64_set, parser->postal_code_contexts, num_postal_code_contexts) < 0) { - goto exit_address_parser_created; - } - for (size_t i = 0; i < postal_code_context_values->n; i++) { - uint64_t context_value = postal_code_context_values->a[i]; - int ret = 0; - kh_put(int64_set, parser->postal_code_contexts, context_value, &ret); - if (ret < 0) { - goto exit_address_parser_created; - } - } - - uint64_array_destroy(postal_code_context_values); + fclose(postal_codes_file); char_array_destroy(path); return true; @@ -350,7 +322,7 @@ void address_parser_destroy(address_parser_t *self) { } if (self->postal_code_contexts != NULL) { - kh_destroy(int64_set, self->postal_code_contexts); + graph_destroy(self->postal_code_contexts); } free(self); @@ -1301,12 +1273,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (last_component_phrase_index != NULL_PHRASE_MEMBERSHIP) { phrase_t last_component_phrase = component_phrases->a[last_component_phrase_index]; admin_id = last_component_phrase.data; - postal_code_context_value_t postal_code_context_value = POSTAL_CODE_CONTEXT(postal_code_id, admin_id); - postal_code_context = postal_code_context_value.value; - k = kh_get(int64_set, parser->postal_code_contexts, postal_code_context); - - if (k != kh_end(parser->postal_code_contexts)) { + if (postal_code_context_exists(parser, postal_code_id, admin_id)) { postal_code_have_admin = true; } } @@ -1317,12 +1285,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (next_component_phrase_index != NULL_PHRASE_MEMBERSHIP) { phrase_t next_component_phrase = component_phrases->a[next_component_phrase_index]; admin_id = next_component_phrase.data; - postal_code_context_value_t postal_code_context_value = 
POSTAL_CODE_CONTEXT(postal_code_id, admin_id); - postal_code_context = postal_code_context_value.value; - - k = kh_get(int64_set, parser->postal_code_contexts, postal_code_context); - - if (k != kh_end(parser->postal_code_contexts)) { + if (postal_code_context_exists(parser, postal_code_id, admin_id)) { postal_code_have_admin = true; } } diff --git a/src/address_parser.h b/src/address_parser.h index ec9a9f72..f77986d0 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -53,6 +53,7 @@ with the general error-driven averaged perceptron. #include "averaged_perceptron_tagger.h" #include "collections.h" #include "crf.h" +#include "graph.h" #include "normalize.h" #include "string_utils.h" @@ -203,7 +204,7 @@ typedef struct address_parser { trie_t *phrases; address_parser_types_array *phrase_types; trie_t *postal_codes; - khash_t(int64_set) *postal_code_contexts; + graph_t *postal_code_contexts; } address_parser_t; // General usage diff --git a/src/address_parser_train.c b/src/address_parser_train.c index abae4f0e..0a254cb7 100644 --- a/src/address_parser_train.c +++ b/src/address_parser_train.c @@ -8,7 +8,8 @@ #include "collections.h" #include "constants.h" #include "file_utils.h" -#include "geodb.h" +#include "graph.h" +#include "graph_builder.h" #include "shuffle.h" #include "transliterate.h" @@ -303,13 +304,9 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr char_array_cat(phrase_builder, normalized); - last_was_separator = false; - prev_label = label; - if (data_set->separators->a[i] == ADDRESS_SEPARATOR_FIELD_INTERNAL) { - last_was_separator = true; - } + last_was_separator = data_set->separators->a[i] == ADDRESS_SEPARATOR_FIELD_INTERNAL; }) @@ -377,12 +374,6 @@ address_parser_t *address_parser_init(char *filename) { return NULL; } - khash_t(int64_set) *postal_code_contexts = kh_init(int64_set); - if (postal_code_contexts == NULL) { - log_error("Could not allocate postal_code_contexts\n"); - return NULL; - } - khiter_t k; 
char *str; @@ -859,11 +850,14 @@ address_parser_t *address_parser_init(char *filename) { log_info("Building postal code contexts\n"); - khash_t(str_set) *context_phrases; + bool fixed_rows = false; + graph_builder_t *postal_code_contexts_builder = graph_builder_new(GRAPH_BIPARTITE, fixed_rows); uint32_t postal_code_id; uint32_t context_phrase_id; + khash_t(str_set) *context_phrases; + kh_foreach(postal_code_admin_contexts, token, context_phrases, { if (!trie_get_data(parser->postal_codes, (char *)token, &postal_code_id)) { log_error("Key %s did not exist in parser->postal_codes\n", (char *)token); @@ -879,19 +873,14 @@ address_parser_t *address_parser_init(char *filename) { goto exit_hashes_allocated; } - postal_code_context_value_t postal_code_context = POSTAL_CODE_CONTEXT(postal_code_id, context_phrase_id); - - ret = 0; - k = kh_put(int64_set, postal_code_contexts, postal_code_context.value, &ret); - if (ret < 0) { - log_error("Error in kh_put for postal_code_contexts\n"); - address_parser_destroy(parser); - parser = NULL; - goto exit_hashes_allocated; - } + graph_builder_add_edge(postal_code_contexts_builder, postal_code_id, context_phrase_id); }) }) + bool sort_edges = true; + bool remove_duplicates = true; + graph_t *postal_code_contexts = graph_builder_finalize(postal_code_contexts_builder, sort_edges, remove_duplicates); + // NOTE: don't destroy this during deallocation if (postal_code_contexts == NULL) { log_error("postal_code_contexts is NULL\n"); diff --git a/src/graph.c b/src/graph.c index 7403e17a..7981b40d 100644 --- a/src/graph.c +++ b/src/graph.c @@ -1,4 +1,5 @@ #include "graph.h" +#include "klib/ksort.h" graph_t *graph_new_dims(graph_type_t type, uint32_t m, uint32_t n, size_t nnz, bool fixed_rows) { graph_t *graph = calloc(1, sizeof(graph_t)); @@ -65,7 +66,7 @@ inline void graph_clear(graph_t *self) { uint32_array_clear(self->indices); } -inline void graph_finalize_vertex(graph_t *self) { +inline void graph_finalize_vertex_no_sort(graph_t *self) { 
uint32_array_push(self->indptr, (uint32_t)self->indices->n); if (!self->fixed_rows) { self->m++; @@ -73,6 +74,55 @@ inline void graph_finalize_vertex(graph_t *self) { } } +void graph_finalize_vertex(graph_t *self) { + size_t start = 0; + if (self->indptr->n > 0) { + start = self->indptr->a[self->indptr->n - 1]; + } + + size_t end = self->indices->n; + size_t len = end - start; + + if (len > 1) { + ks_introsort(uint32_t, len, self->indices->a + start); + } + graph_finalize_vertex_no_sort(self); +} + +/* Returns true iff row i has an edge to column j. Uses binary search, so each row's indices must be sorted ascending. A NULL graph or an out-of-range i or j yields false. */ +bool graph_has_edge(graph_t *self, uint32_t i, uint32_t j) { + if (self == NULL || self->indptr->n < 2 || i >= self->m || j >= self->n || i >= self->indptr->n - 1) return false; + + uint32_t *indptr = self->indptr->a; + uint32_t *indices = self->indices->a; + + uint32_t row_start = indptr[i]; + uint32_t row_end = indptr[i + 1]; + uint32_t len = row_end - row_start; + if (len == 0) return false; + + // Simple binary search, array is sorted + ssize_t lo = (ssize_t)row_start; + ssize_t hi = (ssize_t)row_end - 1; + + bool found = false; + + while (lo <= hi) { + ssize_t mid = lo + (hi - lo) / 2; + uint32_t val = indices[mid]; + if (val < j) { + lo = mid + 1; + } else if (val > j) { + hi = mid - 1; + } else { + found = true; + break; + } + } + + return found; +} + inline void graph_append_edge(graph_t *self, uint32_t col) { uint32_array_push(self->indices, col); if (col >= self->n) self->n = col + 1; diff --git a/src/graph.h b/src/graph.h index 934aaa00..9b027535 100644 --- a/src/graph.h +++ b/src/graph.h @@ -27,6 +27,7 @@ Currently we're not implementing edge types, graph traversal, etc.
#include "collections.h" #include "file_utils.h" #include "vector.h" +#include "vector_math.h" typedef enum { GRAPH_DIRECTED, @@ -55,8 +56,11 @@ void graph_clear(graph_t *self); void graph_append_edge(graph_t *self, uint32_t col); void graph_append_edges(graph_t *self, uint32_t *col, size_t n); +void graph_finalize_vertex_no_sort(graph_t *self); void graph_finalize_vertex(graph_t *self); +bool graph_has_edge(graph_t *self, uint32_t i, uint32_t j); + bool graph_write(graph_t *self, FILE *f); bool graph_save(graph_t *self, char *path); graph_t *graph_read(FILE *f); diff --git a/src/graph_builder.c b/src/graph_builder.c index 35c48f5b..8975c1f7 100644 --- a/src/graph_builder.c +++ b/src/graph_builder.c @@ -47,10 +47,9 @@ static graph_t *graph_builder_build_edges(graph_builder_t *self, bool remove_dup graph_edge_t edge = self->edges->a[i]; if (edge.v1 > last_vertex) { for (uint32_t row = last_vertex; row < edge.v1; row++) { - graph_finalize_vertex(graph); - + // Sorting is done prior to this + graph_finalize_vertex_no_sort(graph); } - } if (!remove_duplicates || i == 0 || edge.v1 != last_vertex || edge.v2 != last_edge) { @@ -60,7 +59,7 @@ static graph_t *graph_builder_build_edges(graph_builder_t *self, bool remove_dup last_edge = edge.v2; } - graph_finalize_vertex(graph); + graph_finalize_vertex_no_sort(graph); return graph; }