[parser] using a bipartite graph (indptr + indices) to represent postal code <=> admin relationships instead of a set of 64-bit ints. Requires |V(postal codes)| + |E| 32-bit ints instead of |E| 64-bit ints. Saves several hundred MB in file size and even more space in memory, because the hashtable's overhead is avoided.

2017-03-18 06:05:28 -04:00
parent cb112f0ea7
commit c67678087f
7 changed files with 94 additions and 86 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -28,15 +28,17 @@ libscanner_la_SOURCES = scanner.c
 libscanner_la_CFLAGS = $(CFLAGS_O0) $(CFLAGS_SCANNER_EXTRA)

 noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_geodb build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test
+
 libpostal_SOURCES = main.c json_encode.c
 libpostal_LDADD = libpostal.la
 libpostal_CFLAGS = $(CFLAGS_O3)
 bench_SOURCES = bench.c
 bench_LDADD = libpostal.la libscanner.la $(BLAS_LIBS)
 bench_CFLAGS = $(CFLAGS_O3)
-address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c numex.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c
+address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c numex.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c
 address_parser_LDADD = libscanner.la $(BLAS_LIBS)
 address_parser_CFLAGS = $(CFLAGS_O3)
+
 build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
 build_address_dictionary_CFLAGS = $(CFLAGS_O3)
 build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c graph.c graph_builder.c normalize.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c
@@ -46,11 +48,11 @@ build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_ut
 build_numex_table_CFLAGS = $(CFLAGS_O3)
 build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
 build_trans_table_CFLAGS = $(CFLAGS_O3)
-address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c
+address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c
 address_parser_train_LDADD = libscanner.la $(BLAS_LIBS)
 address_parser_train_CFLAGS = $(CFLAGS_O3)

-address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c
+address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c
 address_parser_test_LDADD = libscanner.la  $(BLAS_LIBS)
 address_parser_test_CFLAGS = $(CFLAGS_O3)

--- a/src/address_parser.c
+++ b/src/address_parser.c
@@ -4,6 +4,9 @@
 #include "ngrams.h"
 #include "scanner.h"

+#include "graph_builder.h"
+
+#include "klib/ksort.h"
 #include "log/log.h"

 #define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat"
@@ -136,18 +139,10 @@ bool address_parser_save(address_parser_t *self, char *output_dir) {
        return false;
    }

-    size_t num_postal_code_contexts = kh_size(self->postal_code_contexts);
-    if (!file_write_uint64(postal_codes_file, num_postal_code_contexts)) {
+    if (!graph_write(self->postal_code_contexts, postal_codes_file)) {
        return false;
    }

-    uint64_t postal_code_context;
-    kh_foreach_key(self->postal_code_contexts, postal_code_context, {
-        if (!file_write_uint64(postal_codes_file, postal_code_context)) {
-            return false;
-        }
-    })
-
    fclose(postal_codes_file);

    char_array_destroy(path);
@@ -155,6 +150,11 @@ bool address_parser_save(address_parser_t *self, char *output_dir) {
    return true;
 }

+// Reports whether the parser's postal code context graph contains the edge
+// postal_code_id -> admin_id, as answered by graph_has_edge.
+static bool postal_code_context_exists(address_parser_t *self, uint32_t postal_code_id, uint32_t admin_id) {
+    return graph_has_edge(self->postal_code_contexts, postal_code_id, admin_id);
+}

 bool address_parser_load(char *dir) {
    if (parser != NULL) return false;
@@ -174,7 +174,7 @@ bool address_parser_load(char *dir) {
            parser->model_type = ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON;
            parser->model.ap = ap_model;
        } else {
-            char_array_destroy(model_path);
+            char_array_destroy(path);
            log_error("Averaged perceptron model could not be loaded\n");
            return false;
        }
@@ -194,12 +194,12 @@ bool address_parser_load(char *dir) {
                parser->model_type = ADDRESS_PARSER_TYPE_CRF;
                parser->model.crf = crf_model;
            } else {
-                char_array_destroy(model_path);
+                char_array_destroy(path);
                log_error("Averaged perceptron model could not be loaded\n");
                return false;
            }
        } else {
-            model_path == NULL;
+            model_path = NULL;
        }
    }

@@ -279,41 +279,13 @@ bool address_parser_load(char *dir) {
        goto exit_address_parser_created;
    }

-    uint64_t num_postal_code_contexts;
+    parser->postal_code_contexts = graph_read(postal_codes_file);

-    if (!file_read_uint64(postal_codes_file, &num_postal_code_contexts)) {
-        goto exit_address_parser_created;
-    }
-
-    log_debug("num_postal_code_contexts = %llu\n", num_postal_code_contexts);
-
-    uint64_array *postal_code_context_values = uint64_array_new_size(num_postal_code_contexts);
-    if (!file_read_uint64_array(postal_codes_file, postal_code_context_values->a, num_postal_code_contexts)) {
-        uint64_array_destroy(postal_code_context_values);
-        goto exit_address_parser_created;
-    }
-    postal_code_context_values->n = num_postal_code_contexts;
-
-    fclose(postal_codes_file);
-
-    parser->postal_code_contexts = kh_init(int64_set);
    if (parser->postal_code_contexts == NULL) {
        goto exit_address_parser_created;
    }
-    if (kh_resize(int64_set, parser->postal_code_contexts, num_postal_code_contexts) < 0) {
-        goto exit_address_parser_created;
-    }

-    for (size_t i = 0; i < postal_code_context_values->n; i++) {
-        uint64_t context_value = postal_code_context_values->a[i];
-        int ret = 0;
-        kh_put(int64_set, parser->postal_code_contexts, context_value, &ret);
-        if (ret < 0) {
-            goto exit_address_parser_created;
-        }
-    }
-
-    uint64_array_destroy(postal_code_context_values);
+    fclose(postal_codes_file);

    char_array_destroy(path);
    return true;
@@ -350,7 +322,7 @@ void address_parser_destroy(address_parser_t *self) {
    }

    if (self->postal_code_contexts != NULL) {
-        kh_destroy(int64_set, self->postal_code_contexts);
+        graph_destroy(self->postal_code_contexts);
    }

    free(self);
@@ -1301,12 +1273,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
            if (last_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
                phrase_t last_component_phrase = component_phrases->a[last_component_phrase_index];
                admin_id = last_component_phrase.data;
-                postal_code_context_value_t postal_code_context_value = POSTAL_CODE_CONTEXT(postal_code_id, admin_id);
-                postal_code_context = postal_code_context_value.value;

-                k = kh_get(int64_set, parser->postal_code_contexts, postal_code_context);
-
-                if (k != kh_end(parser->postal_code_contexts)) {
+                if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
                    postal_code_have_admin = true;
                }
            }
@@ -1317,12 +1285,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
            if (next_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
                phrase_t next_component_phrase = component_phrases->a[next_component_phrase_index];
                admin_id = next_component_phrase.data;
-                postal_code_context_value_t postal_code_context_value = POSTAL_CODE_CONTEXT(postal_code_id, admin_id);
-                postal_code_context = postal_code_context_value.value;
-
-                k = kh_get(int64_set, parser->postal_code_contexts, postal_code_context);
-
-                if (k != kh_end(parser->postal_code_contexts)) {
+                if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
                    postal_code_have_admin = true;
                }
            }
--- a/src/address_parser.h
+++ b/src/address_parser.h
@@ -53,6 +53,7 @@ with the general error-driven averaged perceptron.
 #include "averaged_perceptron_tagger.h"
 #include "collections.h"
 #include "crf.h"
+#include "graph.h"
 #include "normalize.h"
 #include "string_utils.h"

@@ -203,7 +204,7 @@ typedef struct address_parser {
    trie_t *phrases;
    address_parser_types_array *phrase_types;
    trie_t *postal_codes;
-    khash_t(int64_set) *postal_code_contexts;
+    graph_t *postal_code_contexts;
 } address_parser_t;

 // General usage
--- a/src/address_parser_train.c
+++ b/src/address_parser_train.c
@@ -8,7 +8,8 @@
 #include "collections.h"
 #include "constants.h"
 #include "file_utils.h"
-#include "geodb.h"
+#include "graph.h"
+#include "graph_builder.h"
 #include "shuffle.h"
 #include "transliterate.h"

@@ -303,13 +304,9 @@ bool address_phrases_and_labels(address_parser_data_set_t *data_set, cstring_arr

        char_array_cat(phrase_builder, normalized);

-        last_was_separator = false;
-
        prev_label = label;

-        if (data_set->separators->a[i] == ADDRESS_SEPARATOR_FIELD_INTERNAL) {
-            last_was_separator = true;
-        }
+        last_was_separator = data_set->separators->a[i] == ADDRESS_SEPARATOR_FIELD_INTERNAL;

    })

@@ -377,12 +374,6 @@ address_parser_t *address_parser_init(char *filename) {
        return NULL;
    }

-    khash_t(int64_set) *postal_code_contexts = kh_init(int64_set);
-    if (postal_code_contexts == NULL) {
-        log_error("Could not allocate postal_code_contexts\n");
-        return NULL;
-    }
-
    khiter_t k;
    char *str;

@@ -859,11 +850,14 @@ address_parser_t *address_parser_init(char *filename) {

    log_info("Building postal code contexts\n");

-    khash_t(str_set) *context_phrases;
+    bool fixed_rows = false;
+    graph_builder_t *postal_code_contexts_builder = graph_builder_new(GRAPH_BIPARTITE, fixed_rows);

    uint32_t postal_code_id;
    uint32_t context_phrase_id;

+    khash_t(str_set) *context_phrases;
+
    kh_foreach(postal_code_admin_contexts, token, context_phrases, {
        if (!trie_get_data(parser->postal_codes, (char *)token, &postal_code_id)) {
            log_error("Key %s did not exist in parser->postal_codes\n", (char *)token);
@@ -879,19 +873,14 @@ address_parser_t *address_parser_init(char *filename) {
                goto exit_hashes_allocated;
            }

-            postal_code_context_value_t postal_code_context = POSTAL_CODE_CONTEXT(postal_code_id, context_phrase_id);
-
-            ret = 0;
-            k = kh_put(int64_set, postal_code_contexts, postal_code_context.value, &ret);
-            if (ret < 0) {
-                log_error("Error in kh_put for postal_code_contexts\n");
-                address_parser_destroy(parser);
-                parser = NULL;
-                goto exit_hashes_allocated;
-            }
+            graph_builder_add_edge(postal_code_contexts_builder, postal_code_id, context_phrase_id);
        })
    })

+    bool sort_edges = true;
+    bool remove_duplicates = true;
+    graph_t *postal_code_contexts = graph_builder_finalize(postal_code_contexts_builder, sort_edges, remove_duplicates);
+
    // NOTE: don't destroy this during deallocation
    if (postal_code_contexts == NULL) {
        log_error("postal_code_contexts is NULL\n");
--- a/src/graph.c
+++ b/src/graph.c
@@ -1,4 +1,5 @@
 #include "graph.h"
+#include "klib/ksort.h"

 graph_t *graph_new_dims(graph_type_t type, uint32_t m, uint32_t n, size_t nnz, bool fixed_rows) {
    graph_t *graph = calloc(1, sizeof(graph_t));
@@ -65,7 +66,7 @@ inline void graph_clear(graph_t *self) {
    uint32_array_clear(self->indices);
 }

-inline void graph_finalize_vertex(graph_t *self) {
+inline void graph_finalize_vertex_no_sort(graph_t *self) {
    uint32_array_push(self->indptr, (uint32_t)self->indices->n);
    if (!self->fixed_rows) {
        self->m++;
@@ -73,6 +74,55 @@ inline void graph_finalize_vertex(graph_t *self) {
    }
 }

+// Sorts the column indices appended since the last recorded row boundary,
+// then records the new boundary via graph_finalize_vertex_no_sort.
+void graph_finalize_vertex(graph_t *self) {
+    // The current row begins at the last indptr entry (0 when indptr is empty)
+    size_t row_start = (self->indptr->n > 0) ? self->indptr->a[self->indptr->n - 1] : 0;
+    size_t row_len = self->indices->n - row_start;
+
+    // Rows holding zero or one index need no sorting
+    if (row_len > 1) {
+        ks_introsort(uint32_t, row_len, self->indices->a + row_start);
+    }
+
+    graph_finalize_vertex_no_sort(self);
+}
+
+// Returns true if row i contains column j; false otherwise, including when i or j
+// is out of range. Each row's slice of indices must be sorted ascending.
+bool graph_has_edge(graph_t *self, uint32_t i, uint32_t j) {
+    // Ids equal to m or n are already out of range. The last test requires
+    // indptr[i + 1] to exist; comparing i + 1 (not n - 1) means an empty
+    // indptr cannot wrap around and let the lookup read out of bounds.
+    if (i >= self->m || j >= self->n || (size_t)i + 1 >= self->indptr->n) {
+        return false;
+    }
+
+    uint32_t *indptr = self->indptr->a;
+    uint32_t *indices = self->indices->a;
+
+    // Binary search over the half-open range [lo, hi). Unsigned, half-open
+    // bounds never compute mid - 1, so there is no wrap when mid == 0, and
+    // an empty row (lo == hi) falls straight through to "not found".
+    size_t lo = indptr[i];
+    size_t hi = indptr[i + 1];
+
+    while (lo < hi) {
+        size_t mid = lo + (hi - lo) / 2;
+        uint32_t val = indices[mid];
+        if (val < j) {
+            lo = mid + 1;
+        } else if (val > j) {
+            hi = mid;
+        } else {
+            return true;
+        }
+    }
+    return false;
+}
+
+
 inline void graph_append_edge(graph_t *self, uint32_t col) {
    uint32_array_push(self->indices, col);
    if (col >= self->n) self->n = col + 1;
--- a/src/graph.h
+++ b/src/graph.h
@@ -27,6 +27,7 @@ Currently we're not implementing edge types, graph traversal, etc.
 #include "collections.h"
 #include "file_utils.h"
 #include "vector.h"
+#include "vector_math.h"

 typedef enum {
    GRAPH_DIRECTED,
@@ -55,8 +56,11 @@ void graph_clear(graph_t *self);
 void graph_append_edge(graph_t *self, uint32_t col);
 void graph_append_edges(graph_t *self, uint32_t *col, size_t n);

+void graph_finalize_vertex_no_sort(graph_t *self);
 void graph_finalize_vertex(graph_t *self);

+bool graph_has_edge(graph_t *self, uint32_t i, uint32_t j);
+
 bool graph_write(graph_t *self, FILE *f);
 bool graph_save(graph_t *self, char *path);
 graph_t *graph_read(FILE *f);
--- a/src/graph_builder.c
+++ b/src/graph_builder.c
@@ -47,10 +47,9 @@ static graph_t *graph_builder_build_edges(graph_builder_t *self, bool remove_dup
        graph_edge_t edge = self->edges->a[i];
        if (edge.v1 > last_vertex) {
            for (uint32_t row = last_vertex; row < edge.v1; row++) {
-                graph_finalize_vertex(graph);
-
+                // Sorting is done prior to this
+                graph_finalize_vertex_no_sort(graph);
            }
-
        }

        if (!remove_duplicates || i == 0 || edge.v1 != last_vertex || edge.v2 != last_edge) {
@@ -60,7 +59,7 @@ static graph_t *graph_builder_build_edges(graph_builder_t *self, bool remove_dup
        last_edge = edge.v2;
    }

-    graph_finalize_vertex(graph);
+    graph_finalize_vertex_no_sort(graph);

    return graph;
 }