diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index ba8a1c18..5e4fe054 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -351,13 +351,13 @@ def osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude): DROP_PROBABILITIES = { - AddressFormatter.HOUSE: 0.8, + AddressFormatter.HOUSE: 0.6, AddressFormatter.HOUSE_NUMBER: 0.5, AddressFormatter.ROAD: 0.5, - AddressFormatter.SUBURB: 1.0, - AddressFormatter.CITY_DISTRICT: 1.0, + AddressFormatter.SUBURB: 0.8, + AddressFormatter.CITY_DISTRICT: 0.8, AddressFormatter.CITY: 0.6, - AddressFormatter.STATE_DISTRICT: 1.0, + AddressFormatter.STATE_DISTRICT: 0.8, AddressFormatter.STATE: 0.8, AddressFormatter.POSTCODE: 0.7, AddressFormatter.COUNTRY: 0.8 diff --git a/src/address_parser.c b/src/address_parser.c new file mode 100644 index 00000000..a15e998a --- /dev/null +++ b/src/address_parser.c @@ -0,0 +1,748 @@ +#include "address_parser.h" +#include "address_dictionary.h" +#include "features.h" +#include "geodb.h" +#include "scanner.h" + +#include "log/log.h" + +#define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat" +#define ADDRESS_PARSER_VOCAB_FILENAME "address_parser_vocab.trie" + +#define UNKNOWN_WORD "UNKNOWN" + +static address_parser_t *parser = NULL; + + +address_parser_t *address_parser_new(void) { + address_parser_t *parser = malloc(sizeof(address_parser_t)); + return parser; +} + + +address_parser_t *get_address_parser(void) { + return parser; +} + + +bool address_parser_save(address_parser_t *self, char *output_dir) { + if (self == NULL || output_dir == NULL) return false; + + char_array *path = char_array_new_size(strlen(output_dir)); + + char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_MODEL_FILENAME); + char *model_path = char_array_get_string(path); + + if (!averaged_perceptron_save(self->model, model_path)) { + 
char_array_destroy(path); + return false; + } + + char_array_clear(path); + + char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_VOCAB_FILENAME); + char *vocab_path = char_array_get_string(path); + + if (!trie_save(self->vocab, vocab_path)) { + return false; + } + + char_array_destroy(path); + + return true; +} + + +bool address_parser_load(char *dir) { + if (parser != NULL) return false; + if (dir == NULL) { + dir = LIBPOSTAL_ADDRESS_PARSER_DIR; + } + + char_array *path = char_array_new_size(strlen(dir)); + + char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME); + char *model_path = char_array_get_string(path); + + averaged_perceptron_t *model = averaged_perceptron_load(model_path); + + if (model == NULL) { + char_array_destroy(path); + return false; + } + + parser = address_parser_new(); + parser->model = model; + + char_array_clear(path); + + char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_VOCAB_FILENAME); + + char *vocab_path = char_array_get_string(path); + + trie_t *vocab = trie_load(vocab_path); + + if (vocab == NULL) { + address_parser_destroy(parser); + char_array_destroy(path); + return false; + } + + parser->vocab = vocab; + + char_array_destroy(path); + return true; +} + +void address_parser_destroy(address_parser_t *self) { + if (self == NULL) return; + + if (self->model != NULL) { + averaged_perceptron_destroy(self->model); + } + + if (self->vocab != NULL) { + trie_destroy(self->vocab); + } + + free(self); +} + +static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) { + uint32_t count = 0; + bool has_key = trie_get_data(parser->vocab, word, &count); + return count; +} + +inline void address_parser_normalize_token(cstring_array *array, char *str, token_t token) { + normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS); +} + +inline char *address_parser_normalize_string(char *str) { + return 
normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS); +} + + +void address_parser_context_destroy(address_parser_context_t *self) { + if (self == NULL) return; + + if (self->phrase != NULL) { + char_array_destroy(self->phrase); + } + + if (self->separators != NULL) { + uint32_array_destroy(self->separators); + } + + if (self->normalized != NULL) { + cstring_array_destroy(self->normalized); + } + + if (self->features != NULL) { + cstring_array_destroy(self->features); + } + + if (self->tokenized_str != NULL) { + tokenized_string_destroy(self->tokenized_str); + } + + if (self->address_dictionary_phrases != NULL) { + phrase_array_destroy(self->address_dictionary_phrases); + } + + if (self->address_phrase_memberships != NULL) { + int64_array_destroy(self->address_phrase_memberships); + } + + if (self->geodb_phrases != NULL) { + phrase_array_destroy(self->geodb_phrases); + } + + if (self->geodb_phrase_memberships != NULL) { + int64_array_destroy(self->geodb_phrase_memberships); + } + + free(self); +} + +address_parser_context_t *address_parser_context_new(void) { + address_parser_context_t *context = malloc(sizeof(address_parser_context_t)); + + if (context == NULL) return NULL; + + context->language = NULL; + context->country = NULL; + + context->phrase = char_array_new(); + if (context->phrase == NULL) { + goto exit_address_parser_context_allocated; + } + + context->separators = uint32_array_new(); + if (context->separators == NULL) { + goto exit_address_parser_context_allocated; + } + + context->normalized = cstring_array_new(); + if (context->normalized == NULL) { + goto exit_address_parser_context_allocated; + } + + context->features = cstring_array_new(); + if (context->features == NULL) { + goto exit_address_parser_context_allocated; + } + + context->tokenized_str = tokenized_string_new(); + if (context->tokenized_str == NULL) { + goto exit_address_parser_context_allocated; + } + + context->address_dictionary_phrases = 
phrase_array_new(); + if (context->address_dictionary_phrases == NULL) { + goto exit_address_parser_context_allocated; + } + + context->address_phrase_memberships = int64_array_new(); + if (context->address_phrase_memberships == NULL) { + goto exit_address_parser_context_allocated; + } + + context->geodb_phrases = phrase_array_new(); + if (context->geodb_phrases == NULL) { + goto exit_address_parser_context_allocated; + } + + context->geodb_phrase_memberships = int64_array_new(); + if (context->geodb_phrase_memberships == NULL) { + goto exit_address_parser_context_allocated; + } + + return context; + +exit_address_parser_context_allocated: + address_parser_context_destroy(context); + return NULL; +} + +void address_parser_context_fill(address_parser_context_t *context, tokenized_string_t *tokenized_str, char *language, char *country) { + int64_t i, j; + + uint32_t token_index; + char *word; + phrase_t phrase; + + context->language = language; + context->country = country; + + cstring_array *normalized = context->normalized; + cstring_array_clear(normalized); + + char *str = tokenized_str->str; + token_array *tokens = tokenized_str->tokens; + + cstring_array_foreach(tokenized_str->strings, token_index, word, { + token_t token = tokens->a[token_index]; + address_parser_normalize_token(normalized, str, token); + }) + + phrase_array_clear(context->address_dictionary_phrases); + int64_array_clear(context->address_phrase_memberships); + + i = 0; + phrase_array *address_dictionary_phrases = context->address_dictionary_phrases; + int64_array *address_phrase_memberships = context->address_phrase_memberships; + + if (search_address_dictionaries_tokens_with_phrases(str, tokens, context->language, &context->address_dictionary_phrases)) { + for (j = 0; j < address_dictionary_phrases->n; j++) { + phrase = address_dictionary_phrases->a[j]; + + for (; i < phrase.start; i++) { + int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP); + log_debug("token i=%lld, null 
phrase membership\n", i); + } + + for (i = phrase.start; i < phrase.start + phrase.len; i++) { + log_debug("token i=%lld, phrase membership=%lld\n", i, j); + int64_array_push(address_phrase_memberships, j); + } + } + } + + for (; i < tokens->n; i++) { + log_debug("token i=%lld, null phrase membership\n", i); + int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP); + } + + phrase_array_clear(context->geodb_phrases); + int64_array_clear(context->geodb_phrase_memberships); + + phrase_array *geodb_phrases = context->geodb_phrases; + int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships; + i = 0; + + if (search_geodb_tokens_with_phrases(str, tokens, &context->geodb_phrases)) { + for (j = 0; j < geodb_phrases->n; j++) { + phrase = geodb_phrases->a[j]; + + for (; i < phrase.start; i++) { + log_debug("token i=%lld, null geo phrase membership\n", i); + int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP); + } + + for (i = phrase.start; i < phrase.start + phrase.len; i++) { + log_debug("token i=%lld, geo phrase membership=%lld\n", i, j); + int64_array_push(geodb_phrase_memberships, j); + } + } + } + for (; i < tokens->n; i++) { + log_debug("token i=%lld, null geo phrase membership\n", i); + int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP); + } + + +} + + +static inline char *get_phrase_string(tokenized_string_t *str, char_array *phrase_tokens, phrase_t phrase) { + size_t phrase_len = 0; + char_array_clear(phrase_tokens); + + size_t phrase_end = phrase.start + phrase.len; + + for (int k = phrase.start; k < phrase_end; k++) { + char *w = tokenized_string_get_token(str, k); + char_array_append(phrase_tokens, w); + if (k < phrase_end - 1) { + char_array_append(phrase_tokens, " "); + } + } + char_array_terminate(phrase_tokens); + + return char_array_get_string(phrase_tokens); +} + + +/* + +typedef struct adjacent_phrase { + phrase_t phrase; + uint32_t num_separators; +} adjacent_phrase_t; + +#define 
NULL_ADJACENT_PHRASE (adjacent_phrase_t){NULL_PHRASE, 0}; + +static inline adjacent_phrase_t get_adjacent_phrase(int64_array *phrase_memberships, phrase_array *phrases, uint32_array *separator_positions, uint32_t i, int32_t direction) { + uint32_t *separators = separator_positions->a; + int64_t *memberships = phrase_memberships->a; + + uint32_t num_strings = (uint32_t)phrase_memberships->n; + + adjacent_phrase_t adjacent = NULL_ADJACENT_PHRASE; + + if (direction == -1) { + for (uint32_t idx = i; idx >= 0; idx--) { + uint32_t separator = separators[idx]; + if (separator > ADDRESS_SEPARATOR_NONE) { + adjacent.num_separators++; + } + + int64_t membership = memberships[ids]; + if (membership != NULL_PHRASE_MEMBERSHIP) { + adjacent.phrase = phrases->a[membership]; + break; + } + + } + } else if (direction == 1) { + for (uint32_t idx = i; idx < num_strings; idx++) { + uint32_t separator = separators[idx]; + if (separator > ADDRESS_SEPARATOR_NONE) { + adjacent.num_separators++; + } + + int64_t membership = memberships[ids]; + if (membership != NULL_PHRASE_MEMBERSHIP) { + adjacent.phrase = phrases->a[membership]; + break; + } + } + } + + return adjacent; +} +*/ + +static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string, char *prev2, char *prev) { + if (phrase_types == component) { + log_debug("phrase=%s, phrase_types=%d\n", phrase_string, phrase_types); + feature_array_add(features, 2, "unambiguous phrase type", phrase_type); + feature_array_add(features, 3, "unambiguous phrase type+phrase", phrase_type, phrase_string); + } else if (phrase_types & component) { + feature_array_add(features, 3, "phrase type+phrase", phrase_type, phrase_string); + } +} + + +/* +address_parser_features +----------------------- + +This is a feature function similar to those found in MEMM and CRF models. 
/*
address_parser_features
-----------------------

This is a feature function similar to those found in MEMM and CRF models.

Follows the signature of an ap_feature_function so it can be called
as a function pointer by the averaged perceptron model.

Parameters:

address_parser_t *self: a pointer to the address_parser struct, which contains
word frequencies and perhaps other useful corpus-wide statistics.

address_parser_context_t *context: The context struct containing:
- phrase dictionary memberships for all the tokens
- country (if known)
- language (if known)
- features array

tokenized_string_t *tokenized: the sequence of tokens for parsing
uint32_t i: the current token index
char *prev: the predicted tag at index i - 1
char *prev2: the predicted tag at index i - 2

Returns true on success; false on NULL arguments or a missing
normalized word. The extracted features are written into
context->features (cleared on entry).
*/

bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t i, char *prev, char *prev2) {
    if (self == NULL || ctx == NULL) return false;

    address_parser_t *parser = (address_parser_t *)self;
    address_parser_context_t *context = (address_parser_context_t *)ctx;

    cstring_array *features = context->features;
    char *language = context->language;
    char *country = context->country;

    phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
    int64_array *address_phrase_memberships = context->address_phrase_memberships;
    phrase_array *geodb_phrases = context->geodb_phrases;
    int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships;
    cstring_array *normalized = context->normalized;

    // NOTE(review): separators, country and word_len (below) are bound
    // but never read in this function — confirm whether they are
    // placeholders for planned features or can be removed.
    uint32_array *separators = context->separators;

    // Features are rebuilt from scratch for every token
    cstring_array_clear(features);

    // Bias unit, acts as an intercept
    feature_array_add(features, 1, "bias");

    char *original_word = tokenized_string_get_token(tokenized, i);

    token_t token = tokenized->tokens->a[i];

    // Neighbor indices; widened below when the current token sits
    // inside a multi-token phrase, so "previous"/"next" refer to the
    // tokens around the whole phrase rather than around token i.
    ssize_t last_index = (ssize_t)i - 1;
    ssize_t next_index = (ssize_t)i + 1;

    char *word = cstring_array_get_string(normalized, i);
    if (word == NULL) {
        log_error("got NULL word at %d\n", i);
        return false;
    }

    size_t word_len = strlen(word);
    // Keep the single-token form: `word` may be replaced by a phrase
    // string below, but tag-conditioned features use the current token.
    char *current_word = word;

    log_debug("word=%s\n", word);

    expansion_value_t expansion;

    phrase_t phrase = NULL_PHRASE;

    char *phrase_string = NULL;
    char *geo_phrase_string = NULL;

    // Which address-dictionary phrase (if any) token i belongs to
    int64_t address_phrase_index = address_phrase_memberships->a[i];

    // Reusable scratch buffer for building phrase/prefix/suffix strings
    char_array *phrase_tokens = context->phrase;

    // Address dictionary phrases
    if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) {
        phrase = address_dictionary_phrases->a[address_phrase_index];
        log_debug("phrase\n");

        // Treat the whole phrase as the current unit
        last_index = (ssize_t)phrase.start - 1;
        next_index = (ssize_t)phrase.start + phrase.len;

        expansion.value = phrase.data;
        uint32_t address_phrase_types = expansion.components;

        log_debug("expansion=%d\n", expansion.value);

        if (address_phrase_types & (ADDRESS_STREET | ADDRESS_NAME)) {
            phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase);

            if (phrase_string != NULL) {
                word = phrase_string;
            }

            log_debug("phrase_string=%s\n", phrase_string);

            add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string, prev2, prev);
            add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string, prev2, prev);

        }
    }

    // Prefixes like hinter, etc.
    phrase_t prefix_phrase = search_address_dictionaries_prefix(original_word, token.len, language);
    if (prefix_phrase.len > 0) {
        expansion.value = prefix_phrase.data;
        // Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category
        if (expansion.components ^ ADDRESS_ANY) {
            char_array_clear(phrase_tokens);
            char_array_add_len(phrase_tokens, original_word, prefix_phrase.len);
            char *prefix = char_array_get_string(phrase_tokens);
            log_debug("got prefix: %s\n", prefix);
            feature_array_add(features, 2, "prefix", prefix);
        }
    }

    // Suffixes like straße, etc.
    phrase_t suffix_phrase = search_address_dictionaries_suffix(original_word, token.len, language);
    if (suffix_phrase.len > 0) {
        expansion.value = suffix_phrase.data;
        if (expansion.components & ADDRESS_STREET) {
            char_array_clear(phrase_tokens);
            // Suffix phrase lengths are measured from the end of the word
            char_array_add_len(phrase_tokens, original_word + (token.len - suffix_phrase.len), suffix_phrase.len);
            char *suffix = char_array_get_string(phrase_tokens);
            log_debug("got suffix: %s\n", suffix);
            feature_array_add(features, 2, "suffix", suffix);
        }
    }

    // Which GeoDB phrase (if any) token i belongs to
    int64_t geodb_phrase_index = geodb_phrase_memberships->a[i];

    phrase = NULL_PHRASE;
    geodb_value_t geo;

    // GeoDB phrases
    if (geodb_phrase_index != NULL_PHRASE_MEMBERSHIP) {
        phrase = geodb_phrases->a[geodb_phrase_index];

        geo_phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase);
        geo.value = phrase.data;
        uint32_t geodb_phrase_types = geo.components;

        // Only widen the neighbor window if the geo phrase extends at
        // least as far as any address-dictionary phrase handled above
        if (last_index <= (ssize_t)phrase.start - 1 && next_index >= (ssize_t)phrase.start + phrase.len - 1) {
            last_index = (ssize_t)phrase.start - 1;
            next_index = (ssize_t)phrase.start + phrase.len;
            if (geo_phrase_string != NULL && geodb_phrase_types ^ ADDRESS_POSTAL_CODE) {
                word = geo_phrase_string;
            }

        }

        if (geodb_phrase_types ^ ADDRESS_ANY) {

            add_phrase_features(features, geodb_phrase_types, ADDRESS_LOCALITY, "city", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN1, "admin1", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN2, "admin2", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN3, "admin3", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN4, "admin4", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_ADMIN_OTHER, "admin other", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_NEIGHBORHOOD, "neighborhood", geo_phrase_string, prev2, prev);

            add_phrase_features(features, geodb_phrase_types, ADDRESS_COUNTRY, "country", geo_phrase_string, prev2, prev);
            add_phrase_features(features, geodb_phrase_types, ADDRESS_POSTAL_CODE, "postal code", geo_phrase_string, prev2, prev);

        }

    }

    uint32_t word_freq = word_vocab_frequency(parser, word);

    if (phrase_string == NULL && geo_phrase_string == NULL) {
        if (word_freq > 0) {
            // The individual word
            feature_array_add(features, 2, "word", word);
        } else {
            log_debug("word not in vocab: %s\n", original_word);
            // Out-of-vocabulary words share one UNKNOWN feature
            word = UNKNOWN_WORD;
        }
    }

    if (prev != NULL) {
        // Previous tag and current word
        feature_array_add(features, 3, "i-1 tag+word", prev, current_word);
        feature_array_add(features, 2, "i-1 tag", prev);

        if (prev2 != NULL) {
            // Previous two tags and current word
            feature_array_add(features, 4, "i-2 tag+i-1 tag+word", prev2, prev, current_word);
            feature_array_add(features, 3, "i-2 tag+i-1 tag", prev2, prev);
        }
    }

    if (last_index >= 0) {
        char *prev_word = cstring_array_get_string(normalized, last_index);

        uint32_t prev_word_freq = word_vocab_frequency(parser, prev_word);
        if (prev_word_freq == 0) {
            prev_word = UNKNOWN_WORD;
        }

        // Previous word
        feature_array_add(features, 2, "i-1 word", prev_word);
        // Previous tag + previous word
        // NOTE(review): this assumes prev is non-NULL whenever
        // last_index == i - 1 and i > 0 — confirm the tagger always
        // supplies a previous tag from index 1 onward.
        if (last_index == i - 1) {
            feature_array_add(features, 3, "i-1 tag+i-1 word", prev, prev_word);
        }
        // Previous word and current word
        feature_array_add(features, 3, "i-1 word+word", prev_word, word);
    }

    size_t num_tokens = tokenized->tokens->n;

    if (next_index < num_tokens) {
        char *next_word = cstring_array_get_string(normalized, next_index);

        uint32_t next_word_freq = word_vocab_frequency(parser, next_word);
        if (next_word_freq == 0) {
            next_word = UNKNOWN_WORD;
        }

        // Next word e.g. if the current word is unknown and the next word is "street"
        feature_array_add(features, 2, "i+1 word", next_word);
        // Current word and next word
        feature_array_add(features, 3, "word+i+1 word", word, next_word);
    }

    return true;

}
address_parser_context_fill(context, tokenized_str, language, country); + + cstring_array *token_labels = cstring_array_new_size(tokens->n); + + char *prev_label = NULL; + + address_parser_response_t *response = NULL; + + if (averaged_perceptron_tagger_predict(model, parser, context, context->features, token_labels, &address_parser_features, tokenized_str)) { + response = address_parser_response_new(); + + size_t num_strings = cstring_array_num_strings(tokenized_str->strings); + + cstring_array *labels = cstring_array_new_size(num_strings); + cstring_array *components = cstring_array_new_size(strlen(address) + num_strings); + + + for (int i = 0; i < num_strings; i++) { + char *str = tokenized_string_get_token(tokenized_str, i); + char *label = cstring_array_get_string(token_labels, i); + + if (prev_label == NULL || strcmp(label, prev_label) != 0) { + cstring_array_add_string(labels, label); + cstring_array_start_token(components); + + } + + if (prev_label != NULL && strcmp(label, prev_label) == 0) { + cstring_array_cat_string(components, " "); + cstring_array_cat_string(components, str); + } else { + cstring_array_append_string(components, str); + cstring_array_terminate(components); + } + + prev_label = label; + } + response->num_components = cstring_array_num_strings(components); + response->components = cstring_array_to_strings(components); + response->labels = cstring_array_to_strings(labels); + + } + + token_array_destroy(tokens); + tokenized_string_destroy(tokenized_str); + cstring_array_destroy(token_labels); + + return response; +} + + + +bool address_parser_module_setup(char *dir) { + if (parser == NULL) { + return address_parser_load(dir); + } + return true; +} + +void address_parser_module_teardown(void) { + if (parser != NULL) { + address_parser_destroy(parser); + } + parser = NULL; +} diff --git a/src/address_parser.h b/src/address_parser.h new file mode 100644 index 00000000..af97ecc3 --- /dev/null +++ b/src/address_parser.h @@ -0,0 +1,134 @@ +/* 
+address_parser.h +---------------- + +International address parser, designed to use OSM training data, +over 40M addresses formatted with the OpenCage address formatting +templates: https://github.com/OpenCageData/address-formatting. + +This is a sequence modeling problem similar to e.g. part-of-speech +tagging, named entity recognition, etc. in which we have a sequence +of inputs (words/tokens) and want to predict a sequence of outputs +(labeled part-of-address tags). This is a supervised learning model +and the training data is created in the Python geodata package +included with this repo. Example record: + +en us 123/house_number Fake/road Street/road Brooklyn/city NY/state 12345/postcode + +Where the fields are: {language, country, tagged address}. + +After training, the address parser can take as input a tokenized +input string e.g. "123 Fake Street Brooklyn NY 12345" and parse +it into: + +{ + "house_number": "123", + "road": "Fake Street", + "city": "Brooklyn", + "state": "NY", + "postcode": "12345" +} + +The model used is a greedy averaged perceptron rather than something +like a CRF since there's ample training data from OSM and the accuracy +on this task is already very high with the simpler model. + +However, it is still worth investigating CRFs as they are relatively fast +at prediction time for a small number of tags, can often achieve better +performance and are robust to correlated features, which may not be true +with the general error-driven averaged perceptron. 
// True for token types that should be skipped entirely during parsing.
// Fixed to use the macro parameter: the original body expanded
// `token.type`, which only compiled when a variable named `token`
// happened to be in scope at the call site and silently ignored the
// argument actually passed in (e.g. a call with `t.type` would still
// test `token.type`).
#define ADDRESS_PARSER_IS_IGNORABLE(token_type) ((token_type) == INVALID_CHAR || (token_type) == PERIOD)
*vocab; +} address_parser_t; + +// General usage + +address_parser_t *address_parser_new(void); +address_parser_t *get_address_parser(void); +bool address_parser_load(char *dir); + +void address_parser_response_destroy(address_parser_response_t *self); +address_parser_response_t *address_parser_parse(char *address, char *language, char *country, address_parser_context_t *context); +void address_parser_destroy(address_parser_t *self); + +char *address_parser_normalize_string(char *str); +void address_parser_normalize_token(cstring_array *array, char *str, token_t token); + +address_parser_context_t *address_parser_context_new(void); +void address_parser_context_destroy(address_parser_context_t *self); + +void address_parser_context_fill(address_parser_context_t *context, tokenized_string_t *tokenized_str, char *language, char *country); + +// Feature function +bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2); + +// I/O methods + +bool address_parser_load(char *dir); +bool address_parser_save(address_parser_t *self, char *output_dir); + +// Module setup/teardown + +bool address_parser_module_setup(char *dir); +void address_parser_module_teardown(void); + + +#endif \ No newline at end of file diff --git a/src/address_parser_io.c b/src/address_parser_io.c new file mode 100644 index 00000000..f6d762dc --- /dev/null +++ b/src/address_parser_io.c @@ -0,0 +1,180 @@ +#include "address_parser_io.h" + +address_parser_data_set_t *address_parser_data_set_init(char *filename) { + address_parser_data_set_t *data_set = malloc(sizeof(address_parser_data_set_t)); + data_set->f = fopen(filename, "r"); + if (data_set->f == NULL) { + free(data_set); + return NULL; + } + + data_set->tokens = token_array_new(); + data_set->tokenized_str = NULL; + data_set->labels = cstring_array_new(); + data_set->separators = uint32_array_new(); + data_set->language = char_array_new_size(MAX_LANGUAGE_LEN); + data_set->country = 
char_array_new_size(MAX_COUNTRY_CODE_LEN); + + return data_set; +} + + +bool address_parser_data_set_tokenize_line(char *input, token_array *tokens, uint32_array *separators, cstring_array *labels) { + size_t count = 0; + + token_t token; + + uint32_t i = 0; + char *str = NULL; + + cstring_array *pairs = cstring_array_split(input, " ", 1, &count); + size_t num_pairs = cstring_array_num_strings(pairs); + + char *label = NULL; + + // First populate token array + cstring_array_foreach(pairs, i, str, { + size_t pair_len = strlen(str); + + char *last_separator = strrchr(str, (int)'/'); + + if (last_separator == NULL) { + log_error("All tokens must be delimited with '/'\n"); + return false; + } + + uint32_t last_separator_index = last_separator - str; + + label = str + last_separator_index + 1; + + uint32_t last_separator_type; + if (strcmp(label, FIELD_SEPARATOR_LABEL) == 0) { + last_separator_type = uint32_array_pop(separators); + uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD | ADDRESS_SEPARATOR_FIELD_INTERNAL); + continue; + } else if (strcmp(label, SEPARATOR_LABEL) == 0) { + last_separator_type = uint32_array_pop(separators); + uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL); + continue; + } + + token.offset = pairs->indices->a[i]; + token.len = last_separator_index; + + scanner_t scanner = scanner_from_string(input + token.offset, token.len); + token.type = scan_token(&scanner); + if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) { + uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL); + continue; + } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) { + // shouldn't happen but just in case + continue; + } else { + uint32_array_push(separators, ADDRESS_SEPARATOR_NONE); + } + + cstring_array_add_string(labels, label); + + token_array_push(tokens, token); + }) + + cstring_array_destroy(pairs); + + return true; +} + + + +bool address_parser_data_set_next(address_parser_data_set_t *data_set) { + if (data_set == NULL) return false; + + char 
*line = file_getline(data_set->f); + if (line == NULL) { + return false; + } + + size_t token_count; + + cstring_array *fields = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count); + + free(line); + + if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) { + log_error("Token count did not match, ected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, token_count); + } + + char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE); + char *country = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_COUNTRY); + char *address = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_ADDRESS); + + log_debug("Doing: %s\n", address); + + char *normalized = address_parser_normalize_string(address); + bool is_normalized = normalized != NULL; + if (!is_normalized) { + log_debug("could not normalize\n"); + normalized = strdup(address); + } + + log_debug("Normalized: %s\n", normalized); + + token_array *tokens = data_set->tokens; + cstring_array *labels = data_set->labels; + uint32_array *separators = data_set->separators; + + token_array_clear(tokens); + cstring_array_clear(labels); + uint32_array_clear(separators); + size_t len = strlen(normalized); + + char_array_clear(data_set->country); + char_array_add(data_set->country, country); + + char_array_clear(data_set->language); + char_array_add(data_set->language, language); + + tokenized_string_t *tokenized_str = NULL; + + if (address_parser_data_set_tokenize_line(normalized, tokens, separators, labels)) { + // Add tokens as discrete strings for easier use in feature functions + bool copy_tokens = true; + tokenized_str = tokenized_string_from_tokens(normalized, tokens, copy_tokens); + } + + data_set->tokenized_str = tokenized_str; + + cstring_array_destroy(fields); + + return tokenized_str != NULL; +} + + +void address_parser_data_set_destroy(address_parser_data_set_t *self) { + if (self == NULL) return; + + if (self->f != NULL) { + fclose(self->f); + } + + if (self->tokens != NULL) 
{ + token_array_destroy(self->tokens); + } + + if (self->labels != NULL) { + cstring_array_destroy(self->labels); + } + + if (self->separators != NULL) { + uint32_array_destroy(self->separators); + } + + if (self->language != NULL) { + char_array_destroy(self->language); + } + + if (self->country != NULL) { + char_array_destroy(self->country); + } + + free(self); +} diff --git a/src/address_parser_io.h b/src/address_parser_io.h new file mode 100644 index 00000000..6ccb1ba0 --- /dev/null +++ b/src/address_parser_io.h @@ -0,0 +1,40 @@ +#ifndef ADDRESS_PARSER_IO_H +#define ADDRESS_PARSER_IO_H + +/* The bracketed system header names were lost in extraction; FILE requires <stdio.h> and bool requires <stdbool.h> — <stdlib.h> presumed, confirm against upstream */ +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> + +#include "address_parser.h" +#include "collections.h" +#include "file_utils.h" +#include "scanner.h" +#include "string_utils.h" + +#define AMBIGUOUS_LANGUAGE "xxx" +#define UNKNOWN_LANGUAGE "unk" + +/* Column order of the tab-separated training file; the last member doubles as the expected field count */ +enum address_parser_training_data_fields { + ADDRESS_PARSER_FIELD_LANGUAGE, + ADDRESS_PARSER_FIELD_COUNTRY, + ADDRESS_PARSER_FIELD_ADDRESS, + ADDRESS_PARSER_FILE_NUM_TOKENS +}; + +/* Streaming reader over the training file; per-example scratch buffers are reused across calls to address_parser_data_set_next */ +typedef struct address_parser_data_set { + FILE *f; + token_array *tokens; + tokenized_string_t *tokenized_str; + cstring_array *labels; + uint32_array *separators; + char_array *language; + char_array *country; +} address_parser_data_set_t; + + +address_parser_data_set_t *address_parser_data_set_init(char *filename); +bool address_parser_data_set_tokenize_line(char *input, token_array *tokens, uint32_array *separators, cstring_array *labels); +bool address_parser_data_set_next(address_parser_data_set_t *data_set); +void address_parser_data_set_destroy(address_parser_data_set_t *self); + +#endif \ No newline at end of file diff --git a/src/address_parser_test.c b/src/address_parser_test.c new file mode 100644 index 00000000..2ec1021a --- /dev/null +++ b/src/address_parser_test.c @@ -0,0 +1,196 @@ +#include "address_parser.h" +#include "address_parser_io.h" +#include "address_dictionary.h" +#include "averaged_perceptron_trainer.h" +#include "collections.h" +#include 
"constants.h" +#include "file_utils.h" +#include "geodb.h" +#include "normalize.h" + +#include "log/log.h" + + +typedef struct address_parser_test_results { + size_t num_errors; + size_t num_predictions; + size_t num_address_errors; + size_t num_address_predictions; + uint32_t *confusion; +} address_parser_test_results_t; + + +uint32_t get_class_index(address_parser_t *parser, char *name) { + uint32_t i; + char *str; + + cstring_array_foreach(parser->model->classes, i, str, { + if (strcmp(name, str) == 0) { + return i; + } + }) + + return parser->model->num_classes; +} + +#define EMPTY_ADDRESS_PARSER_TEST_RESULT (address_parser_test_results_t){0, 0, 0, 0, NULL} + +bool address_parser_test(address_parser_t *parser, char *filename, address_parser_test_results_t *result) { + if (filename == NULL) { + log_error("Filename was NULL\n"); + return NULL; + } + + uint32_t num_classes = parser->model->num_classes; + + result->confusion = calloc(num_classes * num_classes, sizeof(uint32_t)); + + address_parser_data_set_t *data_set = address_parser_data_set_init(filename); + + if (data_set == NULL) { + log_error("Error initializing data set\n"); + return NULL; + } + + address_parser_context_t *context = address_parser_context_new(); + + bool success = false; + + size_t examples = 0; + + bool logged = false; + + while (address_parser_data_set_next(data_set)) { + char *language = char_array_get_string(data_set->language); + if (strcmp(language, UNKNOWN_LANGUAGE) == 0 || strcmp(language, AMBIGUOUS_LANGUAGE) == 0) { + language = NULL; + } + char *country = char_array_get_string(data_set->country); + + address_parser_context_fill(context, data_set->tokenized_str, language, country); + + cstring_array *token_labels = cstring_array_new_size(data_set->tokenized_str->strings->str->n); + + char *prev_label = NULL; + + address_parser_response_t *response = NULL; + + size_t starting_errors = result->num_errors; + + if (averaged_perceptron_tagger_predict(parser->model, parser, context, 
context->features, token_labels, &address_parser_features, data_set->tokenized_str)) { + uint32_t i; + char *predicted; + cstring_array_foreach(token_labels, i, predicted, { + char *truth = cstring_array_get_string(data_set->labels, i); + + if (strcmp(predicted, truth) != 0) { + result->num_errors++; + + uint32_t predicted_index = get_class_index(parser, predicted); + uint32_t truth_index = get_class_index(parser, truth); + + /* get_class_index returns num_classes for an unknown label; indexing with that sentinel would write past the num_classes * num_classes confusion buffer */ + if (predicted_index < num_classes && truth_index < num_classes) { + result->confusion[predicted_index * num_classes + truth_index]++; + } + } + result->num_predictions++; + + }) + + } + + cstring_array_destroy(token_labels); + + /* any token error marks the whole address as an error */ + if (result->num_errors > starting_errors) { + result->num_address_errors++; + } + + result->num_address_predictions++; + + if (result->num_address_predictions % 1000 == 0 && result->num_address_predictions > 0) { + log_info("Did %zu examples\n", result->num_address_predictions); + } + + tokenized_string_destroy(data_set->tokenized_str); + data_set->tokenized_str = NULL; + + } + + address_parser_data_set_destroy(data_set); + address_parser_context_destroy(context); + + return true; +} + + +int main(int argc, char **argv) { + char *address_parser_dir = LIBPOSTAL_ADDRESS_PARSER_DIR; + + if (argc < 2) { + log_error("Usage: ./address_parser_test filename [parser_dir]\n"); + exit(EXIT_FAILURE); + } + + char *filename = argv[1]; + + if (argc > 2) { + address_parser_dir = argv[2]; + } + + /* dictionaries and geodb must be loaded before the parser can extract features */ + if (!address_dictionary_module_setup(NULL)) { + log_error("Could not load address dictionaries\n"); + exit(EXIT_FAILURE); + } + + log_info("address dictionary module loaded\n"); + + if (!geodb_module_setup(NULL)) { + log_error("Could not load geodb dictionaries\n"); + exit(EXIT_FAILURE); + } + + log_info("geodb module loaded\n"); + + if (!address_parser_load(address_parser_dir)) { + log_error("Could not initialize parser\n"); + exit(EXIT_FAILURE); + } + + log_info("Finished initialization\n"); + + address_parser_t *parser = get_address_parser(); + + address_parser_test_results_t results = 
EMPTY_ADDRESS_PARSER_TEST_RESULT; + + if (!address_parser_test(parser, filename, &results)) { + log_error("Error in testing\n"); + exit(EXIT_FAILURE); + } + + /* multiply by 100 so the value printed before the %% sign is an actual percentage, not a 0..1 fraction */ + printf("Errors: %zu / %zu (%f%%)\n", results.num_errors, results.num_predictions, (double)results.num_errors / results.num_predictions * 100.0); + printf("Addresses: %zu / %zu (%f%%)\n\n", results.num_address_errors, results.num_address_predictions, (double)results.num_address_errors / results.num_address_predictions * 100.0); + + + printf("Confusion matrix:\n\n"); + uint32_t num_classes = parser->model->num_classes; + for (uint32_t i = 0; i < num_classes; i++) { + for (uint32_t j = 0; j < num_classes; j++) { + if (i == j) { + continue; + } + uint32_t class_errors = results.confusion[i * num_classes + j]; + + if (class_errors > 0) { + char *predicted = cstring_array_get_string(parser->model->classes, i); + char *truth = cstring_array_get_string(parser->model->classes, j); + + /* %u matches the uint32_t argument */ + printf("(%s, %s): %u\n", predicted, truth, class_errors); + } + } + } + + free(results.confusion); + + address_parser_module_teardown(); + + address_dictionary_module_teardown(); + geodb_module_teardown(); +} diff --git a/src/address_parser_train.c b/src/address_parser_train.c new file mode 100644 index 00000000..d065088d --- /dev/null +++ b/src/address_parser_train.c @@ -0,0 +1,300 @@ +#include "address_parser.h" +#include "address_parser_io.h" +#include "address_dictionary.h" +#include "averaged_perceptron_trainer.h" +#include "collections.h" +#include "constants.h" +#include "file_utils.h" +#include "geodb.h" +#include "shuffle.h" + +#include "log/log.h" + +// Training + +#define DEFAULT_ITERATIONS 5 + +#define MIN_VOCAB_COUNT 5 + +/* First pass over the training file: builds the parser's vocabulary trie from normalized tokens seen at least MIN_VOCAB_COUNT times. Returns NULL on failure */ +address_parser_t *address_parser_init(char *filename) { + if (filename == NULL) { + log_error("Filename was NULL\n"); + return NULL; + } + + address_parser_data_set_t *data_set = address_parser_data_set_init(filename); + + if (data_set == NULL) { + log_error("Error initializing data set\n"); + return NULL; + } + + + 
address_parser_t *parser = address_parser_new(); + if (parser == NULL) { + log_error("Error allocating parser\n"); + return NULL; + } + + khash_t(str_uint32) *vocab = kh_init(str_uint32); + + khiter_t k; + char *str; + + uint32_t vocab_size = 0; + size_t examples = 0; + + const char *word; + + uint32_t i; + char *token; + char *normalized; + uint32_t count; + + char_array *token_array = char_array_new(); + + while (address_parser_data_set_next(data_set)) { + tokenized_string_t *tokenized_str = data_set->tokenized_str; + + if (tokenized_str == NULL) { + log_error("tokenized str is NULL\n"); + /* this function returns a pointer, so the failure value must be NULL, not false; free the strdup'd keys before discarding the hash. NOTE(review): parser is leaked here — address_parser_new does not zero its fields, so address_parser_destroy is unsafe to call; confirm */ + kh_foreach(vocab, word, count, { + free((char *)word); + }) + kh_destroy(str_uint32, vocab); + char_array_destroy(token_array); + address_parser_data_set_destroy(data_set); + return NULL; + } + + str = tokenized_str->str; + + cstring_array_foreach(tokenized_str->strings, i, token, { + token_t t = tokenized_str->tokens->a[i]; + + char_array_clear(token_array); + add_normalized_token(token_array, str, t, ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS); + if (token_array->n == 0) { + continue; + } + + normalized = char_array_get_string(token_array); + + + k = kh_get(str_uint32, vocab, normalized); + if (k == kh_end(vocab)) { + int ret; + char *key = strdup(normalized); + k = kh_put(str_uint32, vocab, key, &ret); + if (ret < 0) { + log_error("Error in kh_put\n"); + free(key); + tokenized_string_destroy(tokenized_str); + kh_foreach(vocab, word, count, { + free((char *)word); + }) + kh_destroy(str_uint32, vocab); + char_array_destroy(token_array); + address_parser_data_set_destroy(data_set); + return NULL; + } + kh_value(vocab, k) = 1; + vocab_size++; + } else { + kh_value(vocab, k)++; + } + + }) + + tokenized_string_destroy(tokenized_str); + examples++; + if (examples % 10000 == 0 && examples != 0) { + log_info("Counting vocab: did %zu examples\n", examples); + } + } + + log_debug("Done with vocab, total size=%d\n", vocab_size); + + /* Prune rare tokens. kh_exist must be checked before reading the bucket's key: empty/deleted buckets hold garbage */ + for (k = kh_begin(vocab); k != kh_end(vocab); ++k) { + if (!kh_exist(vocab, k)) { + continue; + } + char *word = (char *)kh_key(vocab, k); + uint32_t count = kh_value(vocab, k); + if (count < MIN_VOCAB_COUNT) { + kh_del(str_uint32, vocab, k); 
+ free(word); + } + } + + parser->vocab = trie_new_from_hash(vocab); + + /* kh_del marks buckets deleted, so keys pruned (and already freed) above are skipped by kh_exist here — no double free */ + for (k = kh_begin(vocab); k != kh_end(vocab); ++k) { + if (!kh_exist(vocab, k)) { + continue; + } + char *word = (char *)kh_key(vocab, k); + free(word); + } + + kh_destroy(str_uint32, vocab); + + char_array_destroy(token_array); + address_parser_data_set_destroy(data_set); + if (parser->vocab == NULL) { + log_error("Error initializing vocabulary\n"); + address_parser_destroy(parser); + return NULL; + } + + return parser; +} + + + +/* One pass over the training file, feeding every example to the averaged perceptron trainer. Returns false on the first failed example */ +bool address_parser_train_epoch(address_parser_t *self, averaged_perceptron_trainer_t *trainer, char *filename) { + if (filename == NULL) { + log_error("Filename was NULL\n"); + return false; + } + + address_parser_data_set_t *data_set = address_parser_data_set_init(filename); + if (data_set == NULL) { + log_error("Error initializing data set\n"); + return false; + } + + address_parser_context_t *context = address_parser_context_new(); + + bool success = false; + + size_t examples = 0; + size_t errors = trainer->num_errors; + + while (address_parser_data_set_next(data_set)) { + char *language = char_array_get_string(data_set->language); + if (strcmp(language, UNKNOWN_LANGUAGE) == 0 || strcmp(language, AMBIGUOUS_LANGUAGE) == 0) { + language = NULL; + } + char *country = char_array_get_string(data_set->country); + + address_parser_context_fill(context, data_set->tokenized_str, language, country); + + bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, &address_parser_features, data_set->tokenized_str, data_set->labels); + + if (!example_success) { + log_error("Error training example\n"); + /* free the current example before bailing; the data set destructor does not own tokenized_str */ + tokenized_string_destroy(data_set->tokenized_str); + data_set->tokenized_str = NULL; + goto exit_epoch_training_started; + } + + tokenized_string_destroy(data_set->tokenized_str); + data_set->tokenized_str = NULL; + + examples++; + if (examples % 1000 == 0 
&& examples > 0) { + log_info("Iter %d: Did %zu examples with %llu errors\n", trainer->iterations, examples, trainer->num_errors - errors); + errors = trainer->num_errors; + } + + } + + success = true; + +exit_epoch_training_started: + address_parser_data_set_destroy(data_set); + address_parser_context_destroy(context); + + return success; +} + +/* Runs num_iterations training epochs and installs the averaged weights on self->model. Returns false on any failure */ +bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_iterations) { + averaged_perceptron_trainer_t *trainer = averaged_perceptron_trainer_new(); + if (trainer == NULL) { + log_error("Error allocating trainer\n"); + return false; + } + + for (uint32_t iter = 0; iter < num_iterations; iter++) { + log_info("Doing epoch %d\n", iter); + + trainer->iterations = iter; + + log_debug("Shuffling\n"); + + /* + if (!shuffle_file(filename)) { + log_error("Error in shuffle\n"); + averaged_perceptron_trainer_destroy(trainer); + return false; + } + + log_debug("Shuffle complete\n"); + */ + if (!address_parser_train_epoch(self, trainer, filename)) { + log_error("Error in epoch\n"); + averaged_perceptron_trainer_destroy(trainer); + return false; + } + } + + log_debug("Done with training, averaging weights\n"); + + /* NOTE(review): finalize appears to consume the trainer — confirm it frees it on both success and failure */ + self->model = averaged_perceptron_trainer_finalize(trainer); + if (self->model == NULL) { + log_error("Error averaging weights\n"); + return false; + } + + return true; +} + + +int main(int argc, char **argv) { + if (argc < 3) { + printf("Usage: ./address_parser_train filename output_dir\n"); + exit(EXIT_FAILURE); + } + + #if !defined(HAVE_SHUF) && !defined(HAVE_GSHUF) + log_error("shuf or gshuf must be installed to train address parser. 
Please install and reconfigure libpostal\n"); + exit(EXIT_FAILURE); + #endif + + char *filename = argv[1]; + char *output_dir = argv[2]; + + /* Dictionaries and geodb must be loaded before feature extraction can run */ + if (!address_dictionary_module_setup(NULL)) { + log_error("Could not load address dictionaries\n"); + exit(EXIT_FAILURE); + } + + log_info("address dictionary module loaded\n"); + + if (!geodb_module_setup(NULL)) { + log_error("Could not load geodb dictionaries\n"); + exit(EXIT_FAILURE); + } + + log_info("geodb module loaded\n"); + + /* First pass over the training file: build the vocabulary */ + address_parser_t *parser = address_parser_init(filename); + + if (parser == NULL) { + log_error("Could not initialize parser\n"); + exit(EXIT_FAILURE); + } + + log_info("Finished initialization\n"); + + /* Subsequent passes: train the averaged perceptron */ + if (!address_parser_train(parser, filename, DEFAULT_ITERATIONS)) { + log_error("Error in training\n"); + exit(EXIT_FAILURE); + } + + log_debug("Finished training\n"); + + if (!address_parser_save(parser, output_dir)) { + log_error("Error saving address parser\n"); + exit(EXIT_FAILURE); + } + + address_parser_destroy(parser); + + address_dictionary_module_teardown(); + geodb_module_teardown(); + log_debug("Done\n"); +}