From acd953ce51707316f6bd62657074476f0f368306 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 29 Dec 2016 02:17:05 -0500 Subject: [PATCH] [parser] first pass at new parser feature extraction - removing geodb phrases - use Latin-ASCII-simple transliteration (no umlauts, etc.) - no digit normalization for admin component phrases and postcodes - tag = START + word, special feature for first word in the sequence - add the new admin boundary categories - for hyphenated non-phrase words, add each sub-word - for rare and unknown words, add ngram features of 3-6 characters with underscores to indicate beginnings and endings (similar to language classifier features) - defines notion of "rare words" (known words with a frequency <= n where n > the unknown word threshold), so known words can share statistical strength with artificial and real unknown words --- src/address_parser.c | 764 +++++++++++++++++++++++++++---------------- src/address_parser.h | 23 +- 2 files changed, 492 insertions(+), 295 deletions(-) diff --git a/src/address_parser.c b/src/address_parser.c index 7b7ea47d..2306d85c 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -1,7 +1,7 @@ #include "address_parser.h" #include "address_dictionary.h" #include "features.h" -#include "geodb.h" +#include "ngrams.h" #include "scanner.h" #include "log/log.h" @@ -13,6 +13,8 @@ #define UNKNOWN_WORD "UNKNOWN" #define UNKNOWN_NUMERIC "UNKNOWN_NUMERIC" +#define DEFAULT_RARE_WORD_THRESHOLD 50 + static address_parser_t *parser = NULL; //#define PRINT_ADDRESS_PARSER_FEATURES @@ -21,20 +23,29 @@ typedef enum { ADDRESS_PARSER_NULL_PHRASE, ADDRESS_PARSER_DICTIONARY_PHRASE, ADDRESS_PARSER_COMPONENT_PHRASE, - ADDRESS_PARSER_GEODB_PHRASE + ADDRESS_PARSER_PREFIX_PHRASE, + ADDRESS_PARSER_SUFFIX_PHRASE } address_parser_phrase_type_t; -address_parser_t *address_parser_new(void) { + +static parser_options_t PARSER_DEFAULT_OPTIONS = { + .rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD +}; + +address_parser_t *address_parser_new_options(parser_options_t options) { address_parser_t *parser = malloc(sizeof(address_parser_t)); + parser->options = options; return parser; } +address_parser_t *address_parser_new(void) { + return address_parser_new_options(PARSER_DEFAULT_OPTIONS); +} address_parser_t *get_address_parser(void) { return parser; } - bool address_parser_save(address_parser_t *self, char *output_dir) { if (self == NULL || output_dir == NULL) return false; @@ -158,7 +169,7 @@ inline void address_parser_normalize_token(cstring_array *array, char *str, toke } inline void address_parser_normalize_phrase_token(cstring_array *array, char *str, token_t token) { - normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_PHRASE_TOKEN_OPTIONS); + normalize_token(array, str, token, ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS); } inline char *address_parser_normalize_string(char *str) { @@ -193,16 +204,24 @@ void address_parser_context_destroy(address_parser_context_t *self) { char_array_destroy(self->long_context_component_phrase); } - if (self->geodb_phrase != NULL) { - char_array_destroy(self->geodb_phrase); + if (self->prefix_phrase != NULL) { + char_array_destroy(self->prefix_phrase); } - if (self->context_geodb_phrase != NULL) { - char_array_destroy(self->context_geodb_phrase); + if (self->context_prefix_phrase != NULL) { + char_array_destroy(self->context_prefix_phrase); } - if (self->long_context_geodb_phrase != NULL) { - char_array_destroy(self->long_context_geodb_phrase); + if (self->suffix_phrase != NULL) { + 
char_array_destroy(self->suffix_phrase); + } + + if (self->context_suffix_phrase != NULL) { + char_array_destroy(self->context_suffix_phrase); + } + + if (self->ngrams != NULL) { + cstring_array_destroy(self->ngrams); } if (self->sub_token != NULL) { @@ -221,6 +240,18 @@ void address_parser_context_destroy(address_parser_context_t *self) { cstring_array_destroy(self->normalized); } + if (self->normalized_tokens != NULL) { + token_array_destroy(self->normalized_tokens); + } + + if (self->normalized_admin != NULL) { + cstring_array_destroy(self->normalized_admin); + } + + if (self->normalized_admin_tokens != NULL) { + token_array_destroy(self->normalized_admin_tokens); + } + if (self->features != NULL) { cstring_array_destroy(self->features); } @@ -237,14 +268,6 @@ void address_parser_context_destroy(address_parser_context_t *self) { int64_array_destroy(self->address_phrase_memberships); } - if (self->geodb_phrases != NULL) { - phrase_array_destroy(self->geodb_phrases); - } - - if (self->geodb_phrase_memberships != NULL) { - int64_array_destroy(self->geodb_phrase_memberships); - } - if (self->component_phrases != NULL) { phrase_array_destroy(self->component_phrases); } @@ -253,6 +276,14 @@ void address_parser_context_destroy(address_parser_context_t *self) { int64_array_destroy(self->component_phrase_memberships); } + if (self->prefix_phrases != NULL) { + phrase_array_destroy(self->prefix_phrases); + } + + if (self->suffix_phrases != NULL) { + phrase_array_destroy(self->suffix_phrases); + } + free(self); } @@ -294,18 +325,28 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } - context->geodb_phrase = char_array_new(); - if (context->geodb_phrase == NULL) { + context->prefix_phrase = char_array_new(); + if (context->prefix_phrase == NULL) { goto exit_address_parser_context_allocated; } - context->context_geodb_phrase = char_array_new(); - if (context->context_geodb_phrase == NULL) { + context->context_prefix_phrase = char_array_new(); + if (context->context_prefix_phrase == NULL) { goto exit_address_parser_context_allocated; } - context->long_context_geodb_phrase = char_array_new(); - if (context->long_context_geodb_phrase == NULL) { + context->suffix_phrase = char_array_new(); + if (context->suffix_phrase == NULL) { + goto exit_address_parser_context_allocated; + } + + context->context_suffix_phrase = char_array_new(); + if (context->context_suffix_phrase == NULL) { + goto exit_address_parser_context_allocated; + } + + context->ngrams = cstring_array_new(); + if (context->ngrams == NULL) { goto exit_address_parser_context_allocated; } @@ -329,6 +370,21 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } + context->normalized_tokens = token_array_new(); + if (context->normalized_tokens == NULL) { + goto exit_address_parser_context_allocated; + } + + context->normalized_admin = cstring_array_new(); + if (context->normalized_admin == NULL) { + goto exit_address_parser_context_allocated; + } + + context->normalized_admin_tokens = token_array_new(); + if (context->normalized_admin_tokens == NULL) { + goto exit_address_parser_context_allocated; + } + context->features = cstring_array_new(); if (context->features == NULL) { goto exit_address_parser_context_allocated; @@ -349,16 +405,6 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } - context->geodb_phrases = phrase_array_new(); - if (context->geodb_phrases == NULL) { - 
goto exit_address_parser_context_allocated; - } - - context->geodb_phrase_memberships = int64_array_new(); - if (context->geodb_phrase_memberships == NULL) { - goto exit_address_parser_context_allocated; - } - context->component_phrases = phrase_array_new(); if (context->component_phrases == NULL) { goto exit_address_parser_context_allocated; @@ -369,6 +415,16 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } + context->prefix_phrases = phrase_array_new(); + if (context->prefix_phrases == NULL) { + goto exit_address_parser_context_allocated; + } + + context->suffix_phrases = phrase_array_new(); + if (context->suffix_phrases == NULL) { + goto exit_address_parser_context_allocated; + } + return context; exit_address_parser_context_allocated: @@ -376,9 +432,30 @@ exit_address_parser_context_allocated: return NULL; } -void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) { - int64_t i, j; +inline static void fill_phrase_memberships(phrase_array *phrases, int64_array *phrase_memberships, size_t len) { + int64_t i = 0; + for (int64_t j = 0; j < phrases->n; j++) { + phrase_t phrase = phrases->a[j]; + for (; i < phrase.start; i++) { + int64_array_push(phrase_memberships, NULL_PHRASE_MEMBERSHIP); + log_debug("token i=%lld, null phrase membership\n", i); + } + + for (i = phrase.start; i < phrase.start + phrase.len; i++) { + log_debug("token i=%lld, phrase membership=%lld\n", i, j); + int64_array_push(phrase_memberships, j); + } + } + + for (; i < len; i++) { + log_debug("token i=%lld, null phrase membership\n", i); + int64_array_push(phrase_memberships, NULL_PHRASE_MEMBERSHIP); + } +} + + +void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) { uint32_t token_index; char *word; phrase_t phrase; @@ -387,16 +464,52 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars context->country = country; cstring_array *normalized = context->normalized; + token_array *normalized_tokens = context->normalized_tokens; cstring_array_clear(normalized); + token_array_clear(normalized_tokens); + + cstring_array *normalized_admin = context->normalized_admin; + token_array *normalized_admin_tokens = context->normalized_admin_tokens; + cstring_array_clear(normalized_admin); + token_array_clear(normalized_admin_tokens); char *str = tokenized_str->str; token_array *tokens = tokenized_str->tokens; cstring_array_foreach(tokenized_str->strings, token_index, word, { token_t token = tokens->a[token_index]; + + size_t token_offset = normalized->str->n; address_parser_normalize_token(normalized, str, token); + size_t token_len; + if (normalized->str->n > token_offset) { + token_len = normalized->str->n - 1 - token_offset; + } else { + token_len = 0; + } + token_t normalized_token; + normalized_token.offset = token_offset; + normalized_token.len = token_len; + normalized_token.type = token.type; + token_array_push(normalized_tokens, normalized_token); + + size_t admin_token_offset = normalized_admin->str->n; + address_parser_normalize_phrase_token(normalized_admin, str, token); + size_t admin_token_len; + if (normalized_admin->str->n > admin_token_offset) { + admin_token_len = normalized_admin->str->n - 1 - admin_token_offset; + } else { + admin_token_len = 0; + } + token_t normalized_admin_token; + normalized_admin_token.offset = 
admin_token_offset; + normalized_admin_token.len = admin_token_len; + normalized_admin_token.type = token.type; + token_array_push(normalized_admin_tokens, normalized_admin_token); }) + char *normalized_str = normalized->str->a; + char *normalized_str_admin = normalized_admin->str->a; /* Address dictionary phrases @@ -412,113 +525,50 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars phrase_array_clear(context->address_dictionary_phrases); int64_array_clear(context->address_phrase_memberships); - i = 0; phrase_array *address_dictionary_phrases = context->address_dictionary_phrases; int64_array *address_phrase_memberships = context->address_phrase_memberships; - if (search_address_dictionaries_tokens_with_phrases(str, tokens, context->language, &context->address_dictionary_phrases)) { - for (j = 0; j < address_dictionary_phrases->n; j++) { - phrase = address_dictionary_phrases->a[j]; + size_t num_tokens = tokens->n; - for (; i < phrase.start; i++) { - int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - log_debug("token i=%lld, null phrase membership\n", i); - } + bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, context->language, &context->address_dictionary_phrases); + fill_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens); - for (i = phrase.start; i < phrase.start + phrase.len; i++) { - log_debug("token i=%lld, phrase membership=%lld\n", i, j); - int64_array_push(address_phrase_memberships, j); - } - } + for (size_t i = 0; i < num_tokens; i++) { + token_t token = tokens->a[i]; + + phrase_t prefix_phrase = search_address_dictionaries_prefix(str + token.offset, token.len, language); + phrase_array_push(context->prefix_phrases, prefix_phrase); + + phrase_t suffix_phrase = search_address_dictionaries_suffix(str + token.offset, token.len, language); + phrase_array_push(context->suffix_phrases, suffix_phrase); } - for (; i < tokens->n; i++) { - log_debug("token i=%lld, null phrase membership\n", i); - int64_array_push(address_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } + /* + Component phrases + ----------------- + Precomputed phrases for cities, states, countries, etc. from the training data - phrase_array_clear(context->geodb_phrases); - int64_array_clear(context->geodb_phrase_memberships); - - phrase_array *geodb_phrases = context->geodb_phrases; - int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships; - i = 0; - - if (search_geodb_tokens_with_phrases(str, tokens, &context->geodb_phrases)) { - for (j = 0; j < geodb_phrases->n; j++) { - phrase = geodb_phrases->a[j]; - - for (; i < phrase.start; i++) { - log_debug("token i=%lld, null geo phrase membership\n", i); - int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } - - for (i = phrase.start; i < phrase.start + phrase.len; i++) { - log_debug("token i=%lld, geo phrase membership=%lld\n", i, j); - int64_array_push(geodb_phrase_memberships, j); - } - } - } - - for (; i < tokens->n; i++) { - log_debug("token i=%lld, null geo phrase membership\n", i); - int64_array_push(geodb_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } + Note: if the training data has lots of mislabeled examples (e.g. Brooklyn as city + instead of a city_district), this may cause the parser to get confused. It will + penalize itself for getting the wrong answer when really the underlying data + is simply ambiguous. 
In the OSM training data a lot of work has been done to + ensure that there's little or no systematic mislabeling. As such, other data + sets shouldn't be added willy-nilly unless the labels are consistent. + */ phrase_array_clear(context->component_phrases); int64_array_clear(context->component_phrase_memberships); - i = 0; phrase_array *component_phrases = context->component_phrases; int64_array *component_phrase_memberships = context->component_phrase_memberships; - if (trie_search_tokens_with_phrases(parser->phrase_types, str, tokens, &component_phrases)) { - for (j = 0; j < component_phrases->n; j++) { - phrase = component_phrases->a[j]; - - for (; i < phrase.start; i++) { - log_debug("token i=%lld, null component phrase membership\n", i); - int64_array_push(component_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } - - for (i = phrase.start; i < phrase.start + phrase.len; i++) { - log_debug("token i=%lld, component phrase membership=%lld\n", i, j); - int64_array_push(component_phrase_memberships, j); - } - } - } - - for (; i < tokens->n; i++) { - log_debug("token i=%lld, null component phrase membership\n", i); - int64_array_push(component_phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } + bool have_component_phrases = trie_search_tokens_with_phrases(parser->phrase_types, normalized_str_admin, normalized_admin_tokens, &component_phrases); + fill_phrase_memberships(component_phrases, component_phrase_memberships, num_tokens); } -static inline char *get_phrase_string_array(cstring_array *str, char_array *phrase_tokens, phrase_t phrase) { - char_array_clear(phrase_tokens); - - size_t phrase_end = phrase.start + phrase.len; - - for (int k = phrase.start; k < phrase_end; k++) { - char *w = cstring_array_get_string(str, k); - char_array_append(phrase_tokens, w); - if (k < phrase_end - 1) { - char_array_append(phrase_tokens, " "); - } - } - char_array_terminate(phrase_tokens); - - return char_array_get_string(phrase_tokens); -} - - -static inline char *get_phrase_string(tokenized_string_t *str, char_array *phrase_tokens, phrase_t phrase) { - return get_phrase_string_array(str->strings, phrase_tokens, phrase); -} - -static inline phrase_t get_phrase(phrase_array *phrases, int64_array *phrase_memberships, uint32_t i) { - if (phrases == NULL || phrase_memberships == NULL || i > phrases->n - 1) { +static inline phrase_t phrase_at_index(phrase_array *phrases, int64_array *phrase_memberships, uint32_t i) { + if (phrases == NULL || phrase_memberships == NULL || i > phrase_memberships->n - 1) { return NULL_PHRASE; } @@ -542,9 +592,10 @@ static inline address_parser_phrase_t word_or_phrase_at_index(tokenized_string_t address_parser_phrase_t response; char *phrase_string = NULL; - phrase = get_phrase(context->address_dictionary_phrases, context->address_phrase_memberships, i); + phrase = phrase_at_index(context->address_dictionary_phrases, context->address_phrase_memberships, i); + if (phrase.len > 0) { - phrase_string = get_phrase_string(tokenized, context->context_phrase, phrase), + phrase_string = cstring_array_get_phrase(context->normalized, context->context_phrase, phrase), response = (address_parser_phrase_t){ phrase_string, @@ -556,16 +607,12 @@ static inline address_parser_phrase_t word_or_phrase_at_index(tokenized_string_t address_parser_types_t types; - phrase = get_phrase(context->component_phrases, context->component_phrase_memberships, i); + phrase = phrase_at_index(context->component_phrases, context->component_phrase_memberships, i); if (phrase.len > 0) { types.value = phrase.data; 
uint32_t component_phrase_types = types.components; - if (component_phrase_types != ADDRESS_COMPONENT_POSTAL_CODE) { - phrase_string = get_phrase_string(tokenized, context->context_component_phrase, phrase); - } else { - phrase_string = get_phrase_string_array(context->normalized, context->context_component_phrase, phrase); - } + phrase_string = cstring_array_get_phrase(context->normalized_admin, context->context_component_phrase, phrase); response = (address_parser_phrase_t){ phrase_string, @@ -575,31 +622,56 @@ static inline address_parser_phrase_t word_or_phrase_at_index(tokenized_string_t return response; } - geodb_value_t geo; + phrase_t prefix_phrase = context->prefix_phrases->a[i]; + phrase_t suffix_phrase = context->suffix_phrases->a[i]; - phrase = get_phrase(context->geodb_phrases, context->geodb_phrase_memberships, i); - if (phrase.len > 0) { - geo.value = phrase.data; - uint32_t geodb_phrase_types = geo.components; - - if (geodb_phrase_types != GEONAMES_ADDRESS_COMPONENT_POSTCODE) { - phrase_string = get_phrase_string(tokenized, context->context_geodb_phrase, phrase); - } else { - phrase_string = get_phrase_string_array(context->normalized, context->context_geodb_phrase, phrase); - } - - response = (address_parser_phrase_t){ - phrase_string, - ADDRESS_PARSER_GEODB_PHRASE, - phrase - }; - return response; - - } + uint32_t expansion_index; + address_expansion_value_t *expansion_value; cstring_array *normalized = context->normalized; char *word = cstring_array_get_string(normalized, i); + token_t token = tokenized->tokens->a[i]; + + // Suffixes like straße, etc. + if (suffix_phrase.len > 0) { + expansion_index = suffix_phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + + if (expansion_value->components & ADDRESS_STREET) { + char_array_clear(context->context_suffix_phrase); + size_t suffix_len = suffix_phrase.len; + char_array_add_len(context->context_suffix_phrase, word + (token.len - suffix_phrase.len), suffix_len); + char *suffix = char_array_get_string(context->suffix_phrase); + response = (address_parser_phrase_t){ + suffix, + ADDRESS_PARSER_SUFFIX_PHRASE, + suffix_phrase + }; + return response; + } + } + + // Prefixes like hinter, etc. + if (prefix_phrase.len > 0) { + expansion_index = prefix_phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + + // Don't include elisions like l', d', etc. 
which are in the ADDRESS_ANY category + if (expansion_value->components ^ ADDRESS_ANY) { + char_array_clear(context->context_prefix_phrase); + size_t prefix_len = prefix_phrase.len; + char_array_add_len(context->context_prefix_phrase, word, prefix_len); + char *prefix = char_array_get_string(context->context_prefix_phrase); + response = (address_parser_phrase_t){ + prefix, + ADDRESS_PARSER_PREFIX_PHRASE, + prefix_phrase + }; + return response; + } + } + response = (address_parser_phrase_t){ word, ADDRESS_PARSER_NULL_PHRASE, @@ -618,7 +690,7 @@ static inline int64_t phrase_index(int64_array *phrase_memberships, size_t start int64_t membership; if (direction == -1) { - for (size_t idx = start; idx >= 0; idx--) { + for (ssize_t idx = start; idx >= 0; idx--) { if (memberships[idx] != NULL_PHRASE_MEMBERSHIP) { return (int64_t)idx; } @@ -635,7 +707,31 @@ static inline int64_t phrase_index(int64_array *phrase_memberships, size_t start return -1; } -static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string, char *prev2, char *prev) { + +static inline int64_t next_numeric_token_index(tokenized_string_t *tokenized, address_parser_context_t *context, size_t start) { + if (context == NULL) return -1; + + token_array *tokens = tokenized->tokens; + + if (tokens == NULL || start > tokens->n - 1) return -1; + + phrase_t phrase; + + for (size_t i = start; i < tokens->n; i++) { + if (context->address_phrase_memberships->a[i] == NULL_PHRASE_MEMBERSHIP && + context->component_phrase_memberships->a[i] == NULL_PHRASE_MEMBERSHIP) { + token_t token = tokens->a[i]; + if (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) { + return i; + } + } + } + + return -1; +} + + +static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string) { if (phrase_types == component) { log_debug("phrase=%s, phrase_types=%d\n", phrase_string, phrase_types); feature_array_add(features, 2, "unambiguous phrase type", phrase_type); @@ -645,6 +741,42 @@ static inline void add_phrase_features(cstring_array *features, uint32_t phrase_ } } +static bool add_ngram_features(cstring_array *features, char *feature_prefix, cstring_array *ngrams, char *str, size_t n, size_t prefix_len, size_t suffix_len) { + if (features == NULL || ngrams == NULL) return false; + + size_t len = strlen(str); + + if (n == 0 || n > len - 1) return false; + + size_t ngram_num_chars_len = INT64_MAX_STRING_SIZE; + char ngram_num_chars[ngram_num_chars_len]; + sprintf(ngram_num_chars, "%zu", n); + + bool known_prefix = prefix_len > 0; + bool known_suffix = suffix_len > 0; + + cstring_array_clear(ngrams); + if (!add_ngrams(ngrams, n, str + prefix_len, len - suffix_len - prefix_len, !known_prefix, !known_suffix)) { + return false; + } + + uint32_t idx; + char *ngram; + + if (feature_prefix != NULL) { + cstring_array_foreach(ngrams, idx, ngram, { + feature_array_add(features, 4, feature_prefix, "ngrams", ngram_num_chars, ngram); + }) + } else { + cstring_array_foreach(ngrams, idx, ngram, { + feature_array_add(features, 3, "ngrams", ngram_num_chars, ngram); + }) + } + + return true; +} + + /* address_parser_features ----------------------- @@ -672,7 +804,7 @@ char *prev2: the predicted tag at index i - 2 */ -bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t i, char *prev, char *prev2) { +bool address_parser_features(void *self, void *ctx, tokenized_string_t 
*tokenized, uint32_t idx, char *prev, char *prev2) { if (self == NULL || ctx == NULL) return false; address_parser_t *parser = (address_parser_t *)self; @@ -684,8 +816,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize phrase_array *address_dictionary_phrases = context->address_dictionary_phrases; int64_array *address_phrase_memberships = context->address_phrase_memberships; - phrase_array *geodb_phrases = context->geodb_phrases; - int64_array *geodb_phrase_memberships = context->geodb_phrase_memberships; phrase_array *component_phrases = context->component_phrases; int64_array *component_phrase_memberships = context->component_phrase_memberships; cstring_array *normalized = context->normalized; @@ -694,14 +824,16 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize cstring_array_clear(features); - token_t token = tokenized->tokens->a[i]; + token_array *tokens = tokenized->tokens; - ssize_t last_index = (ssize_t)i - 1; - ssize_t next_index = (ssize_t)i + 1; + token_t token = tokens->a[idx]; - char *word = cstring_array_get_string(normalized, i); + ssize_t last_index = (ssize_t)idx - 1; + ssize_t next_index = (ssize_t)idx + 1; + + char *word = cstring_array_get_string(normalized, idx); if (word == NULL) { - log_error("got NULL word at %d\n", i); + log_error("got NULL word at %d\n", idx); return false; } @@ -709,19 +841,18 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize log_debug("word=%s\n", word); - expansion_value_t expansion; - phrase_t phrase = NULL_PHRASE; char *phrase_string = NULL; - char *geo_phrase_string = NULL; char *component_phrase_string = NULL; - int64_t address_phrase_index = address_phrase_memberships->a[i]; + int64_t address_phrase_index = address_phrase_memberships->a[idx]; char_array *phrase_tokens = context->phrase; char_array *component_phrase_tokens = context->component_phrase; - char_array *geodb_phrase_tokens = context->geodb_phrase; + + uint32_t expansion_index; + address_expansion_value_t *expansion_value; bool add_word_feature = true; @@ -733,52 +864,29 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize last_index = (ssize_t)phrase.start - 1; next_index = (ssize_t)phrase.start + phrase.len; - expansion.value = phrase.data; - uint32_t address_phrase_types = expansion.components; - - log_debug("expansion=%d\n", expansion.value); + expansion_index = phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + uint32_t address_phrase_types = 0; + if (expansion_value != NULL) { + address_phrase_types = expansion_value->components; + } else { + log_warn("expansion_value is NULL. 
word=%s, sentence=%s\n", word, tokenized->str); + } if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME)) { - phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase); + phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase); add_word_feature = false; log_debug("phrase_string=%s\n", phrase_string); - add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string, prev2, prev); - add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string, prev2, prev); - add_phrase_features(features, address_phrase_types, ADDRESS_HOUSE_NUMBER, "house_number", phrase_string, prev2, prev); + add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string); + add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string); + add_phrase_features(features, address_phrase_types, ADDRESS_HOUSE_NUMBER, "house_number", phrase_string); } } - // Prefixes like hinter, etc. - phrase_t prefix_phrase = search_address_dictionaries_prefix(word, token.len, language); - if (prefix_phrase.len > 0) { - expansion.value = prefix_phrase.data; - // Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category - if (expansion.components ^ ADDRESS_ANY) { - char_array_clear(phrase_tokens); - char_array_add_len(phrase_tokens, word, prefix_phrase.len); - char *prefix = char_array_get_string(phrase_tokens); - log_debug("got prefix: %s\n", prefix); - feature_array_add(features, 2, "prefix", prefix); - } - } - - // Suffixes like straße, etc. - phrase_t suffix_phrase = search_address_dictionaries_suffix(word, token.len, language); - if (suffix_phrase.len > 0) { - expansion.value = suffix_phrase.data; - if (expansion.components & ADDRESS_STREET) { - char_array_clear(phrase_tokens); - char_array_add_len(phrase_tokens, word + (token.len - suffix_phrase.len), suffix_phrase.len); - char *suffix = char_array_get_string(phrase_tokens); - log_debug("got suffix: %s\n", suffix); - feature_array_add(features, 2, "suffix", suffix); - } - } - - int64_t component_phrase_index = component_phrase_memberships->a[i]; + int64_t component_phrase_index = component_phrase_memberships->a[idx]; phrase = NULL_PHRASE; address_parser_types_t types; @@ -789,7 +897,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (component_phrase_index != NULL_PHRASE_MEMBERSHIP) { phrase = component_phrases->a[component_phrase_index]; - component_phrase_string = get_phrase_string(tokenized, component_phrase_tokens, phrase); + component_phrase_string = cstring_array_get_phrase(context->normalized_admin, component_phrase_tokens, phrase); types.value = phrase.data; uint32_t component_phrase_types = types.components; @@ -798,7 +906,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (last_index >= (ssize_t)phrase.start - 1 || next_index <= (ssize_t)phrase.start + phrase.len - 1) { last_index = (ssize_t)phrase.start - 1; next_index = (ssize_t)phrase.start + phrase.len; - } if (component_phrase_string != NULL && component_phrase_types ^ ADDRESS_COMPONENT_POSTAL_CODE) { @@ -807,113 +914,185 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } if (component_phrase_types > 0) { - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_SUBURB, "suburb", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY, 
"city", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY_DISTRICT, "city_district", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_POSTAL_CODE, "postal_code", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string, prev2, prev); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string, prev2, prev); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_SUBURB, "suburb", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY, "city", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY_DISTRICT, "city_district", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_POSTAL_CODE, "postal_code", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string); + add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_WORLD_REGION, "world_region", component_phrase_string); } - if (most_common == ADDRESS_PARSER_CITY) { + if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) { feature_array_add(features, 2, "commonly city", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_STATE) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) { feature_array_add(features, 2, "commonly state", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_COUNTRY) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) { feature_array_add(features, 2, "commonly country", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_COUNTRY_REGION) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) { feature_array_add(features, 2, "commonly country_region", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_STATE_DISTRICT) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) { feature_array_add(features, 2, "commonly state_district", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_ISLAND) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_ISLAND) { feature_array_add(features, 2, "commonly island", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_SUBURB) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) { 
feature_array_add(features, 2, "commonly suburb", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_CITY_DISTRICT) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) { feature_array_add(features, 2, "commonly city_district", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_POSTAL_CODE) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_POSTAL_CODE) { feature_array_add(features, 2, "commonly postal_code", component_phrase_string); possible_postal_code = true; } } - int64_t geodb_phrase_index = geodb_phrase_memberships->a[i]; - - phrase = NULL_PHRASE; - geodb_value_t geo; - - // GeoDB phrases - if (component_phrase_index == NULL_PHRASE_MEMBERSHIP && geodb_phrase_index != NULL_PHRASE_MEMBERSHIP) { - phrase = geodb_phrases->a[geodb_phrase_index]; - - geo_phrase_string = get_phrase_string(tokenized, geodb_phrase_tokens, phrase); - geo.value = phrase.data; - uint32_t geodb_phrase_types = geo.components; - - if (last_index >= (ssize_t)phrase.start - 1 || next_index <= (ssize_t)phrase.start + phrase.len) { - last_index = (ssize_t)phrase.start - 1; - next_index = (ssize_t)phrase.start + phrase.len; - } - - if (geo_phrase_string != NULL && geodb_phrase_types ^ GEONAMES_ADDRESS_COMPONENT_POSTCODE) { - feature_array_add(features, 2, "phrase", geo_phrase_string); - add_word_feature = false; - } - - if (geodb_phrase_types ^ ADDRESS_ANY) { - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_LOCALITY, "gn city", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN1, "gn admin1", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN2, "gn admin2", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN3, "gn admin3", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN4, "gn admin4", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER, "gn admin other", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD, "gn neighborhood", geo_phrase_string, prev2, prev); - - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_COUNTRY, "gn country", geo_phrase_string, prev2, prev); - add_phrase_features(features, geodb_phrase_types, GEONAMES_ADDRESS_COMPONENT_POSTCODE, "gn postal code", geo_phrase_string, prev2, prev); - - } - - possible_postal_code = geodb_phrase_types & GEONAMES_ADDRESS_COMPONENT_POSTCODE; - - } - uint32_t word_freq = word_vocab_frequency(parser, word); + bool is_word = is_word_token(token.type); + bool is_unknown_word = false; + bool is_unknown = false; + + bool known_prefix = false; + bool known_suffix = false; + + size_t prefix_len = 0; + size_t suffix_len = 0; + + char *prefix = NULL; + char *suffix = NULL; if (add_word_feature) { // Bias unit, acts as an intercept feature_array_add(features, 1, "bias"); + phrase_t prefix_phrase = context->prefix_phrases->a[idx]; + phrase_t suffix_phrase = context->suffix_phrases->a[idx]; + + // Prefixes like hinter, etc. + if (prefix_phrase.len > 0) { + expansion_index = prefix_phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + + // Don't include elisions like l', d', etc. 
which are in the ADDRESS_ANY category + if (expansion_value->components ^ ADDRESS_ANY) { + known_prefix = true; + char_array_clear(phrase_tokens); + prefix_len = prefix_phrase.len; + char_array_add_len(phrase_tokens, word, prefix_len); + prefix = char_array_get_string(phrase_tokens); + log_debug("got prefix: %s\n", prefix); + feature_array_add(features, 2, "prefix", prefix); + } + } + + // Suffixes like straße, etc. + if (suffix_phrase.len > 0) { + expansion_index = suffix_phrase.data; + expansion_value = address_dictionary_get_expansions(expansion_index); + + if (expansion_value->components & ADDRESS_STREET) { + known_suffix = true; + char_array_clear(context->suffix_phrase); + suffix_len = suffix_phrase.len; + char_array_add_len(context->suffix_phrase, word + (token.len - suffix_phrase.len), suffix_len); + suffix = char_array_get_string(context->suffix_phrase); + log_debug("got suffix: %s\n", suffix); + feature_array_add(features, 2, "suffix", suffix); + } + } + + bool is_hyphenated = false; + + // For rare words and unknown words (so unknown words can benefit from statistics of known but super common words) + if (word_freq <= parser->options.rare_word_threshold && is_word) { + bool ngrams_added = false; + size_t hyphenated_word_offset = 0; + bool first_sub_token = true; + bool last_sub_token = true; + + ssize_t next_hyphen_index; + + do { + next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset); + char *sub_word = word; + size_t sub_word_len = word_len; + + if (next_hyphen_index >= 0) { + is_hyphenated = true; + char_array_clear(context->sub_token); + char_array_add_len(context->sub_token, word + hyphenated_word_offset, next_hyphen_index); + token_array_push(context->sub_tokens, (token_t){hyphenated_word_offset, next_hyphen_index, token.type}); + sub_word = char_array_get_string(context->sub_token); + sub_word_len = context->sub_token->n; + last_sub_token = false; + } else if (is_hyphenated) { + char_array_clear(context->sub_token); + char_array_add_len(context->sub_token, word + hyphenated_word_offset, word_len - hyphenated_word_offset); + sub_word = char_array_get_string(context->sub_token); + sub_word_len = context->sub_token->n; + last_sub_token = true; + } + + bool add_prefix = first_sub_token && prefix_len < sub_word_len; + bool add_suffix = last_sub_token && suffix_len < sub_word_len; + + if (is_hyphenated) { + uint32_t sub_word_freq = word_vocab_frequency(parser, sub_word); + if (sub_word_freq > 0) { + feature_array_add(features, 2, "sub_word", sub_word); + } + } + + // N-gram features from 3-6 characters + for (size_t ng = 3; ng <= 6; ng++) { + ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0); + } + + hyphenated_word_offset += next_hyphen_index + 1; + first_sub_token = false; + } while(next_hyphen_index >= 0); + + } + if (word_freq > 0) { // The individual word feature_array_add(features, 2, "word", word); } else { log_debug("word not in vocab: %s\n", word); + + is_unknown = true; word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? 
UNKNOWN_WORD : UNKNOWN_NUMERIC; + + if (is_word_token(token.type)) { + is_unknown_word = true; + } } + + if (idx == 0) { + //feature_array_add(features, 1, "prev tag=START"); + feature_array_add(features, 2, "idx-1 tag=START+word", word); + //feature_array_add(features, 3, "prev tag=START+word+next word", word, next_word); + } + } else if (component_phrase_string != NULL) { word = component_phrase_string; - } else if (geo_phrase_string != NULL) { - word = geo_phrase_string; } else if (phrase_string != NULL) { word = phrase_string; } - if (prev != NULL && last_index == i - 1) { + if (prev != NULL && last_index == idx - 1) { // Previous tag and current word - feature_array_add(features, 3, "i-1 tag+word", prev, word); - feature_array_add(features, 2, "i-1 tag", prev); + feature_array_add(features, 3, "prev tag+word", prev, word); + feature_array_add(features, 2, "prev tag", prev); if (prev2 != NULL) { // Previous two tags and current word - feature_array_add(features, 4, "i-2 tag+i-1 tag+word", prev2, prev, word); - feature_array_add(features, 3, "i-2 tag+i-1 tag", prev2, prev); + feature_array_add(features, 4, "prev2 tag+prev tag+word", prev2, prev, word); + feature_array_add(features, 3, "prev2 tag+prev tag", prev2, prev); } } @@ -930,14 +1109,14 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } // Previous word - feature_array_add(features, 2, "i-1 word", prev_word); + feature_array_add(features, 2, "prev word", prev_word); - if (last_index == i - 1) { - feature_array_add(features, 3, "i-1 tag+i-1 word", prev, prev_word); + if (last_index == idx - 1) { + feature_array_add(features, 3, "prev tag+prev word", prev, prev_word); } // Previous word and current word - feature_array_add(features, 3, "i-1 word+word", prev_word, word); + feature_array_add(features, 3, "prev word+word", prev_word, word); } size_t num_tokens = tokenized->tokens->n; @@ -958,21 +1137,24 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } // Next word e.g. 
if the current word is unknown and the next word is "street" - feature_array_add(features, 2, "i+1 word", next_word); + feature_array_add(features, 2, "next word", next_word); // Current word and next word - feature_array_add(features, 3, "word+i+1 word", word, next_word); + feature_array_add(features, 3, "word+next word", word, next_word); + + // Prev tag, current word and next word + //feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word); } #ifndef PRINT_ADDRESS_PARSER_FEATURES if (0) { #endif - uint32_t idx; + uint32_t fidx; char *feature; printf("{"); - cstring_array_foreach(features, idx, feature, { + cstring_array_foreach(features, fidx, feature, { printf(" %s, ", feature); }) printf("}\n"); @@ -1058,21 +1240,23 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c response = address_parser_response_new(); - if (most_common == ADDRESS_PARSER_CITY) { + if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) { label = strdup(ADDRESS_PARSER_LABEL_CITY); - } else if (most_common == ADDRESS_PARSER_STATE) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) { label = strdup(ADDRESS_PARSER_LABEL_STATE); - } else if (most_common == ADDRESS_PARSER_COUNTRY) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) { label = strdup(ADDRESS_PARSER_LABEL_COUNTRY); - } else if (most_common == ADDRESS_PARSER_COUNTRY_REGION) { - label = strdup(ADDRESS_PARSER_LABEL_COUNTRY_REGION); - } else if (most_common == ADDRESS_PARSER_STATE_DISTRICT) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) { label = strdup(ADDRESS_PARSER_LABEL_STATE_DISTRICT); - } else if (most_common == ADDRESS_PARSER_SUBURB) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) { + label = strdup(ADDRESS_PARSER_LABEL_COUNTRY_REGION); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) { label = strdup(ADDRESS_PARSER_LABEL_SUBURB); - } else if (most_common == ADDRESS_PARSER_CITY_DISTRICT) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) { label = strdup(ADDRESS_PARSER_LABEL_CITY_DISTRICT); - } else if (most_common == ADDRESS_PARSER_POSTAL_CODE) { + } else if (most_common == ADDRESS_PARSER_BOUNDARY_WORLD_REGION) { + label = strdup(ADDRESS_PARSER_LABEL_WORLD_REGION); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_POSTAL_CODE) { label = strdup(ADDRESS_PARSER_LABEL_POSTAL_CODE); } diff --git a/src/address_parser.h b/src/address_parser.h index a00b9a8a..1bbc67cf 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -135,30 +135,42 @@ typedef struct address_parser_context { char_array *phrase; char_array *context_phrase; char_array *long_context_phrase; + char_array *prefix_phrase; + char_array *context_prefix_phrase; + char_array *suffix_phrase; + char_array *context_suffix_phrase; char_array *component_phrase; char_array *context_component_phrase; char_array *long_context_component_phrase; - char_array *geodb_phrase; - char_array *context_geodb_phrase; - char_array *long_context_geodb_phrase; + // ngrams and prefix/suffix features + cstring_array *ngrams; // For hyphenated words char_array *sub_token; token_array *sub_tokens; // Strings/arrays relating to the sentence uint32_array *separators; cstring_array *normalized; + token_array *normalized_tokens; + cstring_array *normalized_admin; + token_array *normalized_admin_tokens; // Known phrases phrase_array *address_dictionary_phrases; int64_array *address_phrase_memberships; // Index in address_dictionary_phrases or -1 - phrase_array 
*geodb_phrases; - int64_array *geodb_phrase_memberships; // Index in gedob_phrases or -1 phrase_array *component_phrases; int64_array *component_phrase_memberships; // Index in component_phrases or -1 + phrase_array *prefix_phrases; + phrase_array *suffix_phrases; + // The tokenized string used to conveniently access both words as C strings and tokens by index tokenized_string_t *tokenized_str; } address_parser_context_t; +typedef struct parser_options { + uint64_t rare_word_threshold; +} parser_options_t; + // Can add other gazetteers as well typedef struct address_parser { + parser_options_t options; averaged_perceptron_t *model; trie_t *vocab; trie_t *phrase_types; @@ -167,6 +179,7 @@ typedef struct address_parser { // General usage address_parser_t *address_parser_new(void); +address_parser_t *address_parser_new_options(parser_options_t options); address_parser_t *get_address_parser(void); bool address_parser_load(char *dir);
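
The header changes above expose the rare-word cutoff as parser_options_t.rare_word_threshold (default 50 via DEFAULT_RARE_WORD_THRESHOLD) together with the new address_parser_new_options() constructor. A minimal usage sketch, assuming the existing address_parser_destroy() teardown and a purely illustrative threshold of 100:

#include "address_parser.h"

int main(void) {
    /* Words seen in training with frequency <= rare_word_threshold (but still
       above the unknown-word cutoff) receive the same n-gram features as
       unknown words, so rare and unknown words share statistical strength.
       100 here is hypothetical; the compiled-in default is 50. */
    parser_options_t options = {
        .rare_word_threshold = 100
    };

    address_parser_t *parser = address_parser_new_options(options);
    if (parser == NULL) return 1;

    /* ... load or train a model, which drives address_parser_features() ... */

    address_parser_destroy(parser);
    return 0;
}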
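
The n-gram features added for rare and unknown words use a leading/trailing underscore to mark grams that touch the beginning or end of a word, and the markers are suppressed when a known prefix or suffix has already been stripped. A standalone sketch of that convention, assuming single-byte characters; the real work happens in add_ngrams() (ngrams.h) writing into a cstring_array:

#include <stdio.h>
#include <string.h>

/* Illustrative only: prints the character n-grams of a word, prefixing "_"
 * when a gram starts the word and appending "_" when it ends the word,
 * mirroring the boundary convention described in the commit message. The
 * feature extractor does this for n = 3..6 on rare/unknown words. */
static void print_char_ngrams(const char *word, size_t n) {
    size_t len = strlen(word);
    if (n == 0 || n > len) return;

    for (size_t i = 0; i + n <= len; i++) {
        printf("%s%.*s%s\n",
               i == 0 ? "_" : "",          /* gram touches word start */
               (int)n, word + i,
               i + n == len ? "_" : "");   /* gram touches word end */
    }
}

int main(void) {
    for (size_t n = 3; n <= 6; n++) {
        print_char_ngrams("marktstrasse", n);
    }
    return 0;
}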
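
For hyphenated words below the rare-word threshold, the feature loop walks the token with string_next_hyphen_index() and emits each hyphen-separated piece (as a "sub_word" feature when that piece is in the vocabulary) before generating its n-grams. A simplified sketch of just the splitting step, using strchr instead of the libpostal helpers:

#include <stdio.h>
#include <string.h>

/* Sketch of the hyphen splitting only: the real code additionally checks
 * each sub-word's vocabulary frequency before adding the "sub_word" feature
 * and then produces 3- to 6-character n-grams for each piece. */
static void print_sub_words(const char *word) {
    const char *start = word;
    const char *hyphen;

    while ((hyphen = strchr(start, '-')) != NULL) {
        printf("sub_word=%.*s\n", (int)(hyphen - start), start);
        start = hyphen + 1;
    }
    if (*start != '\0') {
        printf("sub_word=%s\n", start);
    }
}

int main(void) {
    print_sub_words("saint-jean-de-luz");   /* -> saint, jean, de, luz */
    return 0;
}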
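
fill_phrase_memberships() factors out the token-to-phrase bookkeeping that was previously duplicated per phrase type: for every token index it records the index of the covering phrase, or NULL_PHRASE_MEMBERSHIP (-1 here) when no phrase covers it. A toy example with plain arrays standing in for phrase_array/int64_array, using a hypothetical six-token input such as "barboncino 781 franklin ave crown heights" with one dictionary phrase over tokens 2-3:

#include <stdio.h>

typedef struct { size_t start; size_t len; } toy_phrase_t;

int main(void) {
    /* One dictionary phrase covering tokens 2..3 ("franklin ave"). */
    toy_phrase_t phrases[] = {{2, 2}};
    size_t num_phrases = 1, num_tokens = 6;

    long long memberships[6];
    size_t i = 0;

    /* Same fill pattern as fill_phrase_memberships(): -1 before each phrase,
       the phrase index for every token the phrase spans, then -1 for any
       trailing tokens. */
    for (size_t j = 0; j < num_phrases; j++) {
        for (; i < phrases[j].start; i++) memberships[i] = -1;
        for (; i < phrases[j].start + phrases[j].len; i++) memberships[i] = (long long)j;
    }
    for (; i < num_tokens; i++) memberships[i] = -1;

    /* Prints: -1 -1 0 0 -1 -1 */
    for (i = 0; i < num_tokens; i++) printf("%lld ", memberships[i]);
    printf("\n");
    return 0;
}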