From 3f7abd5b24f965ebeed7eec143f3ccacd58a525b Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 15:48:07 -0500 Subject: [PATCH] [expand] adding a method that allows hash/equality comparisons of addresses like "100 Main" with "100 S Main St." or units like "Apt 101" vs. "#101". Instead of expanding the phrase abbreviations, this version tries its best to delete all but the root words in a string for a specific component. It's probably not perfect, but does handle a number of edge cases related to pre/post directionals in English, e.g. "E St" will have a root word of simply "E", "Avenue E" => "E", etc. Also handles a variety of cases where the phrase could be a thoroughfare type but is really a root word, such as "Park Pl" or the famous "Avenue Rd". This can be used for near-dupe hashing to catch possible dupes for later analysis. Note that it will normalize "St Marks Pl" and "St Marks Ave" to the same thing, which is sometimes warranted (if the user typed the wrong thoroughfare), but can also be reconciled at deduping time. 
--- src/expand.c | 584 ++++++++++++++++++++++++++++++++++++++++++++------- src/expand.h | 12 +- 2 files changed, 518 insertions(+), 78 deletions(-) diff --git a/src/expand.c b/src/expand.c index 709c35ac..fc9cf572 100644 --- a/src/expand.c +++ b/src/expand.c @@ -449,31 +449,235 @@ bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, } -string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options) { +inline uint32_t gazetter_ignorable_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_ACADEMIC_DEGREE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_BUILDING_TYPE: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_UNIT; + case DICTIONARY_COMPANY_TYPE: + return LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_ELISION: + return LIBPOSTAL_ADDRESS_ANY; + case DICTIONARY_ENTRANCE: + return LIBPOSTAL_ADDRESS_ENTRANCE; + case DICTIONARY_HOUSE_NUMBER: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER; + case DICTIONARY_LEVEL_NUMBERED: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_STANDALONE: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_MEZZANINE: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_BASEMENT: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_SUB_BASEMENT: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_NUMBER: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_NO_NUMBER: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_HOUSE_NUMBER; + case DICTIONARY_PERSONAL_TITLE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_PLACE_NAME: + return 
LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_POST_OFFICE: + return LIBPOSTAL_ADDRESS_PO_BOX; + case DICTIONARY_POSTAL_CODE: + return LIBPOSTAL_ADDRESS_POSTAL_CODE; + case DICTIONARY_QUALIFIER: + return LIBPOSTAL_ADDRESS_TOPONYM; + case DICTIONARY_STAIRCASE: + return LIBPOSTAL_ADDRESS_STAIRCASE; + case DICTIONARY_STOPWORD: + return LIBPOSTAL_ADDRESS_ANY; + case DICTIONARY_STREET_TYPE: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_UNIT_NUMBERED: + return LIBPOSTAL_ADDRESS_UNIT; + case DICTIONARY_UNIT_STANDALONE: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_UNIT; + case DICTIONARY_UNIT_DIRECTION: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_UNIT; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +inline uint32_t gazetter_edge_ignorable_components(uint16_t dictionary_id) { + switch (dictionary_id) { + // Pre/post directionals can be removed if there are non-phrase tokens + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +inline uint32_t gazetter_possible_root_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_ACADEMIC_DEGREE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_PERSONAL_TITLE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_NUMBER: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_PLACE_NAME: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_QUALIFIER: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_SYNONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_TOPONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +inline bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { + for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { + uint16_t 
dictionary_id = expansion.dictionary_ids[j]; + if (gazetter_ignorable_components(dictionary_id) & address_components) { + return true; + } + } + return false; +} + +inline bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { + for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { + uint16_t dictionary_id = expansion.dictionary_ids[j]; + if (gazetter_edge_ignorable_components(dictionary_id) & address_components) { + return true; + } + } + return false; +} + +inline bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) { + for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { + uint16_t dictionary_id = expansion.dictionary_ids[j]; + if (gazetter_possible_root_components(dictionary_id) & address_components) { + return true; + } + } + return false; +} + +bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions->a[i]; + + if (address_expansion_is_ignorable_for_components(expansion, address_components)) { + return true; + } + } + return false; +} + + +bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions->a[i]; + + if 
(address_expansion_is_edge_ignorable_for_components(expansion, address_components)) { + return true; + } + } + return false; +} + + +bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions->a[i]; + + if (address_expansion_is_possible_root_for_components(expansion, address_components)) { + return true; + } + } + return false; +} + + + +bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { + address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + address_expansion_t *expansions_array = expansions->a; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions_array[i]; + if (!address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) { + return true; + } + } + return false; +} + + + + +// Delete non-canonical phrases only + +string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { char_array *key = NULL; log_debug("input=%s\n", str); - token_array *tokens = tokenize_keep_whitespace(str); + token_array *token_array = tokenize_keep_whitespace(str); - if (tokens == NULL) { + if (token_array == NULL) { return NULL; } size_t len = strlen(str); - log_debug("tokenized, num tokens=%zu\n", tokens->n); + token_t *tokens = token_array->a; + size_t num_tokens = token_array->n; + + log_debug("tokenized, num tokens=%zu\n", num_tokens); bool last_was_punctuation = 
false; phrase_language_array *phrases = NULL; phrase_array *lang_phrases = NULL; - for (size_t i = 0; i < options.num_languages; i++) { char *lang = options.languages[i]; log_debug("lang=%s\n", lang); - lang_phrases = search_address_dictionaries_tokens(str, tokens, lang); + lang_phrases = search_address_dictionaries_tokens(str, token_array, lang); if (lang_phrases == NULL) { log_debug("lang_phrases NULL\n"); @@ -494,7 +698,7 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } - lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES); + lang_phrases = search_address_dictionaries_tokens(str, token_array, ALL_LANGUAGES); if (lang_phrases != NULL) { phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); @@ -526,6 +730,79 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN); + log_debug("phrase_option = %d\n", phrase_option); + + bool delete_phrases = phrase_option == DELETE_PHRASES; + bool expand_phrases = phrase_option == EXPAND_PHRASES; + + size_t num_phrases = phrases->n; + + bool have_non_phrase_tokens = false; + bool have_canonical_phrases = false; + bool have_ambiguous = false; + bool have_strictly_ignorable = false; + bool have_strictly_ignorable_abbreviation = false; + + size_t prev_phrase_end = 0; + + if (delete_phrases) { + for (size_t i = 0; i < num_phrases; i++) { + phrase_lang = phrases->a[i]; + phrase = phrase_lang.phrase; + + log_debug("phrase.start = %zu, prev_phrase_end = %zu\n", phrase.start, prev_phrase_end); + + token_t inter_token; + if (phrase.start > prev_phrase_end) { + for (size_t j = prev_phrase_end; j < phrase.start; j++) { + inter_token = tokens[j]; + if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) { + log_debug("have_non_phrase_tokens\n"); + have_non_phrase_tokens = true; + break; + } + } + } + + if (i == num_phrases - 1 && phrase.start 
+ phrase.len < num_tokens) { + for (size_t j = phrase.start + phrase.len; j < num_tokens; j++) { + inter_token = tokens[j]; + if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) { + have_non_phrase_tokens = true; + break; + } + } + } + + bool phrase_is_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool phrase_is_strictly_ignorable = address_phrase_is_ignorable_for_components(phrase, options.address_components) && !phrase_is_ambiguous; + bool phrase_is_canonical = address_phrase_has_canonical_interpretation(phrase); + + have_non_phrase_tokens = have_non_phrase_tokens || (!phrase_is_strictly_ignorable && !phrase_is_ambiguous); + have_strictly_ignorable = have_strictly_ignorable || phrase_is_strictly_ignorable; + have_strictly_ignorable_abbreviation = have_strictly_ignorable_abbreviation || (phrase_is_strictly_ignorable && !phrase_is_canonical); + if (have_strictly_ignorable_abbreviation) { + log_debug("have_strictly_ignorable=%zu, phrase_is_canonical=%zu\n", have_strictly_ignorable, phrase_is_canonical); + } + + have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous); + have_ambiguous = have_ambiguous || phrase_is_ambiguous; + + if (have_non_phrase_tokens) { + break; + } + + prev_phrase_end = phrase.start + phrase.len; + } + + + log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens); + log_debug("have_canonical_phrases = %d\n", have_canonical_phrases); + log_debug("have_ambiguous = %d\n", have_ambiguous); + log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable); + log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation); + } + for (size_t i = 0; i < phrases->n; i++) { phrase_lang = phrases->a[i]; @@ -550,54 +827,47 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t log_debug("start=%zu, end=%zu\n", start, end); for (size_t j = start; j < end; j++) { log_debug("Adding token 
%zu\n", j); - token_t token = tokens->a[j]; + token_t token = tokens[j]; if (is_punctuation(token.type)) { last_was_punctuation = true; continue; } if (token.type != WHITESPACE) { - if (phrase.start > 0 && last_was_punctuation && !last_added_was_whitespace) { + if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) ) { + log_debug("Adding space\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); } log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + string_tree_finalize_token(tree); last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { + } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_strings(tree) > 0 ) { log_debug("Adding pre-phrase whitespace\n"); last_added_was_whitespace = true; string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); } else { continue; } last_was_punctuation = false; - string_tree_finalize_token(tree); } - if (phrase.start > 0 && start < end) { - token_t prev_token = tokens->a[phrase.start - 1]; - log_debug("last_added_was_whitespace=%d\n", last_added_was_whitespace); - if (!last_added_was_whitespace && phrase.start - 1 > 0 && (!is_ideographic(prev_token.type) || last_was_punctuation)) { - log_debug("Adding space III\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - string_tree_finalize_token(tree); - } - } + size_t added_expansions = 0; + token_t token; uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - token_t token; + bool expansion_valid_components = value->components & options.address_components; - size_t added_expansions = 0; - if ((value->components & options.address_components) > 0) { + if (expansion_valid_components) { key->n = namespace_len; for (size_t j = phrase.start; 
j < phrase.start + phrase.len; j++) { - token = tokens->a[j]; + token = tokens[j]; if (token.type != WHITESPACE) { char_array_cat_len(key, str + token.offset, token.len); last_added_was_whitespace = false; @@ -612,22 +882,175 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t address_expansion_array *expansions = value->expansions; if (expansions != NULL) { - for (size_t j = 0; j < expansions->n; j++) { - address_expansion_t expansion = expansions->a[j]; + bool current_phrase_have_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool added_pre_phrase_space = false; + bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components); + bool current_phrase_have_edge_ignorable = false; - if ((expansion.address_components & options.address_components) == 0 && !address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) { + bool current_phrase_have_unambiguous = address_phrase_contains_unambiguous_expansion(phrase); + + /* + Edge phrase handling. This is primarily for handling pre-directionals/post-directionals + in English and other languages. 
+ */ + bool skip_edge_phrase = false; + bool other_phrase_have_edge_ignorable = false; + + if (delete_phrases) { + phrase_language_t other_phrase_lang; + phrase_t other_phrase; + + log_debug("i = %zu, phrase.start = %u\n", i, phrase.start); + if (i == 0 && phrase.start == 0 && phrase.start + phrase.len < num_tokens) { + current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components); + // Delete "E" in "E 125th St" + if (current_phrase_have_edge_ignorable) { + log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len); + skip_edge_phrase = true; + } + + if (!skip_edge_phrase || !have_non_phrase_tokens) { + for (size_t other_i = i + 1; other_i < phrases->n; other_i++) { + other_phrase_lang = phrases->a[other_i]; + other_phrase = other_phrase_lang.phrase; + log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len); + log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language); + if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) { + if (other_phrase.start + other_phrase.len == num_tokens) { + skip_edge_phrase = false; + if (current_phrase_have_edge_ignorable) { + // don't delete the "E" in "E St" + log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n"); + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + } else { + log_debug("initial phrase is not edge-ignorable out of two phrases. 
Checking next phrase is edge ignorable.\n"); + // delete "Avenue" in "Avenue E" + other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); + skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + + } + } else { + // If we encounter an ignorable phrase + skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase); + log_debug("phrase is possible root = %d\n", skip_edge_phrase); + } + break; + } + } + } + } else if (phrases->n > 1 && i == phrases->n - 1 && phrase.start + phrase.len == num_tokens && phrase.start > 0) { + current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components); + if (current_phrase_have_edge_ignorable) { + log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len); + skip_edge_phrase = true; + } + + log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens); + if (!skip_edge_phrase || !have_non_phrase_tokens) { + for (ssize_t other_j = i - 1; other_j >= 0; other_j--) { + other_phrase_lang = phrases->a[other_j]; + other_phrase = other_phrase_lang.phrase; + log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len); + log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language); + if (other_phrase.start + other_phrase.len <= phrase.start && string_equals(other_phrase_lang.language, phrase_lang.language)) { + if (other_phrase.start == 0) { + //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && 
!address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components); + skip_edge_phrase = false; + if (current_phrase_have_edge_ignorable) { + // don't delete the "E" in "Avenue E" + log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n"); + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + //skip_edge_phrase = !other_phrase_invalid; + } else { + log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n"); + // delete "St" in "E St" + other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); + skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); + } + } + break; + } + } + } + } + } + + for (size_t j = 0; j < expansions->n; j++) { + if (skip_edge_phrase) { + log_debug("skip edge phrase\n"); continue; } - if (expansion.canonical_index != NULL_CANONICAL_INDEX) { + address_expansion_t expansion = expansions->a[j]; + + bool current_phrase_ignorable = false; + bool current_phrase_expandable = expand_phrases && expansion.canonical_index != NULL_CANONICAL_INDEX; + + bool is_ambiguous = address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION); + + if (delete_phrases) { + bool is_ignorable = address_expansion_is_ignorable_for_components(expansion, 
options.address_components); + bool is_canonical = expansion.canonical_index == NULL_CANONICAL_INDEX; + + log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable); + + current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; + + // Edge phrase calculations from above + if (current_phrase_have_edge_ignorable || other_phrase_have_edge_ignorable) { + log_debug("current_phrase_have_edge_ignorable\n"); + log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); + current_phrase_ignorable = skip_edge_phrase; + // Delete "Avenue" in "5th Avenue" + } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) { + log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n"); + current_phrase_ignorable = have_non_phrase_tokens || string_tree_num_tokens(tree) > 0; + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S" + } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) { + log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n"); + current_phrase_ignorable = have_non_phrase_tokens || have_canonical_phrases || have_ambiguous; + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases)) { + log_debug("have_non_phrase_tokens = %d, have_canonical_phrases = %d\n", have_non_phrase_tokens, have_canonical_phrases); + current_phrase_ignorable = is_ignorable || (is_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); + + 
log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens\n"); + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + } + + if (!current_phrase_ignorable && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !added_pre_phrase_space) { + log_debug("Adding space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + last_added_was_whitespace = true; + added_pre_phrase_space = true; + } + + } + + if (current_phrase_ignorable) { + continue; + } + + if (delete_phrases) { + current_phrase_expandable = !current_phrase_ignorable; + } + + log_debug("expand_phrases = %d\n", expand_phrases); + + log_debug("expansion.canonical_index = %d\n", expansion.canonical_index); + + if (expansion.canonical_index != NULL_CANONICAL_INDEX && current_phrase_expandable) { + log_debug("expansion.canonical_index != NULL_CANONICAL_INDEX, delete_phrases = %d, phrase_option = %d\n", delete_phrases, phrase_option); char *canonical = address_dictionary_get_canonical(expansion.canonical_index); char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); canonical = canonical_normalized != NULL ? 
canonical_normalized : canonical; - - if (phrase.start + phrase.len < tokens->n - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len]; + if (phrase.start + phrase.len < num_tokens - 1) { + token_t next_token = tokens[phrase.start + phrase.len]; if (!is_numeric_token(next_token.type)) { log_debug("non-canonical phrase, adding canonical string\n"); string_tree_add_string(tree, canonical); @@ -643,18 +1066,17 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } else { string_tree_add_string(tree, canonical); last_added_was_whitespace = false; - } if (canonical_normalized != NULL) { free(canonical_normalized); } - } else { + } else if (expansion.canonical_index == NULL_CANONICAL_INDEX || !current_phrase_expandable) { log_debug("canonical phrase, adding canonical string\n"); uint32_t start_index = cstring_array_start_token(tree->strings); for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) { - token = tokens->a[k]; + token = tokens[k]; if (token.type != WHITESPACE) { cstring_array_append_string_len(tree->strings, str + token.offset, token.len); last_added_was_whitespace = false; @@ -665,19 +1087,30 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } } cstring_array_terminate(tree->strings); + } else { + continue; } added_expansions++; } - } } - if (added_expansions == 0) { + log_debug("expansion_valid_components == %d\n", expansion_valid_components); + + if (added_expansions == 0 && (!delete_phrases || !expansion_valid_components)) { + if (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) { + log_debug("Adding space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + last_added_was_whitespace = true; + } + uint32_t start_index = cstring_array_start_token(tree->strings); + for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { - token = tokens->a[j]; + token = tokens[j]; if (token.type != WHITESPACE) { log_debug("Adding 
canonical token, %.*s\n", (int)token.len, str + token.offset); @@ -691,31 +1124,25 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } - if (phrase.start + phrase.len < tokens->n - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len + 1]; - if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - } - } - cstring_array_terminate(tree->strings); } - log_debug("i=%zu\n", i); - bool end_of_phrase = false; - if (i < phrases->n - 1) { - phrase_t next_phrase = phrases->a[i + 1].phrase; - end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len); - } else { - end_of_phrase = true; - } + if (!delete_phrases || !expansion_valid_components || added_expansions > 0) { + log_debug("i=%zu\n", i); + bool end_of_phrase = false; + if (i < phrases->n - 1) { + phrase_t next_phrase = phrases->a[i + 1].phrase; + end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len); + } else { + end_of_phrase = true; + } - log_debug("end_of_phrase=%d\n", end_of_phrase); - if (end_of_phrase) { - log_debug("finalize at i=%zu\n", i); - string_tree_finalize_token(tree); + log_debug("end_of_phrase=%d\n", end_of_phrase); + if (end_of_phrase) { + log_debug("finalize at i=%zu\n", i); + string_tree_finalize_token(tree); + } } start = phrase.start + phrase.len; @@ -725,11 +1152,11 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t char_array_destroy(key); - end = (int)tokens->n; + end = (int)num_tokens; - if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len]; - if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { + if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1 && !last_added_was_whitespace) 
{ + token_t next_token = tokens[phrase.start + phrase.len]; + if (next_token.type != WHITESPACE && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !is_ideographic(next_token.type)) { log_debug("space after phrase\n"); string_tree_add_string(tree, " "); last_added_was_whitespace = true; @@ -740,7 +1167,7 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t for (size_t j = start; j < end; j++) { log_debug("On token %zu\n", j); - token_t token = tokens->a[j]; + token_t token = tokens[j]; if (is_punctuation(token.type)) { log_debug("last_was_punctuation\n"); last_was_punctuation = true; @@ -757,7 +1184,7 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { + } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding space IV\n"); string_tree_add_string(tree, " "); last_added_was_whitespace = true; @@ -773,10 +1200,10 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } else { - - for (size_t j = 0; j < tokens->n; j++) { + log_debug("phrases NULL\n"); + for (size_t j = 0; j < num_tokens; j++) { log_debug("On token %zu\n", j); - token_t token = tokens->a[j]; + token_t token = tokens[j]; if (is_punctuation(token.type)) { log_debug("punctuation, skipping\n"); last_was_punctuation = true; @@ -809,12 +1236,11 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t phrase_language_array_destroy(phrases); } - token_array_destroy(tokens); + token_array_destroy(token_array); return tree; } - inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); @@ -895,7 
+1321,7 @@ inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, tok } -void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options) { +void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { size_t len = strlen(str); token_array *tokens = tokenize_keep_whitespace(str); string_tree_t *token_tree = string_tree_new_size(len); @@ -939,7 +1365,7 @@ void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings int ret; log_debug("Adding alternatives for single normalization\n"); - alternatives = add_string_alternatives(tokenized_str, options); + alternatives = add_string_alternatives_phrase_option(tokenized_str, options, phrase_option); log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives)); @@ -998,7 +1424,7 @@ void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings -char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { +char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { options.address_components |= LIBPOSTAL_ADDRESS_ANY; uint64_t normalize_string_options = get_normalize_string_options(options); @@ -1028,7 +1454,7 @@ char **expand_address(char *input, libpostal_normalize_options_t options, size_t if (string_tree_num_strings(tree) == 1) { char *normalized = string_tree_get_alternative(tree, 0, 0); - expand_alternative(strings, unique_strings, normalized, options); + expand_alternative_phrase_option(strings, unique_strings, normalized, options, phrase_option); } else { log_debug("Adding alternatives for multiple normalizations\n"); @@ -1049,7 +1475,7 @@ char **expand_address(char *input, libpostal_normalize_options_t options, size_t 
char_array_terminate(temp_string); token = char_array_get_string(temp_string); log_debug("current permutation = %s\n", token); - expand_alternative(strings, unique_strings, token, options); + expand_alternative_phrase_option(strings, unique_strings, token, options, phrase_option); } string_tree_iterator_destroy(iter); @@ -1077,6 +1503,16 @@ char **expand_address(char *input, libpostal_normalize_options_t options, size_t } +char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_phrase_option(input, options, n, EXPAND_PHRASES); +} + +char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_phrase_option(input, options, n, DELETE_PHRASES); +} + + + void expansion_array_destroy(char **expansions, size_t n) { for (size_t i = 0; i < n; i++) { free(expansions[i]); diff --git a/src/expand.h b/src/expand.h index 0e24cae4..0f961f81 100644 --- a/src/expand.h +++ b/src/expand.h @@ -38,15 +38,19 @@ bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, l bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options); -string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options); - bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options); void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options); -void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options); +typedef enum { + EXPAND_PHRASES, + KEEP_PHRASES, + DELETE_PHRASES +} expansion_phrase_option_t; + char **expand_address(char *input, 
libpostal_normalize_options_t options, size_t *n); +char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); void expansion_array_destroy(char **expansions, size_t n); -#endif \ No newline at end of file +#endif