diff --git a/src/expand.c b/src/expand.c index 80b4250e..4670280c 100644 --- a/src/expand.c +++ b/src/expand.c @@ -44,6 +44,39 @@ inline uint64_t get_normalize_string_options(libpostal_normalize_options_t optio return normalize_string_options; } + +inline size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) { + size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); + + int32_t unichr = 0; + const uint8_t *ptr = (const uint8_t *)str; + + if (len_ordinal_suffix > 0) { + ssize_t start = 0; + size_t token_offset = token.offset; + size_t token_len = token.len; + + if (len_ordinal_suffix < token.len) { + start = token.offset + token.len - len_ordinal_suffix; + token_offset = token.offset; + token_len = token.len - len_ordinal_suffix; + } else { + start = prev_token.offset + prev_token.len; + token_offset = prev_token.offset; + token_len = prev_token.len; + } + ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); + if (prev_char_len <= 0) return 0; + if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str + token_offset, token_len)) { + return 0; + } + } else { + return 0; + } + + return len_ordinal_suffix; +} + void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { uint64_t normalize_token_options = get_normalize_token_options(options); @@ -82,6 +115,7 @@ void add_normalized_strings_token(cstring_array *strings, char *str, token_t tok } } else if (is_numeric_token(token.type)) { + normalize_token(strings, str, token, normalize_token_options); if (options.replace_word_hyphens || options.replace_numeric_hyphens) { @@ -105,9 +139,21 @@ void add_normalized_strings_token(cstring_array *strings, char *str, token_t tok } if (is_numeric_token(token.type) && options.split_alpha_from_numeric) { - normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + bool split_alpha_from_numeric = true; + + for (size_t i = 0; i < options.num_languages; i++) { + char *lang = options.languages[i]; + if (valid_ordinal_suffix_len(str, token, NULL_TOKEN, lang) > 1) { + split_alpha_from_numeric = false; + break; + } + } + + if (split_alpha_from_numeric) { + normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + } } } else { cstring_array_add_string(strings, " "); @@ -492,7 +538,7 @@ inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { case DICTIONARY_STAIRCASE: return LIBPOSTAL_ADDRESS_STAIRCASE; case DICTIONARY_STOPWORD: - return LIBPOSTAL_ADDRESS_ANY; + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; case DICTIONARY_STREET_TYPE: return LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_UNIT_NUMBERED: @@ -506,11 +552,31 @@ inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { } } + +inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE; + case DICTIONARY_STOPWORD: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; + case DICTIONARY_STREET_TYPE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_SYNONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) { switch (dictionary_id) { // Pre/post directionals can be removed if there are non-phrase tokens case DICTIONARY_DIRECTIONAL: return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_COMPANY_TYPE: + return LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_PLACE_NAME: + return LIBPOSTAL_ADDRESS_NAME; default: return LIBPOSTAL_ADDRESS_NONE; } @@ -538,12 +604,14 @@ inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) { switch (dictionary_id) { case DICTIONARY_ACADEMIC_DEGREE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_PERSONAL_TITLE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_NUMBER: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_PLACE_NAME: - return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + return LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_QUALIFIER: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_SYNONYM: @@ -559,7 +627,8 @@ typedef enum { GAZETTEER_MATCH_IGNORABLE, GAZETTEER_MATCH_EDGE_IGNORABLE, GAZETTEER_MATCH_POSSIBLE_ROOT, - GAZETTEER_MATCH_SPECIFIER + GAZETTEER_MATCH_SPECIFIER, + GAZETTEER_MATCH_VALID_COMPONENTS } gazetteer_match_type_t; @@ -580,6 +649,9 @@ inline bool address_expansion_matches_type_for_components(address_expansion_t ex case GAZETTEER_MATCH_SPECIFIER: components = gazetteer_specifier_components(dictionary_id); break; + case GAZETTEER_MATCH_VALID_COMPONENTS: + components = gazetteer_valid_components(dictionary_id); + break; default: break; } @@ -606,6 +678,11 @@ inline bool address_expansion_is_specifier_for_components(address_expansion_t ex return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER); } +inline bool address_expansion_is_valid_for_components(address_expansion_t expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); +} + + bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) { uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); @@ -642,6 +719,11 @@ inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER); } +inline bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); +} + + bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); if (value == NULL) return false; @@ -751,6 +833,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool have_non_phrase_tokens = false; bool have_canonical_phrases = false; bool have_ambiguous = false; + bool have_possible_root = false; bool have_strictly_ignorable = false; bool have_strictly_ignorable_abbreviation = false; @@ -796,6 +879,8 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("have_strictly_ignorable=%zu, phrase_is_canonical=%zu\n", have_strictly_ignorable, phrase_is_canonical); } + have_possible_root = have_possible_root | address_phrase_is_possible_root_for_components(phrase, options.address_components); + have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous); have_ambiguous = have_ambiguous || phrase_is_ambiguous; @@ -875,7 +960,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - bool expansion_valid_components = value->components & options.address_components; + bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(phrase, options.address_components); if (expansion_valid_components) { key->n = namespace_len; @@ -884,7 +969,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal if (token.type != WHITESPACE) { char_array_cat_len(key, str + token.offset, token.len); last_added_was_whitespace = false; - } else { + } else if (!last_added_was_whitespace) { char_array_cat(key, " "); last_added_was_whitespace = true; } @@ -902,6 +987,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components); bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase); + bool current_phrase_have_possible_root = delete_phrases && address_phrase_is_possible_root_for_components(phrase, options.address_components); log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier); @@ -950,9 +1036,9 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } } else { - // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", this is probably a legit token instead of a pre-directional - skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase); - log_debug("phrase is possible root = %d\n", skip_edge_phrase); + // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", the first token is probably a legit token instead of a pre-directional + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !((address_phrase_has_canonical_interpretation(other_phrase) || address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components)) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + log_debug("phrase is possible root. skip_edge_phrase = %d\n", skip_edge_phrase); } break; } @@ -1019,7 +1105,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool is_ignorable = address_expansion_is_ignorable_for_components(expansion, options.address_components); bool is_canonical = expansion.canonical_index == NULL_CANONICAL_INDEX; - log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable); + log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d, current_phrase_have_possible_root=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable, current_phrase_have_possible_root); current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; @@ -1034,22 +1120,22 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); current_phrase_ignorable = skip_edge_phrase; // Don't delete "PH" in "PH 1" for unit expansions - } else if (is_ignorable && have_non_phrase_tokens && current_phrase_have_specifier) { + } else if (is_ignorable && current_phrase_have_specifier) { log_debug("current_phrase_have_specifier\n"); current_phrase_ignorable = false; // Delete "Avenue" in "5th Avenue" } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) { log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n"); - current_phrase_ignorable = have_non_phrase_tokens || string_tree_num_tokens(tree) > 0; + current_phrase_ignorable = have_non_phrase_tokens || string_tree_num_tokens(tree) > 0 || (have_possible_root && !current_phrase_have_possible_root); log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S" } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) { log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n"); - current_phrase_ignorable = have_non_phrase_tokens || have_canonical_phrases || have_ambiguous; + current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0; log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); - } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases)) { + } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases || have_possible_root)) { log_debug("have_non_phrase_tokens = %d, have_canonical_phrases = %d\n", have_non_phrase_tokens, have_canonical_phrases); - current_phrase_ignorable = is_ignorable || (current_phrase_have_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); + current_phrase_ignorable = (is_ignorable && !(have_possible_root && !current_phrase_have_possible_root)) || (current_phrase_have_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens\n"); log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); @@ -1075,7 +1161,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal current_phrase_expandable = !current_phrase_ignorable; } - log_debug("expand_phrases = %d\n", expand_phrases); + log_debug("current_phrase_expandable = %d\n", current_phrase_expandable); log_debug("expansion.canonical_index = %d\n", expansion.canonical_index); @@ -1213,7 +1299,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } if (token.type != WHITESPACE) { - if (j > 0 && last_was_punctuation && !last_added_was_whitespace) { + if (j > 0 && last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding another space\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); @@ -1280,33 +1366,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { - size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); - - int32_t unichr = 0; - const uint8_t *ptr = (const uint8_t *)str; - - if (len_ordinal_suffix > 0) { - ssize_t start = 0; - size_t token_offset = token.offset; - size_t token_len = token.len; - - if (len_ordinal_suffix < token.len) { - start = token.offset + token.len - len_ordinal_suffix; - token_offset = token.offset; - token_len = token.len - len_ordinal_suffix; - } else { - start = prev_token.offset + prev_token.len; - token_offset = prev_token.offset; - token_len = prev_token.len; - } - ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); - if (prev_char_len <= 0) return false; - if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str + token_offset, token_len)) { - return false; - } - } else { - return false; - } + size_t len_ordinal_suffix = valid_ordinal_suffix_len(str, token, prev_token, lang); cstring_array *strings = tree->strings; // Add the original form first. When this function returns true, @@ -1440,15 +1500,17 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * continue; } + char *dupe_token = strndup(token + left_spaces, token_len - left_spaces - right_spaces); + log_debug("full string=%s\n", token); - khiter_t k = kh_get(str_set, unique_strings, token); + khiter_t k = kh_get(str_set, unique_strings, dupe_token); if (k == kh_end(unique_strings)) { - char *dupe_token = strndup(str + left_spaces, len - left_spaces - right_spaces); - log_debug("doing postprocessing\n"); - add_postprocessed_string(strings, token, options); + add_postprocessed_string(strings, dupe_token, options); k = kh_put(str_set, unique_strings, dupe_token, &ret); + } else { + free(dupe_token); } log_debug("iter->remaining = %d\n", iter->remaining); @@ -1476,7 +1538,7 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * -char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { +cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { options.address_components |= LIBPOSTAL_ADDRESS_ANY; uint64_t normalize_string_options = get_normalize_string_options(options); @@ -1551,15 +1613,15 @@ char **expand_address_phrase_option(char *input, libpostal_normalize_options_t o *n = cstring_array_num_strings(strings); - return cstring_array_to_strings(strings); + return strings; } -char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { +cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { return expand_address_phrase_option(input, options, n, EXPAND_PHRASES); } -char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { +cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { return expand_address_phrase_option(input, options, n, DELETE_PHRASES); } diff --git a/src/expand.h b/src/expand.h index 0f961f81..2857f402 100644 --- a/src/expand.h +++ b/src/expand.h @@ -48,9 +48,9 @@ typedef enum { DELETE_PHRASES } expansion_phrase_option_t; -char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n); -char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); -char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); +cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n); +cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); +cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); void expansion_array_destroy(char **expansions, size_t n); #endif diff --git a/src/tokens.h b/src/tokens.h index 8823a628..bf61f5bc 100644 --- a/src/tokens.h +++ b/src/tokens.h @@ -14,6 +14,8 @@ typedef libpostal_token_t token_t; +#define NULL_TOKEN (token_t){0, 0, END} + VECTOR_INIT(token_array, token_t) typedef struct tokenized_string {