From 3610ffaa0508d1d726efbf29c9997d55647ce35c Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 16 Feb 2019 22:20:20 -0500 Subject: [PATCH] [expand/dedupe] expansion with multiple languages (or multiple predicted languages) can sometimes produce weird string trees and thus either too many results or incorrect results, particularly for root expansions which we depend on for matching/deduping. Making one call per language identified. This may slightly affect performance on languages that are highly ambiguous (even that's doubtful, as libpostal usually identifies one or two languages with high accuracy and/or people are using a known geography) but should improve the results and was simpler implementation-wise than trying to use the single string tree for multiple languages where, say, a two-word phrase in one language might simply be token-space-token in another. --- src/expand.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/expand.c b/src/expand.c index 77c623cc..898c17d1 100644 --- a/src/expand.c +++ b/src/expand.c @@ -878,6 +878,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("have_ambiguous = %d\n", have_ambiguous); log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable); log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation); + } bool skipped_last_edge_phrase = false; @@ -913,7 +914,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } if (token.type != WHITESPACE) { - if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) ) { + if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) || (prev_phrase.start == phrase.start && prev_phrase.len == phrase.len) ) { log_debug("Adding space\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); 
@@ -1536,6 +1537,29 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * +void expand_alternative_phrase_option_languages(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { + char **temp_languages = calloc(1, sizeof(char *)); + libpostal_normalize_options_t temp_options = options; + + for (size_t i = 0; i < options.num_languages; i++) { + char *lang = options.languages[i]; + + temp_languages[0] = lang; + temp_options.languages = temp_languages; + temp_options.num_languages = 1; + expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option); + } + + if (options.num_languages == 0) { + temp_options.languages = options.languages; + temp_options.num_languages = options.num_languages; + expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option); + } + + free(temp_languages); +} + + cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { options.address_components |= LIBPOSTAL_ADDRESS_ANY; @@ -1566,7 +1590,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt if (string_tree_num_strings(tree) == 1) { char *normalized = string_tree_get_alternative(tree, 0, 0); - expand_alternative_phrase_option(strings, unique_strings, normalized, options, phrase_option); + expand_alternative_phrase_option_languages(strings, unique_strings, normalized, options, phrase_option); } else { log_debug("Adding alternatives for multiple normalizations\n"); @@ -1587,7 +1611,7 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt char_array_terminate(temp_string); token = char_array_get_string(temp_string); log_debug("current permutation = %s\n", token); - expand_alternative_phrase_option(strings, unique_strings, token, options, phrase_option); + 
expand_alternative_phrase_option_languages(strings, unique_strings, token, options, phrase_option); } string_tree_iterator_destroy(iter); @@ -1612,7 +1636,6 @@ cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_opt *n = cstring_array_num_strings(strings); return strings; - } cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {