[expand] in cases like "Avenue D" where there are two phrases, one is ambiguous (and canonical) but not necessarily edge-ignorable (pre/post-directional), allow deletion of the other token (so "Avenue" in this case). Also allows skipping in cases where the language classifier may predict a second language with some small probability, such as French for a short string like "Avenue D" (in addition to English). If the token was ignorable in the highest probability language, ignore it in both.
This commit is contained in:
36
src/expand.c
36
src/expand.c
@@ -803,6 +803,8 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
|
|||||||
log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);
|
log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool skipped_last_edge_phrase = false;
|
||||||
|
|
||||||
for (size_t i = 0; i < phrases->n; i++) {
|
for (size_t i = 0; i < phrases->n; i++) {
|
||||||
phrase_lang = phrases->a[i];
|
phrase_lang = phrases->a[i];
|
||||||
|
|
||||||
@@ -882,19 +884,21 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
|
|||||||
address_expansion_array *expansions = value->expansions;
|
address_expansion_array *expansions = value->expansions;
|
||||||
|
|
||||||
if (expansions != NULL) {
|
if (expansions != NULL) {
|
||||||
bool current_phrase_have_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
|
bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
|
||||||
bool added_pre_phrase_space = false;
|
bool added_pre_phrase_space = false;
|
||||||
bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components);
|
bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components);
|
||||||
bool current_phrase_have_edge_ignorable = false;
|
bool current_phrase_have_edge_ignorable = false;
|
||||||
|
|
||||||
bool current_phrase_have_unambiguous = address_phrase_contains_unambiguous_expansion(phrase);
|
bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase);
|
||||||
|
|
||||||
|
bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Edge phrase handling. This is primarily for handling pre-directionals/post-directionals
|
Edge phrase handling. This is primarily for handling pre-directionals/post-directionals
|
||||||
in English and other languages.
|
in English and other languages.
|
||||||
*/
|
*/
|
||||||
bool skip_edge_phrase = false;
|
bool skip_edge_phrase = false;
|
||||||
bool other_phrase_have_edge_ignorable = false;
|
bool other_phrase_is_ignorable = false;
|
||||||
|
|
||||||
if (delete_phrases) {
|
if (delete_phrases) {
|
||||||
phrase_language_t other_phrase_lang;
|
phrase_language_t other_phrase_lang;
|
||||||
@@ -918,19 +922,21 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
|
|||||||
if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) {
|
if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) {
|
||||||
if (other_phrase.start + other_phrase.len == num_tokens) {
|
if (other_phrase.start + other_phrase.len == num_tokens) {
|
||||||
skip_edge_phrase = false;
|
skip_edge_phrase = false;
|
||||||
if (current_phrase_have_edge_ignorable) {
|
if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
|
||||||
// don't delete the "E" in "E St"
|
// don't delete the "E" in "E St"
|
||||||
log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n");
|
log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n");
|
||||||
|
|
||||||
skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
|
skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
|
||||||
|
log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
|
||||||
} else {
|
} else {
|
||||||
log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n");
|
log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n");
|
||||||
// delete "Avenue" in "Avenue E"
|
// delete "Avenue" in "Avenue E"
|
||||||
other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
|
other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
|
||||||
skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
|
skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
|
||||||
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// If we encounter an ignorable phrase
|
// If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", this is probably a legit token instead of a pre-directional
|
||||||
skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase);
|
skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase);
|
||||||
log_debug("phrase is possible root = %d\n", skip_edge_phrase);
|
log_debug("phrase is possible root = %d\n", skip_edge_phrase);
|
||||||
}
|
}
|
||||||
@@ -956,16 +962,15 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
|
|||||||
if (other_phrase.start == 0) {
|
if (other_phrase.start == 0) {
|
||||||
//other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components);
|
//other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components);
|
||||||
skip_edge_phrase = false;
|
skip_edge_phrase = false;
|
||||||
if (current_phrase_have_edge_ignorable) {
|
if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
|
||||||
// don't delete the "E" in "Avenue E"
|
// don't delete the "E" in "Avenue E"
|
||||||
log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n");
|
log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n");
|
||||||
skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
|
skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
|
||||||
//skip_edge_phrase = !other_phrase_invalid;
|
|
||||||
} else {
|
} else {
|
||||||
log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n");
|
log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n");
|
||||||
// delete "St" in "E St"
|
// delete "St" in "E St"
|
||||||
other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
|
other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
|
||||||
skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
|
skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
|
||||||
//skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
|
//skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -976,10 +981,17 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (phrase.start == prev_phrase.start && phrase.len == prev_phrase.len && skipped_last_edge_phrase) {
|
||||||
|
skip_edge_phrase = true;
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t j = 0; j < expansions->n; j++) {
|
for (size_t j = 0; j < expansions->n; j++) {
|
||||||
if (skip_edge_phrase) {
|
if (skip_edge_phrase) {
|
||||||
|
skipped_last_edge_phrase = true;
|
||||||
log_debug("skip edge phrase\n");
|
log_debug("skip edge phrase\n");
|
||||||
continue;
|
continue;
|
||||||
|
} else {
|
||||||
|
skipped_last_edge_phrase = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
address_expansion_t expansion = expansions->a[j];
|
address_expansion_t expansion = expansions->a[j];
|
||||||
@@ -998,7 +1010,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
|
|||||||
current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous;
|
current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous;
|
||||||
|
|
||||||
// Edge phrase calculations from above
|
// Edge phrase calculations from above
|
||||||
if (current_phrase_have_edge_ignorable || other_phrase_have_edge_ignorable) {
|
if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) {
|
||||||
log_debug("current_phrase_have_edge_ignorable\n");
|
log_debug("current_phrase_have_edge_ignorable\n");
|
||||||
log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
|
log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
|
||||||
current_phrase_ignorable = skip_edge_phrase;
|
current_phrase_ignorable = skip_edge_phrase;
|
||||||
|
|||||||
Reference in New Issue
Block a user