[expand] no longer delete phrases in cases like "PH 1" for units, where there's a phrase that can accompany numbered units and thus be ignored similar to "Apt 1" but that phrase may also be a qualifier (i.e. Apt 1 and Penthouse 1 are not the same)

This commit is contained in:
Al
2017-12-17 21:57:21 -05:00
parent 1d22da603f
commit 727469b736

View File

@@ -449,7 +449,7 @@ bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token,
}
inline uint32_t gazetter_ignorable_components(uint16_t dictionary_id) {
inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) {
switch (dictionary_id) {
case DICTIONARY_ACADEMIC_DEGREE:
return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
@@ -468,17 +468,17 @@ inline uint32_t gazetter_ignorable_components(uint16_t dictionary_id) {
case DICTIONARY_LEVEL_NUMBERED:
return LIBPOSTAL_ADDRESS_LEVEL;
case DICTIONARY_LEVEL_STANDALONE:
return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL;
return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
case DICTIONARY_LEVEL_MEZZANINE:
return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL;
return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL| LIBPOSTAL_ADDRESS_ANY);
case DICTIONARY_LEVEL_BASEMENT:
return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL;
return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
case DICTIONARY_LEVEL_SUB_BASEMENT:
return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL;
return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
case DICTIONARY_NUMBER:
return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET;
case DICTIONARY_NO_NUMBER:
return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_HOUSE_NUMBER;
return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY);
case DICTIONARY_PERSONAL_TITLE:
return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
case DICTIONARY_PLACE_NAME:
@@ -498,15 +498,15 @@ inline uint32_t gazetter_ignorable_components(uint16_t dictionary_id) {
case DICTIONARY_UNIT_NUMBERED:
return LIBPOSTAL_ADDRESS_UNIT;
case DICTIONARY_UNIT_STANDALONE:
return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_UNIT;
return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY);
case DICTIONARY_UNIT_DIRECTION:
return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_UNIT;
return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY);
default:
return LIBPOSTAL_ADDRESS_NONE;
}
}
inline uint32_t gazetter_edge_ignorable_components(uint16_t dictionary_id) {
inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) {
switch (dictionary_id) {
// Pre/post directionals can be removed if there are non-phrase tokens
case DICTIONARY_DIRECTIONAL:
@@ -516,7 +516,25 @@ inline uint32_t gazetter_edge_ignorable_components(uint16_t dictionary_id) {
}
}
inline uint32_t gazetter_possible_root_components(uint16_t dictionary_id) {
inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) {
switch (dictionary_id) {
case DICTIONARY_LEVEL_STANDALONE:
return LIBPOSTAL_ADDRESS_LEVEL;
case DICTIONARY_LEVEL_MEZZANINE:
return LIBPOSTAL_ADDRESS_LEVEL;
case DICTIONARY_LEVEL_BASEMENT:
return LIBPOSTAL_ADDRESS_LEVEL;
case DICTIONARY_LEVEL_SUB_BASEMENT:
return LIBPOSTAL_ADDRESS_LEVEL;
case DICTIONARY_UNIT_STANDALONE:
return LIBPOSTAL_ADDRESS_UNIT;
default:
return LIBPOSTAL_ADDRESS_NONE;
}
}
inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) {
switch (dictionary_id) {
case DICTIONARY_ACADEMIC_DEGREE:
return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
@@ -537,37 +555,58 @@ inline uint32_t gazetter_possible_root_components(uint16_t dictionary_id) {
}
}
inline bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
typedef enum {
GAZETTEER_MATCH_IGNORABLE,
GAZETTEER_MATCH_EDGE_IGNORABLE,
GAZETTEER_MATCH_POSSIBLE_ROOT,
GAZETTEER_MATCH_SPECIFIER
} gazetteer_match_type_t;
inline bool address_expansion_matches_type_for_components(address_expansion_t expansion, uint32_t address_components, gazetteer_match_type_t match_type) {
for (uint32_t j = 0; j < expansion.num_dictionaries; j++) {
uint16_t dictionary_id = expansion.dictionary_ids[j];
if (gazetter_ignorable_components(dictionary_id) & address_components) {
uint32_t components = 0;
switch (match_type) {
case GAZETTEER_MATCH_IGNORABLE:
components = gazetteer_ignorable_components(dictionary_id);
break;
case GAZETTEER_MATCH_EDGE_IGNORABLE:
components = gazetteer_edge_ignorable_components(dictionary_id);
break;
case GAZETTEER_MATCH_POSSIBLE_ROOT:
components = gazetteer_possible_root_components(dictionary_id);
break;
case GAZETTEER_MATCH_SPECIFIER:
components = gazetteer_specifier_components(dictionary_id);
break;
default:
break;
}
if (components & address_components) {
return true;
}
}
return false;
}
inline bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_IGNORABLE);
}
inline bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
for (uint32_t j = 0; j < expansion.num_dictionaries; j++) {
uint16_t dictionary_id = expansion.dictionary_ids[j];
if (gazetter_edge_ignorable_components(dictionary_id) & address_components) {
return true;
}
}
return false;
return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE);
}
inline bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) {
for (uint32_t j = 0; j < expansion.num_dictionaries; j++) {
uint16_t dictionary_id = expansion.dictionary_ids[j];
if (gazetter_possible_root_components(dictionary_id) & address_components) {
return true;
}
}
return false;
return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT);
}
bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
inline bool address_expansion_is_specifier_for_components(address_expansion_t expansion, uint32_t address_components) {
return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER);
}
bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) {
uint32_t expansion_index = phrase.data;
address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
@@ -579,54 +618,29 @@ bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t addres
for (size_t i = 0; i < expansions->n; i++) {
address_expansion_t expansion = expansions->a[i];
if (address_expansion_is_ignorable_for_components(expansion, address_components)) {
if (address_expansion_matches_type_for_components(expansion, address_components, match_type)) {
return true;
}
}
return false;
}
bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
uint32_t expansion_index = phrase.data;
address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
if (value == NULL) return false;
address_expansion_array *expansions = value->expansions;
if (expansions == NULL) return false;
for (size_t i = 0; i < expansions->n; i++) {
address_expansion_t expansion = expansions->a[i];
if (address_expansion_is_edge_ignorable_for_components(expansion, address_components)) {
return true;
inline bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_IGNORABLE);
}
}
return false;
inline bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE);
}
bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) {
uint32_t expansion_index = phrase.data;
address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
if (value == NULL) return false;
address_expansion_array *expansions = value->expansions;
if (expansions == NULL) return false;
for (size_t i = 0; i < expansions->n; i++) {
address_expansion_t expansion = expansions->a[i];
if (address_expansion_is_possible_root_for_components(expansion, address_components)) {
return true;
}
}
return false;
inline bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) {
return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT);
}
inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components) {
return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER);
}
bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) {
address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data);
@@ -646,9 +660,6 @@ bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) {
return false;
}
// Delete non-canonical phrases only
string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
@@ -889,8 +900,11 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components);
bool current_phrase_have_edge_ignorable = false;
bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components);
bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase);
log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier);
bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase);
/*
@@ -1009,11 +1023,20 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous;
if (!is_canonical) {
char *canon = address_dictionary_get_canonical(expansion.canonical_index);
log_debug("canonical = %s\n", canon);
}
// Edge phrase calculations from above
if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) {
log_debug("current_phrase_have_edge_ignorable\n");
log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
current_phrase_ignorable = skip_edge_phrase;
// Don't delete "PH" in "PH 1" for unit expansions
} else if (is_ignorable && have_non_phrase_tokens && current_phrase_have_specifier) {
log_debug("current_phrase_have_specifier\n");
current_phrase_ignorable = false;
// Delete "Avenue" in "5th Avenue"
} else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) {
log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n");
@@ -1026,10 +1049,12 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
} else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases)) {
log_debug("have_non_phrase_tokens = %d, have_canonical_phrases = %d\n", have_non_phrase_tokens, have_canonical_phrases);
current_phrase_ignorable = is_ignorable || (is_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous);
current_phrase_ignorable = is_ignorable || (current_phrase_have_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous);
log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens\n");
log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
} else {
log_debug("none of the above\n");
}
if (!current_phrase_ignorable && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !added_pre_phrase_space) {
@@ -1064,11 +1089,11 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
if (phrase.start + phrase.len < num_tokens - 1) {
token_t next_token = tokens[phrase.start + phrase.len];
if (!is_numeric_token(next_token.type)) {
log_debug("non-canonical phrase, adding canonical string\n");
log_debug("non-canonical phrase, adding canonical string: %s\n", canonical);
string_tree_add_string(tree, canonical);
last_added_was_whitespace = false;
} else {
log_debug("adding canonical with cstring_array methods\n");
log_debug("adding canonical with cstring_array methods: %s\n", canonical);
uint32_t start_index = cstring_array_start_token(tree->strings);
cstring_array_append_string(tree->strings, canonical);
cstring_array_append_string(tree->strings, " ");
@@ -1076,6 +1101,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
cstring_array_terminate(tree->strings);
}
} else {
log_debug("adding canonical: %s\n", canonical);
string_tree_add_string(tree, canonical);
last_added_was_whitespace = false;
}
@@ -1223,7 +1249,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
}
if (token.type != WHITESPACE) {
if (last_was_punctuation && !last_added_was_whitespace) {
if (last_was_punctuation && !last_added_was_whitespace && string_tree_num_strings(tree) > 0) {
log_debug("Adding space V\n");
string_tree_add_string(tree, " ");
string_tree_finalize_token(tree);
@@ -1231,7 +1257,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
last_added_was_whitespace = false;
} else if (!last_added_was_whitespace) {
} else if (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) {
log_debug("Adding space VI\n");
string_tree_add_string(tree, " ");
last_added_was_whitespace = true;