[expansion] Prefix/suffix expansions by default can apply to ADDRESS_ANY but also inherit the types of any dictionary that lists their canonical form (so we can add suffixes without worrying about whether they're for streets or place names, etc.)
This commit is contained in:
@@ -30,20 +30,30 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
khiter_t k;
|
khiter_t k;
|
||||||
|
|
||||||
|
int ret;
|
||||||
|
|
||||||
for (int g = 0; g < NUM_DICTIONARY_TYPES; g++) {
|
for (int g = 0; g < NUM_DICTIONARY_TYPES; g++) {
|
||||||
gazetteer_t gazetteer = gazetteer_config[g];
|
gazetteer_t gazetteer = gazetteer_config[g];
|
||||||
int ret;
|
|
||||||
k = kh_put(int_uint32, dictionary_components, (uint32_t)gazetteer.type, &ret);
|
k = kh_put(int_uint32, dictionary_components, (uint32_t)gazetteer.type, &ret);
|
||||||
kh_value(dictionary_components, k) = (uint32_t) gazetteer.address_components;
|
kh_value(dictionary_components, k) = (uint32_t) gazetteer.address_components;
|
||||||
}
|
}
|
||||||
|
|
||||||
char_array *key = char_array_new();
|
char_array *key = char_array_new();
|
||||||
|
|
||||||
for (int i = 0; i < sizeof(expansion_languages) / sizeof(address_language_index_t); i++) {
|
size_t num_expansion_languages = sizeof(expansion_languages) / sizeof(address_language_index_t);
|
||||||
|
|
||||||
|
log_info("num_expansion_languages = %zu\n", num_expansion_languages);
|
||||||
|
|
||||||
|
khash_t(str_uint32) *phrase_address_components = kh_init(str_uint32);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < num_expansion_languages; i++) {
|
||||||
address_language_index_t lang_index = expansion_languages[i];
|
address_language_index_t lang_index = expansion_languages[i];
|
||||||
char *language = lang_index.language;
|
char *language = lang_index.language;
|
||||||
|
|
||||||
|
kh_clear(str_uint32, phrase_address_components);
|
||||||
|
|
||||||
log_info("Doing language: %s\n", language);
|
log_info("Doing language: %s\n", language);
|
||||||
|
for (int add_affixes = 0; add_affixes <= 1; add_affixes++) {
|
||||||
|
|
||||||
for (int j = lang_index.index; j < lang_index.index + lang_index.len; j++) {
|
for (int j = lang_index.index; j < lang_index.index + lang_index.len; j++) {
|
||||||
address_expansion_rule_t expansion_rule = expansion_rules[j];
|
address_expansion_rule_t expansion_rule = expansion_rules[j];
|
||||||
@@ -56,6 +66,8 @@ int main(int argc, char **argv) {
|
|||||||
strcpy(expansion.language, language);
|
strcpy(expansion.language, language);
|
||||||
expansion.num_dictionaries = expansion_rule.num_dictionaries;
|
expansion.num_dictionaries = expansion_rule.num_dictionaries;
|
||||||
|
|
||||||
|
bool is_affix = false;
|
||||||
|
|
||||||
for (int d = 0; d < expansion_rule.num_dictionaries; d++) {
|
for (int d = 0; d < expansion_rule.num_dictionaries; d++) {
|
||||||
uint16_t dictionary_id = (uint16_t) expansion_rule.dictionaries[d];
|
uint16_t dictionary_id = (uint16_t) expansion_rule.dictionaries[d];
|
||||||
|
|
||||||
@@ -65,6 +77,9 @@ int main(int argc, char **argv) {
|
|||||||
dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE ||
|
dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE ||
|
||||||
dictionary_id == DICTIONARY_ELISION) {
|
dictionary_id == DICTIONARY_ELISION) {
|
||||||
expansion.separable = 1;
|
expansion.separable = 1;
|
||||||
|
is_affix = true;
|
||||||
|
} else if (dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE) {
|
||||||
|
is_affix = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
k = kh_get(int_uint32, dictionary_components, (uint32_t)dictionary_id);
|
k = kh_get(int_uint32, dictionary_components, (uint32_t)dictionary_id);
|
||||||
@@ -75,29 +90,70 @@ int main(int argc, char **argv) {
|
|||||||
address_components |= (uint16_t) kh_value(dictionary_components, k);
|
address_components |= (uint16_t) kh_value(dictionary_components, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
expansion.address_components = address_components;
|
|
||||||
|
|
||||||
char *canonical = NULL;
|
char *canonical = NULL;
|
||||||
if (expansion_rule.canonical_index != -1) {
|
if (expansion_rule.canonical_index != -1) {
|
||||||
canonical = canonical_strings[expansion_rule.canonical_index];
|
canonical = canonical_strings[expansion_rule.canonical_index];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!add_affixes || !is_affix) {
|
||||||
|
expansion.address_components = address_components;
|
||||||
|
} else {
|
||||||
|
char *phrase = canonical == NULL ? expansion_rule.phrase : canonical;
|
||||||
|
|
||||||
|
k = kh_get(str_uint32, phrase_address_components, phrase);
|
||||||
|
|
||||||
|
if (k != kh_end(phrase_address_components)) {
|
||||||
|
uint32_t val = kh_value(phrase_address_components, k);
|
||||||
|
expansion.address_components = (uint16_t)val;
|
||||||
|
} else {
|
||||||
|
expansion.address_components = address_components;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (canonical == NULL) {
|
if (canonical == NULL) {
|
||||||
expansion.canonical_index = -1;
|
expansion.canonical_index = -1;
|
||||||
|
|
||||||
|
if (!add_affixes && is_affix) {
|
||||||
|
uint32_t phrase_components = (uint32_t)address_components;
|
||||||
|
|
||||||
|
k = kh_get(str_uint32, phrase_address_components, expansion_rule.phrase);
|
||||||
|
if (k != kh_end(phrase_address_components)) {
|
||||||
|
phrase_components |= kh_value(phrase_address_components, k);
|
||||||
|
}
|
||||||
|
|
||||||
|
k = kh_put(str_uint32, phrase_address_components, expansion_rule.phrase, &ret);
|
||||||
|
|
||||||
|
if (ret < 0) {
|
||||||
|
log_error("Error in kh_put on phrase_address_components\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
kh_value(phrase_address_components, k) = phrase_components;
|
||||||
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
if (add_affixes) {
|
||||||
k = kh_get(str_uint32, canonical_indices, canonical);
|
k = kh_get(str_uint32, canonical_indices, canonical);
|
||||||
if (k != kh_end(canonical_indices)) {
|
if (k != kh_end(canonical_indices)) {
|
||||||
expansion.canonical_index = kh_value(canonical_indices, k);
|
expansion.canonical_index = kh_value(canonical_indices, k);
|
||||||
} else {
|
} else {
|
||||||
expansion.canonical_index = address_dictionary_next_canonical_index();
|
uint32_t canonical_index = address_dictionary_next_canonical_index();
|
||||||
if (!address_dictionary_add_canonical(canonical)) {
|
if (!address_dictionary_add_canonical(canonical)) {
|
||||||
log_error("Error adding canonical string: %s\n", canonical);
|
log_error("Error adding canonical string: %s\n", canonical);
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
expansion.canonical_index = canonical_index;
|
||||||
|
k = kh_put(str_uint32, canonical_indices, canonical, &ret);
|
||||||
|
if (ret < 0) {
|
||||||
|
log_error("Error in kh_put on canonical_indices\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
kh_value(canonical_indices, k) = canonical_index;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
if (add_affixes) {
|
||||||
|
|
||||||
// Add the phrase itself to the base namespace for existence checks
|
// Add the phrase itself to the base namespace for existence checks
|
||||||
|
|
||||||
if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) {
|
if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) {
|
||||||
@@ -111,8 +167,13 @@ int main(int argc, char **argv) {
|
|||||||
log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase);
|
log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase);
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
address_dictionary_save(output_file);
|
address_dictionary_save(output_file);
|
||||||
|
|
||||||
@@ -121,6 +182,7 @@ int main(int argc, char **argv) {
|
|||||||
kh_destroy(int_uint32, dictionary_components);
|
kh_destroy(int_uint32, dictionary_components);
|
||||||
|
|
||||||
kh_destroy(str_uint32, canonical_indices);
|
kh_destroy(str_uint32, canonical_indices);
|
||||||
|
kh_destroy(str_uint32, phrase_address_components);
|
||||||
|
|
||||||
address_dictionary_module_teardown();
|
address_dictionary_module_teardown();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,9 +6,9 @@ gazetteer_t gazetteer_config[] = {
|
|||||||
{DICTIONARY_CATEGORY, ADDRESS_CATEGORY},
|
{DICTIONARY_CATEGORY, ADDRESS_CATEGORY},
|
||||||
{DICTIONARY_CHAIN, ADDRESS_NAME},
|
{DICTIONARY_CHAIN, ADDRESS_NAME},
|
||||||
{DICTIONARY_COMPANY_TYPE, ADDRESS_NAME},
|
{DICTIONARY_COMPANY_TYPE, ADDRESS_NAME},
|
||||||
{DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_STREET},
|
{DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_ANY},
|
||||||
{DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_STREET},
|
{DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_ANY},
|
||||||
{DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_STREET},
|
{DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_ANY},
|
||||||
{DICTIONARY_CROSS_STREET, ADDRESS_STREET},
|
{DICTIONARY_CROSS_STREET, ADDRESS_STREET},
|
||||||
{DICTIONARY_DIRECTIONAL, ADDRESS_ANY},
|
{DICTIONARY_DIRECTIONAL, ADDRESS_ANY},
|
||||||
{DICTIONARY_ELISION, ADDRESS_ANY},
|
{DICTIONARY_ELISION, ADDRESS_ANY},
|
||||||
|
|||||||
Reference in New Issue
Block a user