diff --git a/src/address_dictionary.c b/src/address_dictionary.c
index 2651c179..29d30d06 100644
--- a/src/address_dictionary.c
+++ b/src/address_dictionary.c
@@ -18,12 +18,26 @@ address_expansion_array *address_dictionary_get_expansions(char *key) {
     return k != kh_end(address_dict->expansions) ? kh_value(address_dict->expansions, k) : NULL;
 }
 
+// Returns the index the next canonical string will be stored at, or -1 if the dictionary is not set up.
+int32_t address_dictionary_next_canonical_index(void) {
+    if (address_dict == NULL || address_dict->canonical == NULL) return -1;
+    return (int32_t)cstring_array_num_strings(address_dict->canonical);
+}
+
+// Adds a canonical string to the dictionary; returns false if the dictionary is not set up.
+bool address_dictionary_add_canonical(char *canonical) {
+    if (address_dict == NULL || address_dict->canonical == NULL) return false;
+    cstring_array_add_string(address_dict->canonical, canonical);
+    return true;
+}
+
 char *address_dictionary_get_canonical(uint32_t index) {
-    if (address_dict == NULL || address_dict->canonical == NULL || index > cstring_array_num_strings(address_dict->canonical)) return NULL;
+    // Valid indices are 0..num_strings-1, so index == num_strings must be rejected as well
+    if (address_dict == NULL || address_dict->canonical == NULL || index >= cstring_array_num_strings(address_dict->canonical)) return NULL;
     return cstring_array_get_string(address_dict->canonical, index);
 }
 
-bool address_dictionary_add_expansion(char *key, char *canonical, char *language, uint16_t dictionary_id, uint16_t address_components) {
+bool address_dictionary_add_expansion(char *key, address_expansion_t expansion) {
     int ret;
 
     log_debug("key=%s\n", key);
@@ -33,22 +47,7 @@ bool address_dictionary_add_expansion(char *key, char *canonical, char *language
 
     expansion_value_t value;
     value.value = 0;
-
-    if (canonical == NULL) {
-        canonical_index = -1;
-        value.canonical = 1;
-    } else {
-        canonical_index = (int32_t) cstring_array_num_strings(address_dict->canonical);
-        cstring_array_add_string(address_dict->canonical, canonical);
-        value.canonical = 0;
-    }
-
-    address_expansion_t expansion;
-
-    expansion.canonical_index = canonical_index;
-    strcpy(expansion.language, language);
-    expansion.dictionary_id = dictionary_id;
-    expansion.address_components = address_components;
+    value.canonical = expansion.canonical_index == -1;
 
     if (expansions == NULL) {
         expansions = address_expansion_array_new_size(1);
@@ -57,16 +56,14 @@ bool address_dictionary_add_expansion(char *key, char *canonical, char *language
         kh_value(address_dict->expansions, k) = expansions;
 
         value.count = 1;
-        value.components = address_components;
+        value.components = expansion.address_components;
         log_debug("value.count=%d, value.components=%d\n", value.count, value.components);
 
         trie_add(address_dict->trie, key, value.value);
     } else {
-
         uint32_t node_id = trie_get(address_dict->trie, key);
         log_debug("node_id=%d\n", node_id);
         if (node_id != NULL_NODE_ID) {
-
             if (!trie_get_data_at_index(address_dict->trie, node_id, &value.value)) {
                 log_warn("get_data_at_index returned false\n");
                 return false;
@@ -79,7 +76,7 @@ bool address_dictionary_add_expansion(char *key, char *canonical, char *language
             }
 
             value.count++;
-            value.components |= address_components;
+            value.components |= expansion.address_components;
 
             if (!trie_set_data_at_index(address_dict->trie, node_id, value.value)) {
                 log_warn("set_data_at_index returned false for node_id=%d and value=%d\n", node_id, value.value);
@@ -187,10 +184,21 @@ static bool address_expansion_read(FILE *f, address_expansion_t *expansion) {
         return false;
     }
 
-    if (!file_read_uint16(f, &expansion->dictionary_id)) {
+    if (!file_read_uint32(f, (uint32_t *)&expansion->num_dictionaries)) {
         return false;
     }
 
+    // The count comes from the file: reject anything that would overflow dictionary_ids
+    if (expansion->num_dictionaries > MAX_DICTIONARY_TYPES) {
+        return false;
+    }
+
+    for (uint32_t i = 0; i < expansion->num_dictionaries; i++) {
+        if (!file_read_uint16(f, (uint16_t *)expansion->dictionary_ids + i)) {
+            return false;
+        }
+    }
+
     if (!file_read_uint16(f, &expansion->address_components)) {
         return false;
     }
@@ -207,12 +215,21 @@ static bool address_expansion_write(FILE *f, address_expansion_t expansion) {
     if (!file_write_uint32(f, (uint32_t)expansion.canonical_index) ||
         !file_write_uint32(f, language_len) ||
         !file_write_chars(f, expansion.language, language_len) ||
-        !file_write_uint16(f, expansion.dictionary_id) ||
-        !file_write_uint16(f, expansion.address_components)
+        !file_write_uint32(f, expansion.num_dictionaries)
        ) {
         return false;
     }
 
+    for (uint32_t i = 0; i < expansion.num_dictionaries; i++) {
+        if (!file_write_uint16(f, expansion.dictionary_ids[i])) {
+            return false;
+        }
+    }
+
+    if (!file_write_uint16(f, expansion.address_components)) {
+        return false;
+    }
+
     return true;
 }
 
diff --git a/src/address_dictionary.h b/src/address_dictionary.h
index 7d7e920b..51f865a4 100644
--- a/src/address_dictionary.h
+++ b/src/address_dictionary.h
@@ -35,7 +35,8 @@ typedef union expansion_value {
 typedef struct address_expansion {
     int32_t canonical_index;
     char language[MAX_LANGUAGE_LEN];
-    uint16_t dictionary_id;
+    uint32_t num_dictionaries;
+    uint16_t dictionary_ids[MAX_DICTIONARY_TYPES];
     uint16_t address_components;
 } address_expansion_t;
 
@@ -56,7 +57,9 @@ bool address_dictionary_init(void);
 phrase_array *search_address_dictionaries(char *str, char *lang);
 address_expansion_array *address_dictionary_get_expansions(char *key);
 char *address_dictionary_get_canonical(uint32_t index);
-bool address_dictionary_add_expansion(char *key, char *canonical, char *language, uint16_t dictionary_id, uint16_t address_components);
+int32_t address_dictionary_next_canonical_index(void);
+bool address_dictionary_add_canonical(char *canonical);
+bool address_dictionary_add_expansion(char *key, address_expansion_t expansion);
 
 void address_dictionary_destroy(address_dictionary_t *self);
 
diff --git a/src/address_dictionary_builder.c b/src/address_dictionary_builder.c
index ad42a79d..1ffd11bd 100644
--- a/src/address_dictionary_builder.c
+++ b/src/address_dictionary_builder.c
@@ -26,6 +26,8 @@ int main(int argc, char **argv) {
 
     khash_t(int_int) *dictionary_components = kh_init(int_int);
 
+    khash_t(str_int) *canonical_indices = kh_init(str_int);
+
     khiter_t k;
 
     for (int g = 0; g < NUM_DICTIONARY_TYPES; g++) {
@@ -41,26 +43,71 @@ int main(int argc, char **argv) {
         address_language_index_t lang_index = expansion_languages[i];
         char *language = lang_index.language;
 
+        log_info("Doing language: %s\n", language);
+
         for (int j = lang_index.index; j < lang_index.index + lang_index.len; j++) {
             address_expansion_rule_t expansion_rule = expansion_rules[j];
 
-            uint16_t dictionary_id = (uint16_t) expansion_rule.dictionary;
-            k = kh_get(int_int, dictionary_components, (uint32_t)dictionary_id);
-            if (k == kh_end(dictionary_components)) {
-                log_error("Invalid dictionary_type: %d\n", dictionary_id);
-                exit(EXIT_FAILURE);
+            uint16_t address_components = 0;
+
+            address_expansion_t expansion;
+
+            // dictionary_ids holds at most MAX_DICTIONARY_TYPES entries; refuse rules that would overflow it
+            if (expansion_rule.num_dictionaries > MAX_DICTIONARY_TYPES) {
+                log_error("Too many dictionaries for phrase: %s\n", expansion_rule.phrase);
+                exit(EXIT_FAILURE);
+            }
+
+            strcpy(expansion.language, language);
+            expansion.num_dictionaries = expansion_rule.num_dictionaries;
+
+            for (int d = 0; d < expansion_rule.num_dictionaries; d++) {
+                uint16_t dictionary_id = (uint16_t) expansion_rule.dictionaries[d];
+
+                expansion.dictionary_ids[d] = dictionary_id;
+
+                k = kh_get(int_int, dictionary_components, (uint32_t)dictionary_id);
+                if (k == kh_end(dictionary_components)) {
+                    log_error("Invalid dictionary_type: %d\n", dictionary_id);
+                    exit(EXIT_FAILURE);
+                }
+                address_components |= (uint16_t) kh_value(dictionary_components, k);
             }
 
-            uint16_t address_components = (uint16_t) kh_value(dictionary_components, k);
+            expansion.address_components = address_components;
 
             char *canonical = NULL;
             if (expansion_rule.canonical_index != -1) {
                 canonical = canonical_strings[expansion_rule.canonical_index];
             }
 
+            if (canonical == NULL) {
+                expansion.canonical_index = -1;
+            } else {
+                k = kh_get(str_int, canonical_indices, canonical);
+                if (k != kh_end(canonical_indices)) {
+                    expansion.canonical_index = kh_value(canonical_indices, k);
+                } else {
+                    expansion.canonical_index = address_dictionary_next_canonical_index();
+                    if (!address_dictionary_add_canonical(canonical)) {
+                        log_error("Error adding canonical string: %s\n", canonical);
+                        exit(EXIT_FAILURE);
+                    }
+                    // Remember the index so later rules with the same canonical string reuse it
+                    int put_ret;
+                    k = kh_put(str_int, canonical_indices, canonical, &put_ret);
+                    if (put_ret < 0) {
+                        log_error("Error caching canonical string: %s\n", canonical);
+                        exit(EXIT_FAILURE);
+                    }
+                    kh_value(canonical_indices, k) = expansion.canonical_index;
+                }
+
+            }
+
             // Add the phrase itself to the base namespace for existence checks
 
-            if (!address_dictionary_add_expansion(expansion_rule.phrase, canonical, language, dictionary_id, address_components)) {
+            if (!address_dictionary_add_expansion(expansion_rule.phrase, expansion)) {
                 log_error("Could not add expansion {%s}\n", expansion_rule.phrase);
                 exit(EXIT_FAILURE);
             }
@@ -73,7 +120,7 @@ int main(int argc, char **argv) {
             char_array_cat(key, expansion_rule.phrase);
             char *token = char_array_get_string(key);
 
-            if (!address_dictionary_add_expansion(token, canonical, language, dictionary_id, address_components)) {
+            if (!address_dictionary_add_expansion(token, expansion)) {
                 log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase);
                 exit(EXIT_FAILURE);
             }
@@ -86,5 +133,7 @@ int main(int argc, char **argv) {
 
     kh_destroy(int_int, dictionary_components);
 
+    kh_destroy(str_int, canonical_indices);
+
     address_dictionary_module_teardown();
 }
\ No newline at end of file