From 92051863ba807b73fc7945b573afaa3c54e71d6c Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 18 Apr 2017 17:20:02 -0400 Subject: [PATCH 1/4] [numex] adding ordinal suffixes themselves to the numex trie so they can be removed from strings --- src/numex.c | 93 ++++++++++++++++++++++++++++-- src/numex.h | 12 +++- src/numex_table_builder.c | 118 +++++++++++++++++++++----------------- 3 files changed, 161 insertions(+), 62 deletions(-) diff --git a/src/numex.c b/src/numex.c index 17013bbd..ff34fcfb 100644 --- a/src/numex.c +++ b/src/numex.c @@ -911,7 +911,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { return results; } -static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lang, gender_t gender, grammatical_category_t category, bool use_default_if_not_found) { +static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lang, char *ns, gender_t gender, grammatical_category_t category, bool use_default_if_not_found) { numex_language_t *language = get_numex_language(lang); if (language == NULL) { @@ -926,7 +926,7 @@ static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lan return NULL_PREFIX_RESULT; } - prefix = trie_get_prefix_from_index(trie, ORDINAL_NAMESPACE_PREFIX, ORDINAL_NAMESPACE_PREFIX_LEN, prefix.node_id, prefix.tail_pos); + prefix = trie_get_prefix_from_index(trie, ns, strlen(ns), prefix.node_id, prefix.tail_pos); if (prefix.node_id == NULL_NODE_ID) { return NULL_PREFIX_RESULT; @@ -976,7 +976,7 @@ static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lan return prefix; } -char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result) { +static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, gender_t gender, grammatical_category_t category) { if (numex_table == NULL) { log_error(NUMEX_SETUP_ERROR); return NULL; @@ -988,13 +988,13 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t 
result } bool use_default_if_not_found = true; - trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, result.gender, result.category, use_default_if_not_found); + trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, ORDINAL_NAMESPACE_PREFIX, gender, category, use_default_if_not_found); if (prefix.node_id == NULL_NODE_ID) { return NULL; } - phrase_t phrase = trie_search_suffixes_from_index(trie, numeric_string, strlen(numeric_string), prefix.node_id); + phrase_t phrase = trie_search_suffixes_from_index(trie, numeric_string, len, prefix.node_id); if (phrase.len == 0) { return NULL; @@ -1009,6 +1009,87 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result } +// Byte length of the leading ASCII digit run in str[0..len); 0 if str does not start with a digit or a digit follows a non-digit +static size_t possible_ordinal_digit_len(char *str, size_t len) { + uint8_t *ptr = (uint8_t *)str; + size_t idx = 0; + + bool is_digit = false; + bool last_was_digit = false; + + int32_t ch; + + size_t digit_len = 0; + + while (idx < len) { + // Decode from only the bytes that remain after ptr so the read stays inside str[0..len) + ssize_t char_len = utf8proc_iterate(ptr, len - idx, &ch); + + if (char_len <= 0) break; + if (ch == 0) break; + if (!(utf8proc_codepoint_valid(ch))) return 0; + + // 0-9 only for this + is_digit = ch >= 48 && ch <= 57; + + if ((idx == 0 && !is_digit) || (idx > 0 && is_digit && !last_was_digit)) { + return 0; + } + + if (is_digit) { + digit_len += char_len; + } + + ptr += char_len; + idx += char_len; + last_was_digit = is_digit; + } + + return digit_len; +} + +size_t ordinal_suffix_len(char *str, size_t len, char *lang) { + if (str == NULL || len == 0) { + return 0; + } + + size_t ordinal_digit_len = possible_ordinal_digit_len(str, len); + if (ordinal_digit_len == 0) { + return 0; + } + + if (numex_table == NULL) { + log_error(NUMEX_SETUP_ERROR); + return 0; + } + + trie_t *trie = numex_table->trie; + if (trie == NULL) { + return 0; + } + + bool use_default_if_not_found = false; + + // Default (GENDER_NONE and CATEGORY_DEFAULT) are at the end of the enums, so iterate backward + for (int
gender = NUM_GENDERS - 1; gender >= 0; gender--) { + for (int category = NUM_CATEGORIES - 1; category >= 0; category--) { + trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, ORDINAL_PHRASE_NAMESPACE_PREFIX, gender, category, use_default_if_not_found); + + if (prefix.node_id == NULL_NODE_ID) { + continue; + } + + phrase_t phrase = trie_search_suffixes_from_index(trie, str, len, prefix.node_id); + + if (phrase.len == len - ordinal_digit_len) { + return len - ordinal_digit_len; + } + } + } + + return 0; +} + char *replace_numeric_expressions(char *str, char *lang) { numex_result_array *results = convert_numeric_expressions(str, lang); if (results == NULL) return NULL; @@ -1040,7 +1121,7 @@ char *replace_numeric_expressions(char *str, char *lang) { char_array_append(replacement, numeric_string); if (result.is_ordinal) { - char *ordinal_suffix = get_ordinal_suffix(numeric_string, lang, result); + char *ordinal_suffix = get_ordinal_suffix(numeric_string, strlen(numeric_string), lang, result.gender, result.category); if (ordinal_suffix != NULL) { char_array_append(replacement, ordinal_suffix); } diff --git a/src/numex.h b/src/numex.h index 9d8d9f4a..c000ff9c 100644 --- a/src/numex.h +++ b/src/numex.h @@ -34,7 +34,8 @@ typedef enum { GENDER_MASCULINE, GENDER_FEMININE, GENDER_NEUTER, - GENDER_NONE + GENDER_NONE, + NUM_GENDERS } gender_t; #define CATEGORY_PLURAL_PREFIX "p" @@ -42,7 +43,8 @@ typedef enum { typedef enum { CATEGORY_PLURAL, - CATEGORY_DEFAULT + CATEGORY_DEFAULT, + NUM_CATEGORIES } grammatical_category_t; typedef enum { @@ -85,10 +87,14 @@ typedef struct numex_rule { VECTOR_INIT(numex_rule_array, numex_rule_t) #define ORDINAL_NAMESPACE_CHAR "o" +#define ORDINAL_PHRASE_NAMESPACE_CHAR "p" #define ORDINAL_NAMESPACE_PREFIX NAMESPACE_SEPARATOR_CHAR ORDINAL_NAMESPACE_CHAR NAMESPACE_SEPARATOR_CHAR #define ORDINAL_NAMESPACE_PREFIX_LEN strlen(ORDINAL_NAMESPACE_PREFIX) +#define ORDINAL_PHRASE_NAMESPACE_PREFIX NAMESPACE_SEPARATOR_CHAR 
ORDINAL_PHRASE_NAMESPACE_CHAR NAMESPACE_SEPARATOR_CHAR +#define ORDINAL_PHRASE_NAMESPACE_PREFIX_LEN strlen(ORDINAL_PHRASE_NAMESPACE_PREFIX) + typedef struct ordinal_indicator { char *key; gender_t gender; @@ -142,7 +148,7 @@ VECTOR_INIT(numex_result_array, numex_result_t) char *replace_numeric_expressions(char *str, char *lang); numex_result_array *convert_numeric_expressions(char *str, char *lang); -char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result); +size_t ordinal_suffix_len(char *s, size_t len, char *lang); bool numex_table_write(FILE *file); bool numex_table_save(char *filename); diff --git a/src/numex_table_builder.c b/src/numex_table_builder.c index 624a98c5..be9f8a14 100644 --- a/src/numex_table_builder.c +++ b/src/numex_table_builder.c @@ -92,72 +92,84 @@ int main(int argc, char **argv) { } for (j = ordinal_indicator_index; j < ordinal_indicator_index + num_ordinal_indicators; j++) { - value = numex_table->ordinal_indicators->n; - ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j]; + for (int ordinal_phrases = 0; ordinal_phrases <= 1; ordinal_phrases++) { + value = numex_table->ordinal_indicators->n; + ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j]; - if (ordinal_source.key == NULL) { - log_error("ordinal source key was NULL at index %d\n", j); - exit(EXIT_FAILURE); - } + if (ordinal_source.key == NULL) { + log_error("ordinal source key was NULL at index %d\n", j); + exit(EXIT_FAILURE); + } - char *ordinal_indicator_key = strdup(ordinal_source.key); - if (ordinal_indicator_key == NULL) { - log_error("Error in strdup\n"); - exit(EXIT_FAILURE); - } - - char *suffix = NULL; - if (ordinal_source.suffix != NULL) { - suffix = strdup(ordinal_source.suffix); - if (suffix == NULL) { + char *ordinal_indicator_key = strdup(ordinal_source.key); + if (ordinal_indicator_key == NULL) { log_error("Error in strdup\n"); exit(EXIT_FAILURE); } - } - ordinal_indicator_t *ordinal = 
ordinal_indicator_new(ordinal_indicator_key, ordinal_source.gender, ordinal_source.category, suffix); - ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal); - char_array_clear(key); - char_array_cat(key, lang); - char_array_cat(key, ORDINAL_NAMESPACE_PREFIX); + char *suffix = NULL; + if (ordinal_source.suffix != NULL) { + suffix = strdup(ordinal_source.suffix); + if (suffix == NULL) { + log_error("Error in strdup\n"); + exit(EXIT_FAILURE); + } + } - switch (ordinal_source.gender) { - case GENDER_MASCULINE: - char_array_cat(key, GENDER_MASCULINE_PREFIX); - break; - case GENDER_FEMININE: - char_array_cat(key, GENDER_FEMININE_PREFIX); - break; - case GENDER_NEUTER: - char_array_cat(key, GENDER_NEUTER_PREFIX); - break; - case GENDER_NONE: - default: - char_array_cat(key, GENDER_NONE_PREFIX); - } + char_array_clear(key); + char_array_cat(key, lang); - switch (ordinal_source.category) { - case CATEGORY_PLURAL: - char_array_cat(key, CATEGORY_PLURAL_PREFIX); - break; - case CATEGORY_DEFAULT: - default: - char_array_cat(key, CATEGORY_DEFAULT_PREFIX); + if (!ordinal_phrases) { + ordinal_indicator_t *ordinal = ordinal_indicator_new(ordinal_indicator_key, ordinal_source.gender, ordinal_source.category, suffix); + ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal); - } + char_array_cat(key, ORDINAL_NAMESPACE_PREFIX); + } else { + char_array_cat(key, ORDINAL_PHRASE_NAMESPACE_PREFIX); + } - char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); + switch (ordinal_source.gender) { + case GENDER_MASCULINE: + char_array_cat(key, GENDER_MASCULINE_PREFIX); + break; + case GENDER_FEMININE: + char_array_cat(key, GENDER_FEMININE_PREFIX); + break; + case GENDER_NEUTER: + char_array_cat(key, GENDER_NEUTER_PREFIX); + break; + case GENDER_NONE: + default: + char_array_cat(key, GENDER_NONE_PREFIX); + } - char *reversed = utf8_reversed_string(ordinal_source.key); - char_array_cat(key, reversed); - free(reversed); + switch (ordinal_source.category) { + case 
CATEGORY_PLURAL: + char_array_cat(key, CATEGORY_PLURAL_PREFIX); + break; + case CATEGORY_DEFAULT: + default: + char_array_cat(key, CATEGORY_DEFAULT_PREFIX); - char *str_key = char_array_get_string(key); + } - if (trie_get(numex_table->trie, str_key) == NULL_NODE_ID) { - trie_add(numex_table->trie, str_key, value); - } else { - log_warn("Key exists: %s, skipping\n", str_key); + char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); + + char *key_str = ordinal_source.key; + + if (ordinal_phrases) { + key_str = suffix; + } + + char *reversed = utf8_reversed_string(key_str); + char_array_cat(key, reversed); + free(reversed); + + char *str_key = char_array_get_string(key); + + if (trie_get(numex_table->trie, str_key) == NULL_NODE_ID) { + trie_add(numex_table->trie, str_key, value); + } } } From cddc368533b583b8b24218be79d0402b4d507214 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 18 Apr 2017 21:39:54 -0400 Subject: [PATCH 2/4] [numex] adding one form of normalization which strips ordinal suffixes so {96th, Ninety-sixth} => 96. This is an additional form of normalization, so there's still one form where the suffixes are kept. One case that's still not handled is something like "IXe Arrondissement" --- src/libpostal.c | 63 +++++++++++++++++------------------- src/normalize.c | 86 ++++++++++++++++++++++++++++++++++++------------- src/normalize.h | 2 ++ 3 files changed, 96 insertions(+), 55 deletions(-) diff --git a/src/libpostal.c b/src/libpostal.c index 92a10dfd..d9b0f436 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -79,6 +79,7 @@ static inline uint64_t get_normalize_string_options(libpostal_normalize_options_ normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0; normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0; normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0; + normalize_string_options |= options.expand_numex ? 
NORMALIZE_STRING_REPLACE_NUMEX : 0; return normalize_string_options; } @@ -558,7 +559,6 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok address_expansion_t prefix_expansion; address_expansion_t suffix_expansion; - char_array *key = char_array_new_size(token.len); char *expansion; size_t num_strings = 0; @@ -582,10 +582,11 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok } if (!have_suffix && !have_prefix) { - char_array_destroy(key); return false; } - + + char_array *key = char_array_new_size(token.len); + if (have_prefix && have_suffix) { for (size_t i = 0; i < prefix_expansions->n; i++) { prefix_expansion = prefix_expansions->a[i]; @@ -760,16 +761,34 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to if ((suffix.len == 0 && prefix.len == 0)) return false; - return add_affix_expansions(tree, str, lang, token, prefix, suffix, options); } +static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); + + if (len_ordinal_suffix == 0) return false; + + cstring_array *strings = tree->strings; + // Add the original form first. When this function returns true, + // add_normalized_strings_token won't be called a second time. 
+ add_normalized_strings_token(strings, str, token, options); + + char_array *key = char_array_new_size(token.len - len_ordinal_suffix + 1); + char_array_cat_len(key, str + token.offset, token.len - len_ordinal_suffix); + char *expansion = char_array_get_string(key); + cstring_array_add_string(strings, expansion); + char_array_destroy(key); + return true; +} + static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { cstring_array *strings = tree->strings; for (size_t i = 0; i < tokens->n; i++) { token_t token = tokens->a[i]; bool have_phrase = false; + bool have_ordinal = false; if (is_special_token(token.type)) { string_tree_add_string_len(tree, str + token.offset, token.len); @@ -783,9 +802,14 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s have_phrase = true; break; } + + if (normalize_ordinal_suffixes(tree, str, lang, token, options)) { + have_ordinal = true; + break; + } } - if (!have_phrase) { + if (!have_phrase && !have_ordinal) { add_normalized_strings_token(strings, str, token, options); } @@ -826,45 +850,18 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_ char_array_terminate(temp_string); char *tokenized_str = char_array_get_string(temp_string); - - char *new_str = tokenized_str; - char *last_numex_str = NULL; - if (options.expand_numex) { - char *numex_replaced = NULL; - for (size_t i = 0; i < options.num_languages; i++) { - lang = options.languages[i]; - - numex_replaced = replace_numeric_expressions(new_str, lang); - if (numex_replaced != NULL) { - new_str = numex_replaced; - - if (last_numex_str != NULL) { - free(last_numex_str); - } - last_numex_str = numex_replaced; - } - } - - } string_tree_t *alternatives; int ret; - log_debug("new_str=%s\n", new_str); - log_debug("Adding alternatives for single normalization\n"); - alternatives = add_string_alternatives(new_str, options); - - if 
(last_numex_str != NULL) { - free(last_numex_str); - } + alternatives = add_string_alternatives(tokenized_str, options); if (alternatives == NULL) { log_debug("alternatives = NULL\n"); continue; } - iter = string_tree_iterator_new(alternatives); log_debug("iter->num_tokens=%d\n", iter->num_tokens); diff --git a/src/normalize.c b/src/normalize.c index e2ae9f6e..dd236703 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -3,8 +3,24 @@ #define FULL_STOP_CODEPOINT 0x002e #define APOSTROPHE_CODEPOINT 0x0027 +char *normalize_replace_numex(char *str, size_t num_languages, char **languages) { + char *numex_normalized = NULL; -char *normalize_string_utf8(char *str, uint64_t options) { + for (size_t i = 0; i < num_languages; i++) { + char *lang = languages[i]; + char *numex_replaced = replace_numeric_expressions(numex_normalized == NULL ? str : numex_normalized, lang); + if (numex_replaced != NULL) { + if (numex_normalized != NULL) { + free(numex_normalized); + } + numex_normalized = numex_replaced; + } + } + + return numex_normalized; +} + +char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages) { int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC; uint8_t *utf8proc_normalized = NULL; @@ -63,7 +79,7 @@ char *normalize_string_utf8(char *str, uint64_t options) { } } - if (options & NORMALIZE_STRING_REPLACE_HYPHENS && strchr(str, '-') != NULL) { + if (options & NORMALIZE_STRING_REPLACE_HYPHENS && string_contains_hyphen(str)) { char *replaced = string_replace_char(str, '-', ' '); if (replaced != NULL) { if (normalized_allocated) { @@ -76,11 +92,28 @@ char *normalize_string_utf8(char *str, uint64_t options) { } } + if (options & NORMALIZE_STRING_REPLACE_NUMEX && num_languages > 0) { + char *numex_normalized = normalize_replace_numex(str, num_languages, languages); + if (numex_normalized != NULL) { + if (normalized_allocated) { + free(normalized); + } + normalized = 
numex_normalized; + str = normalized; + normalized_allocated = true; + } + + } + return normalized; } +char *normalize_string_utf8(char *str, uint64_t options) { + return normalize_string_utf8_languages(str, options, 0, NULL); +} -char *normalize_string_latin(char *str, size_t len, uint64_t options) { + +char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages) { char *latin_transliterator = LATIN_ASCII; if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) { latin_transliterator = LATIN_ASCII_SIMPLE; @@ -90,9 +123,9 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) { char *utf8_normalized; if (transliterated == NULL) { - utf8_normalized = normalize_string_utf8(str, options); + utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages); } else { - utf8_normalized = normalize_string_utf8(transliterated, options); + utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages); free(transliterated); transliterated = NULL; } @@ -100,7 +133,11 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) { return utf8_normalized; } -void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) { +char *normalize_string_latin(char *str, size_t len, uint64_t options) { + return normalize_string_latin_languages(str, len, options, 0, NULL); +} + +void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options, size_t num_languages, char **languages) { char *transliterated = NULL; char *utf8_normalized = NULL; @@ -114,7 +151,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t if (options & NORMALIZE_STRING_LATIN_ASCII) { transliterated = transliterate(latin_transliterator, str, len); if (transliterated != NULL) { - utf8_normalized = normalize_string_utf8(transliterated, options); + utf8_normalized = 
normalize_string_utf8_languages(transliterated, options, num_languages, languages); free(transliterated); transliterated = NULL; } @@ -127,7 +164,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t } char *str_copy = strndup(str, len); - utf8_normalized = normalize_string_utf8(str_copy, options); + utf8_normalized = normalize_string_utf8_languages(str_copy, options, num_languages, languages); free(str_copy); if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) { @@ -150,9 +187,9 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t if (prev_string != NULL) { free(prev_string); } - } + string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) { size_t len = strlen(str); string_tree_t *tree = string_tree_new_size(len); @@ -161,15 +198,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu khash_t(int_set) *scripts = kh_init(int_set); char *utf8_normalized = NULL; - - char *ptr = str; + char *numex_replaced = NULL; script_t script; char *trans_name = NULL; char *lang; - bool transliterate_latin = false; + char *ptr = str; + + bool have_latin_transliterator = false; while (consumed < len) { string_script_t script_span = get_string_script(ptr, len - consumed); script = script_span.script; @@ -182,12 +220,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu if (html_escaped != NULL) { str = html_escaped; } - utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE); + // Mask these flags off; XOR would switch on any flag the caller had left unset + options &= ~(NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_STRIP_ACCENTS | NORMALIZE_STRING_LATIN_ASCII); + + utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages); if (utf8_normalized != NULL) { if (html_escaped != NULL) { free(html_escaped); html_escaped = NULL; } + string_tree_add_string(tree, utf8_normalized); 
string_tree_finalize_token(tree); free(utf8_normalized); @@ -200,22 +242,22 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu log_debug("script_len=%zu\n", script_len); - if (script == SCRIPT_LATIN && num_languages > 0 && !transliterate_latin) { + if (script == SCRIPT_LATIN && num_languages > 0 && !have_latin_transliterator) { for (size_t i = 0; i < num_languages; i++) { lang = languages[i]; foreach_transliterator(script, lang, trans_name, { if (!string_equals(trans_name, LATIN_ASCII)) { - transliterate_latin = true; + have_latin_transliterator = true; break; } }) - if (transliterate_latin) break; + if (have_latin_transliterator) break; } } - if ((script != SCRIPT_LATIN || transliterate_latin) && script_len > 0) { + if ((script != SCRIPT_LATIN || have_latin_transliterator) && script_len > 0) { int ret; khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret); if (ret < 0) { @@ -230,8 +272,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu ptr += script_len; } - if (!transliterate_latin) { - add_latin_alternatives(tree, str, len, options); + if (!have_latin_transliterator) { + add_latin_alternatives(tree, str, len, options, num_languages, languages); } size_t transliterate_scripts = kh_size(scripts); @@ -276,7 +318,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu prev = transliterated; }) - add_latin_alternatives(tree, transliterated, strlen(transliterated), options); + add_latin_alternatives(tree, transliterated, strlen(transliterated), options, num_languages, languages); if (transliterated != str) { free(transliterated); } @@ -287,8 +329,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu } - if (transliterate_latin) { - add_latin_alternatives(tree, str, len, options); + if (have_latin_transliterator) { + add_latin_alternatives(tree, str, len, options, num_languages, languages); } kh_destroy(int_set, scripts); diff --git 
a/src/normalize.h b/src/normalize.h index e18de053..d485f67f 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -33,6 +33,7 @@ As well as normalizations for individual string tokens: #include "string_utils.h" #include "utf8proc/utf8proc.h" #include "unicode_scripts.h" +#include "numex.h" #include "transliterate.h" #include "trie.h" #include "tokens.h" @@ -47,6 +48,7 @@ As well as normalizations for individual string tokens: #define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 #define NORMALIZE_STRING_COMPOSE 1 << 7 #define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8 +#define NORMALIZE_STRING_REPLACE_NUMEX 1 << 9 #define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 #define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1 From f3cf119e5848d14d48252ffff0501f207468f031 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 18 Apr 2017 21:41:24 -0400 Subject: [PATCH 3/4] [build] Makefile changes to support moving numeric expression parsing to normalize.c --- src/Makefile.am | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index ae17c8ce..6707d5aa 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -12,7 +12,7 @@ DEFAULT_INCLUDES = -I.. 
-I/usr/local/include CFLAGS = lib_LTLIBRARIES = libpostal.la -libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c numex.c utf8proc/utf8proc.c cmp/cmp.c normalize.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c +libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS) libpostal_la_CFLAGS = $(CFLAGS_O2) libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ @@ -34,7 +34,7 @@ libpostal_CFLAGS = $(CFLAGS_O3) bench_SOURCES = bench.c bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS) bench_CFLAGS = $(CFLAGS_O3) -address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c numex.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c +address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c 
averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c address_parser_LDADD = libscanner.la $(CBLAS_LIBS) address_parser_CFLAGS = $(CFLAGS_O3) @@ -44,21 +44,21 @@ build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_ut build_numex_table_CFLAGS = $(CFLAGS_O3) build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c build_trans_table_CFLAGS = $(CFLAGS_O3) -address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c +address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS) address_parser_train_CFLAGS = $(CFLAGS_O3) -address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c 
crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c +address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c numex.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS) address_parser_test_CFLAGS = $(CFLAGS_O3) -language_classifier_train_SOURCES = language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c +language_classifier_train_SOURCES = language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS) language_classifier_train_CFLAGS = $(CFLAGS_O3) -language_classifier_SOURCES = language_classifier_cli.c language_classifier.c 
language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c +language_classifier_SOURCES = language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c language_classifier_LDADD = libscanner.la $(CBLAS_LIBS) language_classifier_CFLAGS = $(CFLAGS_O3) -language_classifier_test_SOURCES = language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c +language_classifier_test_SOURCES = language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c numex.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS) language_classifier_test_CFLAGS = $(CFLAGS_O3) From 9cd3ec37f963797a66b0e67e56836c641195e0fe Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 18 Apr 2017 21:42:01 -0400 Subject: [PATCH 4/4] [build] rebuild numex table in Travis if either the configs change or numex_table_builder.c changes --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 38b5bf6e..6973a9d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ env: - secure: 
"OGNJ6Cj3trq4nASgm4BK331aij+FZ11St7/YF9rfxeQBwg4MCPH2+D0jvAULBHvJR7K2RmepX/FG5d4S+rtwKNGngg3ovPdd1MbwFltHpn5/KM+hxe7kCZx2+V9/FN+4YSyO0zSUDra6AXHOs72mfyrZoB3a36SS4lg2sAp33gU=" - GH_REF=github.com/openvenues/libpostal - DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/*.txt" | wc -l) - - NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex" | wc -l) + - NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep -E "(resources/numex|src/numex_table_builder.c)" | wc -l) - TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l) compiler: - clang