From 92051863ba807b73fc7945b573afaa3c54e71d6c Mon Sep 17 00:00:00 2001
From: Al
Date: Tue, 18 Apr 2017 17:20:02 -0400
Subject: [PATCH] [numex] adding ordinal suffixes themselves to the numex trie so they can be removed from strings

---
 src/numex.c               | 100 ++++++++++++++++++++++++++--
 src/numex.h               |  12 +++-
 src/numex_table_builder.c | 127 ++++++++++++++++++++---------------
 3 files changed, 177 insertions(+), 62 deletions(-)

diff --git a/src/numex.c b/src/numex.c
index 17013bbd..ff34fcfb 100644
--- a/src/numex.c
+++ b/src/numex.c
@@ -911,7 +911,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
     return results;
 }
 
-static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lang, gender_t gender, grammatical_category_t category, bool use_default_if_not_found) {
+static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lang, char *ns, gender_t gender, grammatical_category_t category, bool use_default_if_not_found) {
     numex_language_t *language = get_numex_language(lang);
 
     if (language == NULL) {
@@ -926,7 +926,7 @@ static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lan
         return NULL_PREFIX_RESULT;
     }
 
-    prefix = trie_get_prefix_from_index(trie, ORDINAL_NAMESPACE_PREFIX, ORDINAL_NAMESPACE_PREFIX_LEN, prefix.node_id, prefix.tail_pos);
+    prefix = trie_get_prefix_from_index(trie, ns, strlen(ns), prefix.node_id, prefix.tail_pos);
 
     if (prefix.node_id == NULL_NODE_ID) {
         return NULL_PREFIX_RESULT;
@@ -976,7 +976,7 @@ static trie_prefix_result_t get_ordinal_namespace_prefix(trie_t *trie, char *lan
     return prefix;
 }
 
-char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result) {
+static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, gender_t gender, grammatical_category_t category) {
     if (numex_table == NULL) {
         log_error(NUMEX_SETUP_ERROR);
         return NULL;
@@ -988,13 +988,13 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result
     }
 
     bool use_default_if_not_found = true;
-    trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, result.gender, result.category, use_default_if_not_found);
+    trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, ORDINAL_NAMESPACE_PREFIX, gender, category, use_default_if_not_found);
 
     if (prefix.node_id == NULL_NODE_ID) {
         return NULL;
     }
 
-    phrase_t phrase = trie_search_suffixes_from_index(trie, numeric_string, strlen(numeric_string), prefix.node_id);
+    phrase_t phrase = trie_search_suffixes_from_index(trie, numeric_string, len, prefix.node_id);
 
     if (phrase.len == 0) {
         return NULL;
@@ -1009,6 +1009,94 @@ char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result
 
 }
 
+// Returns the byte length of the run of ASCII digits at the start of str,
+// or 0 when str cannot be "digits followed by a suffix": it does not begin
+// with a digit, a digit reappears after a non-digit, or a decoded code point
+// is invalid. Scanning stops early at a NUL or undecodable byte.
+static size_t possible_ordinal_digit_len(char *str, size_t len) {
+    uint8_t *ptr = (uint8_t *)str;
+    size_t idx = 0;
+
+    bool is_digit = false;
+    bool last_was_digit = false;
+
+    int32_t ch;
+
+    size_t digit_len = 0;
+
+    while (idx < len) {
+        // Only the bytes that remain may be decoded; passing the full len
+        // would let the decoder read past the end of the buffer
+        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);
+
+        if (char_len <= 0) break;
+        if (ch == 0) break;
+        if (!(utf8proc_codepoint_valid(ch))) return 0;
+
+        // 0-9 only for this
+        is_digit = ch >= '0' && ch <= '9';
+
+        if ((idx == 0 && !is_digit) || (idx > 0 && is_digit && !last_was_digit)) {
+            return 0;
+        }
+
+        if (is_digit) {
+            digit_len += char_len;
+        }
+
+        ptr += char_len;
+        idx += char_len;
+        last_was_digit = is_digit;
+    }
+
+    return digit_len;
+}
+
+// Returns the byte length of the ordinal suffix on a token made of ASCII
+// digits followed by a suffix found for lang in the ordinal phrase namespace
+// (every gender/category is tried), or 0 when no such suffix is found.
+size_t ordinal_suffix_len(char *str, size_t len, char *lang) {
+    if (str == NULL || len == 0) {
+        return 0;
+    }
+
+    size_t ordinal_digit_len = possible_ordinal_digit_len(str, len);
+    if (ordinal_digit_len == 0) {
+        return 0;
+    }
+
+    if (numex_table == NULL) {
+        log_error(NUMEX_SETUP_ERROR);
+        return 0;
+    }
+
+    trie_t *trie = numex_table->trie;
+    if (trie == NULL) {
+        return 0;
+    }
+
+    bool use_default_if_not_found = false;
+
+    // Default (GENDER_NONE and CATEGORY_DEFAULT) are at the end of the enums, so iterate backward
+    for (int gender = NUM_GENDERS - 1; gender >= 0; gender--) {
+        for (int category = NUM_CATEGORIES - 1; category >= 0; category--) {
+            trie_prefix_result_t prefix = get_ordinal_namespace_prefix(trie, lang, ORDINAL_PHRASE_NAMESPACE_PREFIX, gender, category, use_default_if_not_found);
+
+            if (prefix.node_id == NULL_NODE_ID) {
+                continue;
+            }
+
+            phrase_t phrase = trie_search_suffixes_from_index(trie, str, len, prefix.node_id);
+
+            if (phrase.len == len - ordinal_digit_len) {
+                return len - ordinal_digit_len;
+            }
+        }
+    }
+
+    return 0;
+}
+
 char *replace_numeric_expressions(char *str, char *lang) {
     numex_result_array *results = convert_numeric_expressions(str, lang);
     if (results == NULL) return NULL;
@@ -1040,7 +1128,7 @@ char *replace_numeric_expressions(char *str, char *lang) {
         char_array_append(replacement, numeric_string);
 
         if (result.is_ordinal) {
-            char *ordinal_suffix = get_ordinal_suffix(numeric_string, lang, result);
+            char *ordinal_suffix = get_ordinal_suffix(numeric_string, strlen(numeric_string), lang, result.gender, result.category);
             if (ordinal_suffix != NULL) {
                 char_array_append(replacement, ordinal_suffix);
             }
diff --git a/src/numex.h b/src/numex.h
index 9d8d9f4a..c000ff9c 100644
--- a/src/numex.h
+++ b/src/numex.h
@@ -34,7 +34,8 @@ typedef enum {
     GENDER_MASCULINE,
     GENDER_FEMININE,
     GENDER_NEUTER,
-    GENDER_NONE
+    GENDER_NONE,
+    NUM_GENDERS
 } gender_t;
 
 #define CATEGORY_PLURAL_PREFIX "p"
@@ -42,7 +43,8 @@ typedef enum {
 
 typedef enum {
     CATEGORY_PLURAL,
-    CATEGORY_DEFAULT
+    CATEGORY_DEFAULT,
+    NUM_CATEGORIES
 } grammatical_category_t;
 
 typedef enum {
@@ -85,10 +87,14 @@ typedef struct numex_rule {
 VECTOR_INIT(numex_rule_array, numex_rule_t)
 
 #define ORDINAL_NAMESPACE_CHAR "o"
+#define ORDINAL_PHRASE_NAMESPACE_CHAR "p"
 
 #define ORDINAL_NAMESPACE_PREFIX NAMESPACE_SEPARATOR_CHAR ORDINAL_NAMESPACE_CHAR NAMESPACE_SEPARATOR_CHAR
 #define ORDINAL_NAMESPACE_PREFIX_LEN strlen(ORDINAL_NAMESPACE_PREFIX)
 
+#define ORDINAL_PHRASE_NAMESPACE_PREFIX NAMESPACE_SEPARATOR_CHAR ORDINAL_PHRASE_NAMESPACE_CHAR NAMESPACE_SEPARATOR_CHAR
+#define ORDINAL_PHRASE_NAMESPACE_PREFIX_LEN strlen(ORDINAL_PHRASE_NAMESPACE_PREFIX)
+
 typedef struct ordinal_indicator {
     char *key;
     gender_t gender;
@@ -142,7 +148,7 @@ VECTOR_INIT(numex_result_array, numex_result_t)
 
 char *replace_numeric_expressions(char *str, char *lang);
 numex_result_array *convert_numeric_expressions(char *str, char *lang);
-char *get_ordinal_suffix(char *numeric_string, char *lang, numex_result_t result);
+size_t ordinal_suffix_len(char *str, size_t len, char *lang);
 
 bool numex_table_write(FILE *file);
 bool numex_table_save(char *filename);
diff --git a/src/numex_table_builder.c b/src/numex_table_builder.c
index 624a98c5..be9f8a14 100644
--- a/src/numex_table_builder.c
+++ b/src/numex_table_builder.c
@@ -92,72 +92,93 @@ int main(int argc, char **argv) {
         }
 
         for (j = ordinal_indicator_index; j < ordinal_indicator_index + num_ordinal_indicators; j++) {
-            value = numex_table->ordinal_indicators->n;
-            ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j];
+            for (int ordinal_phrases = 0; ordinal_phrases <= 1; ordinal_phrases++) {
+                value = numex_table->ordinal_indicators->n;
+                ordinal_indicator_t ordinal_source = ordinal_indicator_rules[j];
 
-            if (ordinal_source.key == NULL) {
-                log_error("ordinal source key was NULL at index %d\n", j);
-                exit(EXIT_FAILURE);
-            }
+                if (ordinal_source.key == NULL) {
+                    log_error("ordinal source key was NULL at index %d\n", j);
+                    exit(EXIT_FAILURE);
+                }
 
-            char *ordinal_indicator_key = strdup(ordinal_source.key);
-            if (ordinal_indicator_key == NULL) {
-                log_error("Error in strdup\n");
-                exit(EXIT_FAILURE);
-            }
-
-            char *suffix = NULL;
-            if (ordinal_source.suffix != NULL) {
-                suffix = strdup(ordinal_source.suffix);
-                if (suffix == NULL) {
+                char *ordinal_indicator_key = strdup(ordinal_source.key);
+                if (ordinal_indicator_key == NULL) {
                     log_error("Error in strdup\n");
                     exit(EXIT_FAILURE);
                 }
-            }
-            ordinal_indicator_t *ordinal = ordinal_indicator_new(ordinal_indicator_key, ordinal_source.gender, ordinal_source.category, suffix);
-            ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal);
 
-            char_array_clear(key);
-            char_array_cat(key, lang);
-            char_array_cat(key, ORDINAL_NAMESPACE_PREFIX);
+                char *suffix = NULL;
+                if (ordinal_source.suffix != NULL) {
+                    suffix = strdup(ordinal_source.suffix);
+                    if (suffix == NULL) {
+                        log_error("Error in strdup\n");
+                        exit(EXIT_FAILURE);
+                    }
+                }
 
-            switch (ordinal_source.gender) {
-                case GENDER_MASCULINE:
-                    char_array_cat(key, GENDER_MASCULINE_PREFIX);
-                    break;
-                case GENDER_FEMININE:
-                    char_array_cat(key, GENDER_FEMININE_PREFIX);
-                    break;
-                case GENDER_NEUTER:
-                    char_array_cat(key, GENDER_NEUTER_PREFIX);
-                    break;
-                case GENDER_NONE:
-                default:
-                    char_array_cat(key, GENDER_NONE_PREFIX);
-            }
+                char_array_clear(key);
+                char_array_cat(key, lang);
 
-            switch (ordinal_source.category) {
-                case CATEGORY_PLURAL:
-                    char_array_cat(key, CATEGORY_PLURAL_PREFIX);
-                    break;
-                case CATEGORY_DEFAULT:
-                default:
-                    char_array_cat(key, CATEGORY_DEFAULT_PREFIX);
+                if (!ordinal_phrases) {
+                    ordinal_indicator_t *ordinal = ordinal_indicator_new(ordinal_indicator_key, ordinal_source.gender, ordinal_source.category, suffix);
+                    ordinal_indicator_array_push(numex_table->ordinal_indicators, ordinal);
 
-            }
+                    char_array_cat(key, ORDINAL_NAMESPACE_PREFIX);
+                } else {
+                    // The phrase pass never hands these copies to an indicator, so release them here
+                    free(ordinal_indicator_key);
+                    free(suffix);
+
+                    // Without suffix text there is nothing to index for this rule
+                    if (ordinal_source.suffix == NULL) {
+                        continue;
+                    }
+
+                    char_array_cat(key, ORDINAL_PHRASE_NAMESPACE_PREFIX);
+                }
 
-            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
+                switch (ordinal_source.gender) {
+                    case GENDER_MASCULINE:
+                        char_array_cat(key, GENDER_MASCULINE_PREFIX);
+                        break;
+                    case GENDER_FEMININE:
+                        char_array_cat(key, GENDER_FEMININE_PREFIX);
+                        break;
+                    case GENDER_NEUTER:
+                        char_array_cat(key, GENDER_NEUTER_PREFIX);
+                        break;
+                    case GENDER_NONE:
+                    default:
+                        char_array_cat(key, GENDER_NONE_PREFIX);
+                }
 
-            char *reversed = utf8_reversed_string(ordinal_source.key);
-            char_array_cat(key, reversed);
-            free(reversed);
+                switch (ordinal_source.category) {
+                    case CATEGORY_PLURAL:
+                        char_array_cat(key, CATEGORY_PLURAL_PREFIX);
+                        break;
+                    case CATEGORY_DEFAULT:
+                    default:
+                        char_array_cat(key, CATEGORY_DEFAULT_PREFIX);
 
-            char *str_key = char_array_get_string(key);
+                }
 
-            if (trie_get(numex_table->trie, str_key) == NULL_NODE_ID) {
-                trie_add(numex_table->trie, str_key, value);
-            } else {
-                log_warn("Key exists: %s, skipping\n", str_key);
+                char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
+
+                char *key_str = ordinal_source.key;
+
+                if (ordinal_phrases) {
+                    key_str = ordinal_source.suffix;
+                }
+
+                char *reversed = utf8_reversed_string(key_str);
+                char_array_cat(key, reversed);
+                free(reversed);
+
+                char *str_key = char_array_get_string(key);
+
+                if (trie_get(numex_table->trie, str_key) == NULL_NODE_ID) {
+                    trie_add(numex_table->trie, str_key, value);
+                }
             }
         }
 