From b4fdc51bf952eb9eece330f8799ac032519522b1 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Dec 2017 19:27:23 -0500 Subject: [PATCH] [numex] changing is_roman_numeral to is_likely_roman_numeral to get rid of most of the false positives like "La" in Spanish, which could be L (=50) + the ordinal suffix "a", but in practice never means that. Roman numerals of two characters or fewer (whether on their own like "DC" or "MD", or attached to a potential ordinal suffix like "Ce" in French) will be ignored unless they're composed only of the more likely, smaller Roman numerals I, V, and X, so VI, IX, etc. are expanded as Roman numerals but LI is not. --- src/numex.c | 153 ++++++++++++++++++++++++++++++++-------------------- src/numex.h | 4 +- 2 files changed, 98 insertions(+), 59 deletions(-) diff --git a/src/numex.c b/src/numex.c index 2d4161d9..7f4ef630 100644 --- a/src/numex.c +++ b/src/numex.c @@ -439,7 +439,7 @@ bool numex_table_read(FILE *f) { log_debug("read num_languages = %" PRIu64 "\n", num_languages); - int i = 0; + size_t i = 0; numex_language_t *language; @@ -541,7 +541,7 @@ bool numex_table_write(FILE *f) { numex_rule_t rule; - int i = 0; + size_t i = 0; for (i = 0; i < num_rules; i++) { rule = numex_table->rules->a[i]; @@ -1137,23 +1137,115 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { return 0; } + + +static inline bool is_roman_numeral_char(char c) { + return (c == 'i' || + c == 'v' || + c == 'x' || + c == 'l' || + c == 'c' || + c == 'd' || + c == 'm' || + c == 'I' || + c == 'V' || + c == 'X' || + c == 'L' || + c == 'C' || + c == 'D' || + c == 'M'); +} + +static inline bool is_likely_single_roman_numeral_char(char c) { + return (c == 'i' || + c == 'v' || + c == 'x' || + c == 'I' || + c == 'V' || + c == 'X'); +} + + +bool is_valid_roman_numeral(char *str, size_t len) { + char *copy = strndup(str, len); + if (copy == NULL) return false; + + numex_result_array *results = convert_numeric_expressions(copy, 
LATIN_LANGUAGE_CODE); + if (results == NULL) { + free(copy); + return false; + } + + bool ret = results->n == 1 && results->a[0].len == len; + numex_result_array_destroy(results); + free(copy); + return ret; +} + +bool is_likely_roman_numeral_len(char *str, size_t len) { + bool seen_roman = false; + for (size_t i = 0; i < len; i++) { + char c = *(str + i); + if (c == 0) break; + if ((len <= 2 && is_likely_single_roman_numeral_char(c)) || (len > 2 && is_roman_numeral_char(c))) { + seen_roman = true; + } else { + return false; + } + } + + return seen_roman && is_valid_roman_numeral(str, len); +} + +inline bool is_likely_roman_numeral(char *str) { + return is_likely_roman_numeral_len(str, strlen(str)); +} + char *replace_numeric_expressions(char *str, char *lang) { numex_result_array *results = convert_numeric_expressions(str, lang); if (results == NULL) return NULL; + bool is_latin = string_equals(lang, LATIN_LANGUAGE_CODE); + size_t len = strlen(str); char_array *replacement = char_array_new_size(len); size_t start = 0; size_t end = 0; - for (int i = 0; i < results->n; i++) { - numex_result_t result = results->a[i]; + bool have_valid_numex = false; + numex_result_t result = NULL_NUMEX_RESULT; + + for (size_t i = 0; i < results->n; i++) { + result = results->a[i]; if (result.len == 0) { continue; } + if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) { + continue; + } + have_valid_numex = true; + break; + } + + if (!have_valid_numex) { + numex_result_array_destroy(results); + return NULL; + } + + for (size_t i = 0; i < results->n; i++) { + result = results->a[i]; + + if (result.len == 0) { + continue; + } + + if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) { + continue; + } + end = result.start; log_debug("lang=%s, start = %zu, len = %zu, value=%" PRId64 "\n", lang, result.start, result.len, result.value); @@ -1184,56 +1276,3 @@ char *replace_numeric_expressions(char *str, 
char *lang) { return char_array_to_string(replacement); } - -static inline bool is_roman_numeral_char(char c) { - return (c == 'i' || - c == 'v' || - c == 'x' || - c == 'l' || - c == 'c' || - c == 'd' || - c == 'm' || - c == 'I' || - c == 'V' || - c == 'X' || - c == 'L' || - c == 'C' || - c == 'D' || - c == 'M'); -} - -bool is_valid_roman_numeral(char *str, size_t len) { - char *copy = strndup(str, len); - if (copy == NULL) return false; - - numex_result_array *results = convert_numeric_expressions(copy, LATIN_LANGUAGE_CODE); - if (results == NULL) { - free(copy); - return false; - } - - bool ret = results->n == 1 && results->a[0].len == len; - numex_result_array_destroy(results); - free(copy); - return ret; -} - -bool is_roman_numeral_len(char *str, size_t len) { - size_t i = 0; - bool seen_roman = false; - for (size_t i = 0; i < len; i++) { - char c = *(str + i); - if (c == 0) break; - if (is_roman_numeral_char(c)) { - seen_roman = true; - } else { - return false; - } - } - - return seen_roman && is_valid_roman_numeral(str, len); -} - -inline bool is_roman_numeral(char *str) { - return is_roman_numeral_len(str, strlen(str)); -} diff --git a/src/numex.h b/src/numex.h index f4536bb7..1a0d89b7 100644 --- a/src/numex.h +++ b/src/numex.h @@ -152,8 +152,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang); size_t ordinal_suffix_len(char *s, size_t len, char *lang); size_t possible_ordinal_digit_len(char *str, size_t len); -bool is_roman_numeral(char *str); -bool is_roman_numeral_len(char *str, size_t len); +bool is_likely_roman_numeral(char *str); +bool is_likely_roman_numeral_len(char *str, size_t len); bool numex_table_write(FILE *file); bool numex_table_save(char *filename);