[numex] changing is_roman_numeral to is_likely_roman_numeral to get rid of most of the false positives like "La" in Spanish, which could be read as L (= 50) + the ordinal suffix "a" but in practice never means that. Roman numerals of two characters or fewer (whether on their own like "DC" or "MD", or attached to a potential ordinal suffix like "Ce" in French) will be ignored unless they're composed only of the more likely, smaller Roman numerals I, V, and X, so VI, IX, etc. are expanded as Roman numerals but LI is not.

This commit is contained in:
Al
2017-12-27 19:27:23 -05:00
parent b17b2bdcc4
commit b4fdc51bf9
2 changed files with 98 additions and 59 deletions

View File

@@ -439,7 +439,7 @@ bool numex_table_read(FILE *f) {
log_debug("read num_languages = %" PRIu64 "\n", num_languages);
int i = 0;
size_t i = 0;
numex_language_t *language;
@@ -541,7 +541,7 @@ bool numex_table_write(FILE *f) {
numex_rule_t rule;
int i = 0;
size_t i = 0;
for (i = 0; i < num_rules; i++) {
rule = numex_table->rules->a[i];
@@ -1137,23 +1137,115 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) {
return 0;
}
/*
 * Returns true when c is one of the letters i, v, x, l, c, d, m in either
 * lower or upper case, and false for every other character.
 */
static inline bool is_roman_numeral_char(char c) {
    switch (c) {
        case 'i': case 'I':
        case 'v': case 'V':
        case 'x': case 'X':
        case 'l': case 'L':
        case 'c': case 'C':
        case 'd': case 'D':
        case 'm': case 'M':
            return true;
        default:
            return false;
    }
}
/*
 * Returns true only for the letters i, v and x (either case). This is the
 * narrower character set applied to tokens of length <= 2 by
 * is_likely_roman_numeral_len.
 */
static inline bool is_likely_single_roman_numeral_char(char c) {
    switch (c) {
        case 'i': case 'I':
        case 'v': case 'V':
        case 'x': case 'X':
            return true;
        default:
            return false;
    }
}
/*
 * Checks whether the first len bytes of str, run through
 * convert_numeric_expressions with LATIN_LANGUAGE_CODE, yield exactly one
 * result whose length equals len.
 *
 * The check runs on a strndup'd copy of at most len bytes, which is freed
 * before returning. Returns false if the copy cannot be allocated or if the
 * conversion returns NULL.
 */
bool is_valid_roman_numeral(char *str, size_t len) {
    char *prefix = strndup(str, len);
    if (prefix == NULL) {
        return false;
    }

    bool valid = false;
    numex_result_array *parsed = convert_numeric_expressions(prefix, LATIN_LANGUAGE_CODE);
    if (parsed != NULL) {
        /* Valid only when a single expression covers the whole token. */
        valid = (parsed->n == 1 && parsed->a[0].len == len);
        numex_result_array_destroy(parsed);
    }

    free(prefix);
    return valid;
}
/*
 * Decides whether the first len bytes of str should be treated as a Roman
 * numeral.
 *
 * Tokens with len <= 2 may only contain characters accepted by
 * is_likely_single_roman_numeral_char; longer tokens may contain any
 * character accepted by is_roman_numeral_char. Scanning stops early at a
 * NUL byte. If any scanned character is rejected, or no character was
 * scanned at all, the result is false; otherwise the result is whatever
 * is_valid_roman_numeral(str, len) returns.
 */
bool is_likely_roman_numeral_len(char *str, size_t len) {
    const bool short_token = (len <= 2);
    bool found_numeral_char = false;

    for (size_t pos = 0; pos < len && str[pos] != '\0'; pos++) {
        char ch = str[pos];
        /* Short tokens are held to the stricter character set. */
        bool acceptable = short_token ? is_likely_single_roman_numeral_char(ch)
                                      : is_roman_numeral_char(ch);
        if (!acceptable) {
            return false;
        }
        found_numeral_char = true;
    }

    if (!found_numeral_char) {
        return false;
    }
    return is_valid_roman_numeral(str, len);
}
/*
 * Whole-string variant of is_likely_roman_numeral_len: measures str with
 * strlen (so str must be NUL-terminated) and checks that many bytes.
 */
inline bool is_likely_roman_numeral(char *str) {
    size_t len = strlen(str);
    return is_likely_roman_numeral_len(str, len);
}
char *replace_numeric_expressions(char *str, char *lang) {
numex_result_array *results = convert_numeric_expressions(str, lang);
if (results == NULL) return NULL;
bool is_latin = string_equals(lang, LATIN_LANGUAGE_CODE);
size_t len = strlen(str);
char_array *replacement = char_array_new_size(len);
size_t start = 0;
size_t end = 0;
for (int i = 0; i < results->n; i++) {
numex_result_t result = results->a[i];
bool have_valid_numex = false;
numex_result_t result = NULL_NUMEX_RESULT;
for (size_t i = 0; i < results->n; i++) {
result = results->a[i];
if (result.len == 0) {
continue;
}
if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) {
continue;
}
have_valid_numex = true;
break;
}
if (!have_valid_numex) {
numex_result_array_destroy(results);
return NULL;
}
for (size_t i = 0; i < results->n; i++) {
result = results->a[i];
if (result.len == 0) {
continue;
}
if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) {
continue;
}
end = result.start;
log_debug("lang=%s, start = %zu, len = %zu, value=%" PRId64 "\n", lang, result.start, result.len, result.value);
@@ -1184,56 +1276,3 @@ char *replace_numeric_expressions(char *str, char *lang) {
return char_array_to_string(replacement);
}
/*
 * Returns true when c is one of the letters i, v, x, l, c, d, m in either
 * lower or upper case, and false for every other character.
 */
static inline bool is_roman_numeral_char(char c) {
    switch (c) {
        case 'i': case 'I':
        case 'v': case 'V':
        case 'x': case 'X':
        case 'l': case 'L':
        case 'c': case 'C':
        case 'd': case 'D':
        case 'm': case 'M':
            return true;
        default:
            return false;
    }
}
/*
 * Checks whether the first len bytes of str, run through
 * convert_numeric_expressions with LATIN_LANGUAGE_CODE, yield exactly one
 * result whose length equals len.
 *
 * The check runs on a strndup'd copy of at most len bytes, which is freed
 * before returning. Returns false if the copy cannot be allocated or if the
 * conversion returns NULL.
 */
bool is_valid_roman_numeral(char *str, size_t len) {
    char *prefix = strndup(str, len);
    if (prefix == NULL) {
        return false;
    }

    bool valid = false;
    numex_result_array *parsed = convert_numeric_expressions(prefix, LATIN_LANGUAGE_CODE);
    if (parsed != NULL) {
        /* Valid only when a single expression covers the whole token. */
        valid = (parsed->n == 1 && parsed->a[0].len == len);
        numex_result_array_destroy(parsed);
    }

    free(prefix);
    return valid;
}
/*
 * Decides whether the first len bytes of str form a Roman numeral.
 *
 * Every scanned character must be accepted by is_roman_numeral_char;
 * scanning stops early at a NUL byte. Returns false if a character is
 * rejected or if nothing was scanned (len == 0 or str starts with NUL);
 * otherwise returns the result of is_valid_roman_numeral(str, len).
 */
bool is_roman_numeral_len(char *str, size_t len) {
    /* The loop index is declared in the for statement only; the former
     * function-scope `i` was never read and was shadowed by it. */
    bool seen_roman = false;
    for (size_t i = 0; i < len; i++) {
        char c = str[i];
        if (c == '\0') break;
        if (!is_roman_numeral_char(c)) {
            return false;
        }
        seen_roman = true;
    }
    return seen_roman && is_valid_roman_numeral(str, len);
}
/*
 * Whole-string variant of is_roman_numeral_len: measures str with strlen
 * (so str must be NUL-terminated) and checks that many bytes.
 */
inline bool is_roman_numeral(char *str) {
    size_t len = strlen(str);
    return is_roman_numeral_len(str, len);
}

View File

@@ -152,8 +152,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang);
size_t ordinal_suffix_len(char *s, size_t len, char *lang);
size_t possible_ordinal_digit_len(char *str, size_t len);
bool is_roman_numeral(char *str);
bool is_roman_numeral_len(char *str, size_t len);
bool is_likely_roman_numeral(char *str);
bool is_likely_roman_numeral_len(char *str, size_t len);
bool numex_table_write(FILE *file);
bool numex_table_save(char *filename);