[numex] Regenerate data file. utf8_is_hyphen now requires a character (code point); all other utf8_is_* methods take a category.

This commit is contained in:
Al
2015-06-08 21:32:01 -04:00
parent c1d0afa52c
commit 81be8e771e
3 changed files with 19 additions and 16 deletions

View File

@@ -722,6 +722,8 @@ numex_rule_source_t numex_rules[] = {
{"octante", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 80LL}},
{"quatre vignts", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 20, 80LL}},
{"quatrevignts", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 20, 80LL}},
{"quatre vignt", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 20, 80LL}},
{"quatrevignt", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 20, 80LL}},
{"nonante", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 90LL}},
{"cent", (numex_rule_t){NUMEX_LEFT_CONTEXT_MULTIPLY, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 100LL}},
{"cents", (numex_rule_t){NUMEX_LEFT_CONTEXT_MULTIPLY, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 100LL}},
@@ -1805,6 +1807,7 @@ ordinal_indicator_t ordinal_indicator_rules[] = {
{"1", GENDER_FEMININE, CATEGORY_DEFAULT, "era"},
{"0", GENDER_FEMININE, CATEGORY_DEFAULT, "a"},
{"3", GENDER_FEMININE, CATEGORY_DEFAULT, "a"},
{"3", GENDER_FEMININE, CATEGORY_DEFAULT, "ra"},
{"2", GENDER_FEMININE, CATEGORY_DEFAULT, "a"},
{"5", GENDER_FEMININE, CATEGORY_DEFAULT, "a"},
{"4", GENDER_FEMININE, CATEGORY_DEFAULT, "a"},
@@ -2182,16 +2185,16 @@ numex_language_source_t numex_languages[] = {
{"da", 141, 42, 38, 0},
{"de", 183, 194, 38, 0},
{"en", 377, 70, 38, 13},
{"es", 447, 133, 51, 28},
{"fi", 580, 107, 79, 19},
{"fr", 687, 192, 98, 80},
{"it", 879, 163, 178, 20},
{"ja", 1042, 18, 198, 0},
{"la", 1060, 31, 198, 0},
{"nl", 1091, 68, 198, 184},
{"pl", 1159, 82, 382, 0},
{"pt", 1241, 170, 382, 20},
{"ru", 1411, 192, 402, 20},
{"sv", 1603, 94, 422, 20},
{"zh", 1697, 33, 442, 0}
{"es", 447, 133, 51, 29},
{"fi", 580, 107, 80, 19},
{"fr", 687, 194, 99, 80},
{"it", 881, 163, 179, 20},
{"ja", 1044, 18, 199, 0},
{"la", 1062, 31, 199, 0},
{"nl", 1093, 68, 199, 184},
{"pl", 1161, 82, 383, 0},
{"pt", 1243, 170, 383, 20},
{"ru", 1413, 192, 403, 20},
{"sv", 1605, 94, 423, 20},
{"zh", 1699, 33, 443, 0}
};

View File

@@ -142,8 +142,7 @@ error_free_output:
return NULL;
}
inline bool utf8_is_letter(int32_t ch) {
int cat = utf8proc_category(ch);
inline bool utf8_is_letter(int cat) {
return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \
|| cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \
|| cat == UTF8PROC_CATEGORY_LM;
@@ -160,7 +159,8 @@ inline bool utf8_is_letter_or_number(int cat) {
|| cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO;
}
inline bool utf8_is_hyphen(int cat) {
inline bool utf8_is_hyphen(int32_t ch) {
int cat = utf8proc_category(ch);
return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212;
}

View File

@@ -30,10 +30,10 @@ uint string_translate(char *str, size_t len, char *word_chars, char *word_repls,
char *utf8_reversed_string(const char *s); // returns a copy, caller frees
ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst);
bool utf8_is_hyphen(int32_t ch);
bool utf8_is_letter(int cat);
bool utf8_is_number(int cat);
bool utf8_is_letter_or_number(int cat);
bool utf8_is_hyphen(int cat);
bool utf8_is_punctuation(int cat);
bool utf8_is_symbol(int cat);
bool utf8_is_separator(int cat);