From 81be8e771e9c2dc6cde2165af6354cbe987a4112 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 8 Jun 2015 21:32:01 -0400 Subject: [PATCH] [numex] regen data file. utf8_is_hyphen requires a character, all other methods use category --- src/numex_data.c | 27 +++++++++++++++------------ src/string_utils.c | 6 +++--- src/string_utils.h | 2 +- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/numex_data.c b/src/numex_data.c index 452b1d22..28583ef4 100644 --- a/src/numex_data.c +++ b/src/numex_data.c @@ -722,6 +722,8 @@ numex_rule_source_t numex_rules[] = { {"octante", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 80LL}}, {"quatre vignts", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 20, 80LL}}, {"quatrevignts", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 20, 80LL}}, + {"quatre vignt", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 20, 80LL}}, + {"quatrevignt", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 20, 80LL}}, {"nonante", (numex_rule_t){NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 90LL}}, {"cent", (numex_rule_t){NUMEX_LEFT_CONTEXT_MULTIPLY, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 100LL}}, {"cents", (numex_rule_t){NUMEX_LEFT_CONTEXT_MULTIPLY, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE, GENDER_NONE, CATEGORY_DEFAULT, 10, 100LL}}, @@ -1805,6 +1807,7 @@ ordinal_indicator_t ordinal_indicator_rules[] = { {"1", GENDER_FEMININE, CATEGORY_DEFAULT, "era"}, {"0", GENDER_FEMININE, CATEGORY_DEFAULT, "a"}, {"3", GENDER_FEMININE, CATEGORY_DEFAULT, "a"}, + {"3", GENDER_FEMININE, CATEGORY_DEFAULT, "ra"}, {"2", GENDER_FEMININE, CATEGORY_DEFAULT, "a"}, {"5", GENDER_FEMININE, CATEGORY_DEFAULT, "a"}, {"4", GENDER_FEMININE, CATEGORY_DEFAULT, "a"}, @@ -2182,16 +2185,16 @@ numex_language_source_t numex_languages[] = { {"da", 141, 42, 38, 0}, {"de", 183, 194, 38, 0}, {"en", 377, 70, 38, 13}, - {"es", 447, 133, 51, 28}, - {"fi", 580, 107, 79, 19}, - {"fr", 687, 192, 98, 80}, - {"it", 879, 163, 178, 20}, - {"ja", 1042, 18, 198, 0}, - {"la", 1060, 31, 198, 0}, - {"nl", 1091, 68, 198, 184}, - {"pl", 1159, 82, 382, 0}, - {"pt", 1241, 170, 382, 20}, - {"ru", 1411, 192, 402, 20}, - {"sv", 1603, 94, 422, 20}, - {"zh", 1697, 33, 442, 0} + {"es", 447, 133, 51, 29}, + {"fi", 580, 107, 80, 19}, + {"fr", 687, 194, 99, 80}, + {"it", 881, 163, 179, 20}, + {"ja", 1044, 18, 199, 0}, + {"la", 1062, 31, 199, 0}, + {"nl", 1093, 68, 199, 184}, + {"pl", 1161, 82, 383, 0}, + {"pt", 1243, 170, 383, 20}, + {"ru", 1413, 192, 403, 20}, + {"sv", 1605, 94, 423, 20}, + {"zh", 1699, 33, 443, 0} }; diff --git a/src/string_utils.c b/src/string_utils.c index 0af42682..3a114a87 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -142,8 +142,7 @@ error_free_output: return NULL; } -inline bool utf8_is_letter(int32_t ch) { - int cat = utf8proc_category(ch); +inline bool utf8_is_letter(int cat) { return cat == UTF8PROC_CATEGORY_LL || cat == UTF8PROC_CATEGORY_LU \ || cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LO \ || cat == UTF8PROC_CATEGORY_LM; @@ -160,7 +159,8 @@ inline bool utf8_is_letter_or_number(int cat) { || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO; } -inline bool utf8_is_hyphen(int cat) { +inline bool utf8_is_hyphen(int32_t ch) { + int cat = utf8proc_category(ch); return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212; } diff --git a/src/string_utils.h b/src/string_utils.h index a2be888b..edde2419 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -30,10 +30,10 @@ uint string_translate(char *str, size_t len, char *word_chars, char *word_repls, char *utf8_reversed_string(const char *s); // returns a copy, caller frees ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst); +bool utf8_is_hyphen(int32_t ch); bool utf8_is_letter(int cat); bool utf8_is_number(int cat); bool utf8_is_letter_or_number(int cat); -bool utf8_is_hyphen(int cat); bool utf8_is_punctuation(int cat); bool utf8_is_symbol(int cat); bool utf8_is_separator(int cat);