[expand] adding ability to expand Roman numerals with ordinal suffixes like IXe in French

This commit is contained in:
Al
2017-10-20 02:51:26 -04:00
parent b7eda37e44
commit 5c927e780f

View File

@@ -774,18 +774,32 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to
} }
static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) {
size_t token_digit_len = possible_ordinal_digit_len(str + token.offset, token.len);
size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang);
bool ret = false; int32_t unichr = 0;
const uint8_t *ptr = (const uint8_t *)str;
if (len_ordinal_suffix == 0 || token_digit_len == 0 || token_digit_len + len_ordinal_suffix < token.len) { if (len_ordinal_suffix > 0) {
return false; ssize_t start = 0;
} else if (len_ordinal_suffix == token.len && i > 0 && prev_token.len > 0) { size_t token_offset = token.offset;
size_t prev_token_digit_len = possible_ordinal_digit_len(str + prev_token.offset, prev_token.len); size_t token_len = token.len;
ret = prev_token_digit_len == prev_token.len;
if (len_ordinal_suffix < token.len) {
start = token.offset + token.len - len_ordinal_suffix;
token_offset = token.offset;
token_len = token.len - len_ordinal_suffix;
} else {
start = prev_token.offset + prev_token.len;
token_offset = prev_token.offset;
token_len = prev_token.len;
}
ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr);
if (prev_char_len <= 0) return false;
if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str + token_offset, token_len)) {
return false;
}
} else { } else {
ret = true; return false;
} }
cstring_array *strings = tree->strings; cstring_array *strings = tree->strings;
@@ -793,12 +807,10 @@ static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, ch
// add_normalized_strings_token won't be called a second time. // add_normalized_strings_token won't be called a second time.
add_normalized_strings_token(strings, str, token, options); add_normalized_strings_token(strings, str, token, options);
char_array *key = char_array_new_size(token.len - len_ordinal_suffix + 1); token_t normalized_token = token;
char_array_cat_len(key, str + token.offset, token.len - len_ordinal_suffix); normalized_token.len = token.len - len_ordinal_suffix;
char *expansion = char_array_get_string(key); add_normalized_strings_token(strings, str, normalized_token, options);
cstring_array_add_string(strings, expansion); return true;
char_array_destroy(key);
return ret;
} }
static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {