[normalize] Permuting transliterators only once on the entire string rather than at each script break (so the number of permutations is bounded and can't get huge). Fixing some spacing issues. Adding a method to check for an alpha+numeric token in normalization.

This commit is contained in:
Al
2016-02-08 01:16:47 -05:00
parent 393fd7e0f3
commit afd5844f21
2 changed files with 119 additions and 57 deletions

View File

@@ -66,6 +66,8 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options);
// Token-level normalization declarations. Both functions take the source
// string, a token_t, and a uint64_t options value; the first takes a
// char_array as its destination argument, the second a cstring_array.
// NOTE(review): which option flags are accepted here is not visible in this
// header excerpt -- confirm against the implementation.
void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options);
void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options);
// Boolean predicate over a token of str.
// NOTE(review): from the name, presumably true when a numeric token begins
// with alphabetic characters (an alpha+numeric token) -- confirm against the
// implementation.
bool numeric_starts_with_alpha(char *str, token_t token);
// Takes NORMALIZE_STRING_* options
string_tree_t *normalize_string(char *str, uint64_t options);
// NOTE(review): presumably the same as normalize_string, but additionally
// given num_languages entries in languages -- confirm how they are used.
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages);