[normalize] Adding an option when normalizing tokens to split tokens of the form [\w]+[\.\-]?[\d]+ for cases like I35, CR123, R-66, RN.7, etc. where the alpha component is an expansion

This commit is contained in:
Al
2015-08-02 14:34:32 -06:00
parent f10dd49c58
commit 46141a6c36
2 changed files with 26 additions and 1 deletions

View File

@@ -52,6 +52,7 @@ extern "C" {
/*
 * Single-bit NORMALIZE_TOKEN_* option flags (bits 3 through 6).
 *
 * Every replacement list is fully parenthesized. Without the parentheses an
 * adjacent operator that binds tighter than `<<` regroups the expansion:
 * `opts & ~FLAG` would become `opts & ((~1) << n)` and `FLAG + 1` would
 * become `1 << (n + 1)`.
 *
 * NOTE(review): presumably these are OR-ed together into a token
 * normalization options bitmask -- confirm against the callers.
 */
#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS (1 << 3)
#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES (1 << 4)
#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE (1 << 5)
/* Per the introducing commit: split tokens of the form [\w]+[\.\-]?[\d]+
 * (e.g. I35, CR123, R-66, RN.7) into their alpha and numeric parts. */
#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC (1 << 6)
/*
 * Prototype for normalize_string_utf8: takes a character string and a 64-bit
 * options value and returns a char pointer.
 *
 * NOTE(review): `options` is presumably a bitmask of normalization flags;
 * which flag group applies is not visible from here -- confirm.
 * NOTE(review): ownership of the returned pointer (and whether `str` is
 * modified) is not visible from this declaration -- confirm against the
 * implementation.
 */
char *normalize_string_utf8(char *str, uint64_t options);