[languages] adding replace_hyphens and split_alpha_from_numeric in language classifier input normalization
This commit is contained in:
@@ -11,8 +11,8 @@
|
|||||||
#define QUADGRAMS 4
|
#define QUADGRAMS 4
|
||||||
#define OCTAGRAMS 8
|
#define OCTAGRAMS 8
|
||||||
|
|
||||||
#define LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII
|
/* String-level normalization flags handed to normalize_string_latin() by
 * language_classifier_normalize_string(). The flags are OR-ed together and
 * the whole expansion is parenthesized so the macro stays a single operand
 * when it appears inside a larger expression (e.g. `opts & MACRO`, `~MACRO`).
 */
#define LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS (NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_REPLACE_HYPHENS)
|
||||||
#define LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
|
/* Token-level normalization flags for the language classifier, OR-ed
 * together. The expansion is parenthesized so the macro behaves as a single
 * operand when combined with other operators (e.g. `opts & MACRO`, `~MACRO`).
 */
#define LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS (NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
|
||||||
|
|
||||||
inline char *language_classifier_normalize_string(char *str) {
|
inline char *language_classifier_normalize_string(char *str) {
|
||||||
return normalize_string_latin(str, strlen(str), LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS);
|
return normalize_string_latin(str, strlen(str), LANGUAGE_CLASSIFIER_NORMALIZE_STRING_OPTIONS);
|
||||||
|
|||||||
Reference in New Issue
Block a user