From f6c30778bfa4492b9911b57fca6fb84ac218f96b Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 23 Sep 2015 19:40:51 -0400 Subject: [PATCH] [normalize] New token normalization option for replacing digits with 'D' for masking numbers e.g. when learning patterns (so 1234 and 5678 both normalize to DDDD). Shouldn't be used by libpostal API, just by the feature extractors in the machine learning models. Also adding better possessive handling. --- src/normalize.c | 32 +++++++++++++++++--------------- src/normalize.h | 6 +++++- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/normalize.c b/src/normalize.c index 30dabfd3..45eb887c 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -42,14 +42,16 @@ char *normalize_string_utf8(char *str, uint64_t options) { char *normalize_string_latin(char *str, size_t len, uint64_t options) { char *transliterated = transliterate(LATIN_ASCII, str, len); - if (transliterated != NULL) { - char *utf8_normalized = normalize_string_utf8(transliterated, options); + char *utf8_normalized; + if (transliterated == NULL) { + utf8_normalized = normalize_string_utf8(str, options); + } else { + utf8_normalized = normalize_string_utf8(transliterated, options); free(transliterated); transliterated = NULL; - return utf8_normalized; } - return NULL; + return utf8_normalized; } void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) { @@ -164,7 +166,7 @@ string_tree_t *normalize_string(char *str, uint64_t options) { } -void append_normalized_token(char_array *array, char *str, token_t token, uint64_t options) { +void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options) { size_t idx = 0; uint8_t *ptr = (uint8_t *)str + token.offset; @@ -213,6 +215,11 @@ void append_normalized_token(char_array *array, char *str, token_t token, uint64 append_if_not_numeric = NULL; } + if (is_number && options & NORMALIZE_TOKEN_REPLACE_DIGITS) { + char_array_append(array, DIGIT_CHAR); + append_char = false; + } + if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) { char_array_append(array, " "); alpha_numeric_split = true; @@ -232,9 +239,7 @@ void append_normalized_token(char_array *array, char *str, token_t token, uint64 char this_char = *ptr; char next_char = *(ptr + 1); - if (this_char == '\'' && next_char == 's') { - break; - } else if (this_char == 's' && next_char == '\'') { + if ((this_char == '\'' && next_char == 's') || (this_char == 's' && next_char == '\'')) { char_array_append(array, "s"); break; } @@ -256,14 +261,11 @@ void append_normalized_token(char_array *array, char *str, token_t token, uint64 } + char_array_terminate(array); + } -void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options) { - +inline void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options) { cstring_array_start_token(array); - - append_normalized_token(array->str, str, token, options); - - cstring_array_terminate(array); - + add_normalized_token(array->str, str, token, options); } diff --git a/src/normalize.h b/src/normalize.h index 16b08af0..427924ac 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -52,13 +52,17 @@ As well as normalizations for individual string tokens: #define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4 #define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5 #define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6 +#define NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7 + +// Replace digits with capital D e.g. 10013 => DDDDD, intended for use with lowercased strings +#define DIGIT_CHAR "D" char *normalize_string_utf8(char *str, uint64_t options); char *normalize_string_latin(char *str, size_t len, uint64_t options); // Takes NORMALIZE_TOKEN_* options -void append_normalized_token(char_array *array, char *str, token_t token, uint64_t options); +void add_normalized_token(char_array *array, char *str, token_t token, uint64_t options); void normalize_token(cstring_array *array, char *str, token_t token, uint64_t options); // Takes NORMALIZE_STRING_* options