From 46141a6c36eca1652a691ae8b364ebd1d9122a76 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 2 Aug 2015 14:34:32 -0600 Subject: [PATCH] [normalize] Adding an option when normalizing tokens to split tokens of the form [\w]+[\.\-]?[\d]+ for cases like I35, CR123, R-66, RN.7, etc. where the alpha component is an expansion --- src/normalize.c | 26 +++++++++++++++++++++++++- src/normalize.h | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/normalize.c b/src/normalize.c index c48710ce..e24b6681 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -155,6 +155,9 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op size_t len = token.len; if (token.len == 0) return; + bool alpha_numeric_split = false; + char *append_if_not_numeric = NULL; + int32_t ch; ssize_t char_len; @@ -172,6 +175,9 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op int cat = utf8proc_category(ch); bool is_letter = utf8_is_letter(cat); + bool is_number = utf8_is_number(cat); + + bool is_full_stop = ch == FULL_STOP_CODEPOINT; if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) { cstring_array_append_string(array, " "); @@ -180,7 +186,25 @@ void normalize_token(cstring_array *array, char *str, token_t token, uint64_t op append_char = false; } - if (ch == FULL_STOP_CODEPOINT) { + if ((is_hyphen || is_full_stop) && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) { + ptr += char_len; + idx += char_len; + append_if_not_numeric = is_hyphen ? "-" : "."; + append_char = true; + continue; + } + + if (!is_number && append_if_not_numeric != NULL) { + cstring_array_append_string(array, append_if_not_numeric); + append_if_not_numeric = NULL; + } + + if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) { + cstring_array_append_string(array, " "); + alpha_numeric_split = true; + } + + if (is_full_stop) { if (options & NORMALIZE_TOKEN_DELETE_FINAL_PERIOD && idx == len - 1) { break; } diff --git a/src/normalize.h b/src/normalize.h index a09caf65..4c576b74 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -52,6 +52,7 @@ extern "C" { #define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3 #define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4 #define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5 +#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6 char *normalize_string_utf8(char *str, uint64_t options);