[expand/normalize] the split_alpha_from_numeric option now applies to both letter-first and digit-first tokens (e.g. A1 and 1A), since ordinal suffixes are now stripped out prior to normalization

This commit is contained in:
Al
2017-12-17 19:53:11 -05:00
parent 8b2a4d1ecf
commit a1db4d7734
2 changed files with 6 additions and 3 deletions

View File

@@ -104,7 +104,7 @@ void add_normalized_strings_token(cstring_array *strings, char *str, token_t tok
}
}
if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) {
if (is_numeric_token(token.type) && options.split_alpha_from_numeric) {
normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
normalize_token(strings, str, token, normalize_token_options);
normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;

View File

@@ -423,15 +423,18 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
next_char_len = utf8proc_iterate(ptr + char_len, len, &next_ch);
int next_cat = utf8proc_category(next_ch);
bool next_is_number = utf8_is_number(next_cat);
bool next_is_letter = utf8_is_letter(next_cat);
bool is_full_stop = ch == FULL_STOP_CODEPOINT;
bool is_hyphen_between_letter_and_number = is_hyphen && ((next_is_number && last_was_letter) || (next_is_letter && last_was_number));
if (is_hyphen && options & NORMALIZE_TOKEN_REPLACE_HYPHENS && (!(last_was_number && next_is_number) || options & NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS)) {
char_array_append(array, " ");
append_char = false;
} else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
append_char = false;
append_char = !is_hyphen_between_letter_and_number;
}
if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) {
@@ -452,7 +455,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
append_char = false;
}
if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) {
if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && ((last_was_letter && is_number) || (last_was_number && is_letter)) && !alpha_numeric_split) {
char_array_append(array, " ");
alpha_numeric_split = true;
}