[expansion] Fixing case where non-ideographic tokens like # can potentially be concatenated with surrounding tokens and should be normalized with whitespace in between

This commit is contained in:
Al
2015-12-07 19:18:46 -05:00
parent f5739dd42b
commit d35f519629

View File

@@ -33,6 +33,10 @@ inline bool is_word_token(uint16_t type) {
return type == WORD || type == ABBREVIATION || type == ACRONYM || type == IDEOGRAPHIC_CHAR || type == HANGUL_SYLLABLE; return type == WORD || type == ABBREVIATION || type == ACRONYM || type == IDEOGRAPHIC_CHAR || type == HANGUL_SYLLABLE;
} }
/* Returns true when the token type is IDEOGRAPHIC_CHAR, HANGUL_SYLLABLE or
 * IDEOGRAPHIC_NUMBER, and false for every other type. */
inline bool is_ideographic(uint16_t type) {
    if (type == IDEOGRAPHIC_CHAR) {
        return true;
    }
    if (type == HANGUL_SYLLABLE) {
        return true;
    }
    return type == IDEOGRAPHIC_NUMBER;
}
inline bool is_numeric_token(uint16_t type) { inline bool is_numeric_token(uint16_t type) {
return type == NUMERIC; return type == NUMERIC;
} }
@@ -163,12 +167,14 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
int start = 0; int start = 0;
int end = 0; int end = 0;
phrase_t phrase = NULL_PHRASE;
key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN); key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);
for (int i = 0; i < phrases->n; i++) { for (int i = 0; i < phrases->n; i++) {
phrase_lang = phrases->a[i]; phrase_lang = phrases->a[i];
phrase_t phrase = phrase_lang.phrase; phrase = phrase_lang.phrase;
if (phrase.start < start) { if (phrase.start < start) {
continue; continue;
} }
@@ -195,6 +201,14 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
string_tree_finalize_token(tree); string_tree_finalize_token(tree);
} }
if (phrase.start > 0) {
token_t prev_token = tokens->a[phrase.start - 1];
if (!(prev_token.type == WHITESPACE && !is_ideographic(prev_token.type))) {
string_tree_add_string(tree, " ");
string_tree_finalize_token(tree);
}
}
expansion_value_t value; expansion_value_t value;
value.value = phrase.data; value.value = phrase.data;
@@ -262,6 +276,15 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
string_tree_finalize_token(tree); string_tree_finalize_token(tree);
} }
if (phrase.start + phrase.len < tokens->n - 1) {
token_t next_token = tokens->a[phrase.start + phrase.len + 1];
if (!(next_token.type == WHITESPACE && !is_ideographic(next_token.type))) {
string_tree_add_string(tree, " ");
string_tree_finalize_token(tree);
}
}
} }
start = phrase.start + phrase.len; start = phrase.start + phrase.len;
@@ -272,6 +295,15 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
end = (int)tokens->n; end = (int)tokens->n;
if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1) {
token_t next_token = tokens->a[phrase.start + phrase.len];
if (!(next_token.type == WHITESPACE && !is_ideographic(next_token.type))) {
string_tree_add_string(tree, " ");
string_tree_finalize_token(tree);
}
}
for (int j = start; j < end; j++) { for (int j = start; j < end; j++) {
token_t token = tokens->a[j]; token_t token = tokens->a[j];
if (token.type != WHITESPACE) { if (token.type != WHITESPACE) {
@@ -283,6 +315,7 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) {
string_tree_add_string(tree, " "); string_tree_add_string(tree, " ");
} }
string_tree_finalize_token(tree); string_tree_finalize_token(tree);
} }