From d35f5196292cba644e860333a002f01c22d29ac7 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 7 Dec 2015 19:18:46 -0500 Subject: [PATCH] [expansion] Fixing case where non-ideographic tokens like # can potentially be concatenated with surrounding tokens and should be normalized with whitespace in between --- src/libpostal.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/libpostal.c b/src/libpostal.c index f7236db0..d663b926 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -33,6 +33,10 @@ inline bool is_word_token(uint16_t type) { return type == WORD || type == ABBREVIATION || type == ACRONYM || type == IDEOGRAPHIC_CHAR || type == HANGUL_SYLLABLE; } +inline bool is_ideographic(uint16_t type) { + return type == IDEOGRAPHIC_CHAR || type == HANGUL_SYLLABLE || type == IDEOGRAPHIC_NUMBER; +} + inline bool is_numeric_token(uint16_t type) { return type == NUMERIC; } @@ -163,12 +167,14 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { int start = 0; int end = 0; + phrase_t phrase = NULL_PHRASE; + key = key != NULL ? 
key : char_array_new_size(DEFAULT_KEY_LEN); for (int i = 0; i < phrases->n; i++) { phrase_lang = phrases->a[i]; - phrase_t phrase = phrase_lang.phrase; + phrase = phrase_lang.phrase; if (phrase.start < start) { continue; } @@ -195,6 +201,14 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { string_tree_finalize_token(tree); } + if (phrase.start > 0) { + token_t prev_token = tokens->a[phrase.start - 1]; + if (!(prev_token.type == WHITESPACE && !is_ideographic(prev_token.type))) { + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + } + expansion_value_t value; value.value = phrase.data; @@ -262,6 +276,15 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { string_tree_finalize_token(tree); } + + if (phrase.start + phrase.len < tokens->n - 1) { + token_t next_token = tokens->a[phrase.start + phrase.len + 1]; + if (!(next_token.type == WHITESPACE && !is_ideographic(next_token.type))) { + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + } + } start = phrase.start + phrase.len; @@ -272,6 +295,15 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { end = (int)tokens->n; + if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1) { + token_t next_token = tokens->a[phrase.start + phrase.len]; + if (!(next_token.type == WHITESPACE && !is_ideographic(next_token.type))) { + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + } + + for (int j = start; j < end; j++) { token_t token = tokens->a[j]; if (token.type != WHITESPACE) { @@ -282,7 +314,8 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { log_debug("Adding space\n"); string_tree_add_string(tree, " "); } - string_tree_finalize_token(tree); + string_tree_finalize_token(tree); + }