diff --git a/src/libpostal.c b/src/libpostal.c index ddb619c4..978f9b70 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -238,13 +238,13 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt for (int j = start; j < end; j++) { log_debug("Adding token %d\n", j); token_t token = tokens->a[j]; - if (is_punctuation(token.type)) { + if (is_punctuation(token.type) && !is_special_punctuation(token.type)) { last_was_punctuation = true; continue; } if (token.type != WHITESPACE) { - if (phrase.start > 0 && last_was_punctuation && !last_added_was_whitespace) { + if ((phrase.start > 0 && last_was_punctuation && !last_added_was_whitespace) || (is_special_punctuation(token.type))) { string_tree_add_string(tree, " "); string_tree_finalize_token(tree); } @@ -260,7 +260,7 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt continue; } - last_was_punctuation = false; + last_was_punctuation = is_special_punctuation(token.type); string_tree_finalize_token(tree); } @@ -787,7 +787,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s token_t token = tokens->a[i]; bool have_phrase = false; - if (is_special_token(token.type)) { + if (is_special_token(token.type) || is_special_punctuation(token.type)) { string_tree_add_string_len(tree, str + token.offset, token.len); string_tree_finalize_token(tree); continue; diff --git a/src/token_types.h b/src/token_types.h index 23248767..80dfab68 100644 --- a/src/token_types.h +++ b/src/token_types.h @@ -69,6 +69,8 @@ #define is_punctuation(type) ((type) >= PERIOD && (type) < OTHER) +#define is_special_punctuation(type) ((type) == AMPERSAND || (type) == PLUS || (type) == POUND) + #define is_special_token(type) ((type) == EMAIL || (type) == URL || (type) == US_PHONE || (type) == INTL_PHONE) #define is_whitespace(type) ((type) == WHITESPACE)