[expand] Adding a normalization for a single non-acronym internal period where there's an expansion at the prefix/suffix (for #218 and https://github.com/openvenues/libpostal/issues/216#issuecomment-306617824). This helps in cases like "St.Michaels" or "Jln.Utara" without needing to specify concatenated prefix phrases for every possibility.

This commit is contained in:
Al
2017-10-28 02:38:15 -04:00
parent 6d430f7e9b
commit 053dca82ba
5 changed files with 402 additions and 285 deletions

View File

@@ -400,9 +400,12 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
char *append_if_not_numeric = NULL;
int32_t ch;
int32_t next_ch;
ssize_t char_len;
ssize_t next_char_len;
bool last_was_letter = false;
bool last_was_number = false;
bool append_char = true;
while (idx < len) {
@@ -416,9 +419,14 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
bool is_letter = utf8_is_letter(cat);
bool is_number = utf8_is_number(cat);
next_char_len = utf8proc_iterate(ptr + char_len, len, &next_ch);
int next_cat = utf8proc_category(next_ch);
bool next_is_number = utf8_is_number(next_cat);
bool is_full_stop = ch == FULL_STOP_CODEPOINT;
if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) {
if (is_hyphen && options & NORMALIZE_TOKEN_REPLACE_HYPHENS && (!(last_was_number && next_is_number) || options & NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS)) {
char_array_append(array, " ");
append_char = false;
} else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
@@ -481,7 +489,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
append_char = true;
last_was_letter = is_letter;
last_was_number = is_number;
}
}