[expand] Add a normalization for a single non-acronym internal period where there's an expansion at the prefix/suffix (for #218 and https://github.com/openvenues/libpostal/issues/216#issuecomment-306617824). This helps in cases like "St.Michaels" or "Jln.Utara" without needing to specify concatenated prefix phrases for every possibility.
This commit is contained in:
@@ -400,9 +400,12 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
|
||||
char *append_if_not_numeric = NULL;
|
||||
|
||||
int32_t ch;
|
||||
int32_t next_ch;
|
||||
ssize_t char_len;
|
||||
ssize_t next_char_len;
|
||||
|
||||
bool last_was_letter = false;
|
||||
bool last_was_number = false;
|
||||
bool append_char = true;
|
||||
|
||||
while (idx < len) {
|
||||
@@ -416,9 +419,14 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
|
||||
bool is_letter = utf8_is_letter(cat);
|
||||
bool is_number = utf8_is_number(cat);
|
||||
|
||||
next_char_len = utf8proc_iterate(ptr + char_len, len, &next_ch);
|
||||
int next_cat = utf8proc_category(next_ch);
|
||||
bool next_is_number = utf8_is_number(next_cat);
|
||||
|
||||
|
||||
bool is_full_stop = ch == FULL_STOP_CODEPOINT;
|
||||
|
||||
if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) {
|
||||
if (is_hyphen && options & NORMALIZE_TOKEN_REPLACE_HYPHENS && (!(last_was_number && next_is_number) || options & NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS)) {
|
||||
char_array_append(array, " ");
|
||||
append_char = false;
|
||||
} else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) {
|
||||
@@ -481,7 +489,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
|
||||
append_char = true;
|
||||
|
||||
last_was_letter = is_letter;
|
||||
|
||||
last_was_number = is_number;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user