[fix] normalize canonical strings (after expanding abbreviations, concatenated suffixes, etc.) using Latin-ASCII, Latin-ASCII-Simple, or plain UTF-8 normalization, depending on the options
This commit is contained in:
@@ -310,7 +310,7 @@ static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_opt
|
|||||||
|
|
||||||
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
|
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
|
||||||
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
|
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
|
||||||
char *canonical_normalized = normalize_string_utf8(canonical, normalize_string_options);
|
char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options);
|
||||||
|
|
||||||
canonical = canonical_normalized != NULL ? canonical_normalized : canonical;
|
canonical = canonical_normalized != NULL ? canonical_normalized : canonical;
|
||||||
|
|
||||||
@@ -533,7 +533,7 @@ static inline void cat_affix_expansion(char_array *key, char *str, address_expan
|
|||||||
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
|
if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
|
||||||
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
|
char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
|
||||||
uint64_t normalize_string_options = get_normalize_string_options(options);
|
uint64_t normalize_string_options = get_normalize_string_options(options);
|
||||||
char *canonical_normalized = normalize_string_utf8(canonical, normalize_string_options);
|
char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options);
|
||||||
canonical = canonical_normalized != NULL ? canonical_normalized : canonical;
|
canonical = canonical_normalized != NULL ? canonical_normalized : canonical;
|
||||||
|
|
||||||
char_array_cat(key, canonical);
|
char_array_cat(key, canonical);
|
||||||
|
|||||||
@@ -114,12 +114,18 @@ char *normalize_string_utf8(char *str, uint64_t options) {
|
|||||||
|
|
||||||
|
|
||||||
char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages) {
|
char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages) {
|
||||||
char *latin_transliterator = LATIN_ASCII;
|
char *transliterated = NULL;
|
||||||
|
char *latin_transliterator = NULL;
|
||||||
|
|
||||||
if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) {
|
if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) {
|
||||||
latin_transliterator = LATIN_ASCII_SIMPLE;
|
latin_transliterator = LATIN_ASCII_SIMPLE;
|
||||||
|
} else if (options & NORMALIZE_STRING_LATIN_ASCII) {
|
||||||
|
latin_transliterator = LATIN_ASCII;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *transliterated = transliterate(latin_transliterator, str, len);
|
if (latin_transliterator != NULL) {
|
||||||
|
transliterated = transliterate(latin_transliterator, str, len);
|
||||||
|
}
|
||||||
|
|
||||||
char *utf8_normalized;
|
char *utf8_normalized;
|
||||||
if (transliterated == NULL) {
|
if (transliterated == NULL) {
|
||||||
|
|||||||
Reference in New Issue
Block a user