[numex] Add one form of normalization which strips ordinal suffixes, so {96th, Ninety-sixth} => 96. This is an additional form of normalization, so there's still one form where the suffixes are kept. One case that's still not handled is something like "IXe Arrondissement".

This commit is contained in:
Al
2017-04-18 21:39:54 -04:00
parent 92051863ba
commit cddc368533
3 changed files with 96 additions and 55 deletions

View File

@@ -79,6 +79,7 @@ static inline uint64_t get_normalize_string_options(libpostal_normalize_options_
normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0;
normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0;
normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0;
normalize_string_options |= options.expand_numex ? NORMALIZE_STRING_REPLACE_NUMEX : 0;
return normalize_string_options;
}
@@ -558,7 +559,6 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok
address_expansion_t prefix_expansion;
address_expansion_t suffix_expansion;
char_array *key = char_array_new_size(token.len);
char *expansion;
size_t num_strings = 0;
@@ -582,10 +582,11 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok
}
if (!have_suffix && !have_prefix) {
char_array_destroy(key);
return false;
}
char_array *key = char_array_new_size(token.len);
if (have_prefix && have_suffix) {
for (size_t i = 0; i < prefix_expansions->n; i++) {
prefix_expansion = prefix_expansions->a[i];
@@ -760,16 +761,34 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to
if ((suffix.len == 0 && prefix.len == 0)) return false;
return add_affix_expansions(tree, str, lang, token, prefix, suffix, options);
}
/*
 * Adds two alternatives for a token that ends in an ordinal suffix: the token
 * as written (via add_normalized_strings_token) and a copy of the token with
 * the suffix bytes removed, so that e.g. "96th" also yields "96".
 *
 * tree    - string tree whose `strings` array receives the alternatives
 * str     - full input string; the token is the token.len bytes at str + token.offset
 * lang    - language passed through to ordinal_suffix_len
 * token   - token to inspect
 * options - forwarded to add_normalized_strings_token
 *
 * Returns true once the original form of the token has been added, in which
 * case the caller must not add it a second time. Returns false when nothing
 * was added (no suffix found, or the suffix leaves no remainder to keep).
 */
static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
    size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang);

    // A suffix that covers the whole token (or more) leaves nothing to keep,
    // and token.len - len_ordinal_suffix would wrap around as an unsigned value.
    if (len_ordinal_suffix == 0 || len_ordinal_suffix >= token.len) return false;

    size_t stripped_len = token.len - len_ordinal_suffix;
    cstring_array *strings = tree->strings;

    // Add the original form first. When this function returns true,
    // add_normalized_strings_token won't be called a second time.
    add_normalized_strings_token(strings, str, token, options);

    char_array *key = char_array_new_size(stripped_len + 1);
    if (key == NULL) {
        // Allocation failed: the original form is already in the tree, so
        // still report true to keep the caller from adding it again.
        return true;
    }

    // NOTE(review): the stripped form is copied verbatim and does not go
    // through add_normalized_strings_token -- confirm that is intended.
    char_array_cat_len(key, str + token.offset, stripped_len);
    char *expansion = char_array_get_string(key);
    cstring_array_add_string(strings, expansion);
    char_array_destroy(key);

    return true;
}
static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
cstring_array *strings = tree->strings;
for (size_t i = 0; i < tokens->n; i++) {
token_t token = tokens->a[i];
bool have_phrase = false;
bool have_ordinal = false;
if (is_special_token(token.type)) {
string_tree_add_string_len(tree, str + token.offset, token.len);
@@ -783,9 +802,14 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s
have_phrase = true;
break;
}
if (normalize_ordinal_suffixes(tree, str, lang, token, options)) {
have_ordinal = true;
break;
}
}
if (!have_phrase) {
if (!have_phrase && !have_ordinal) {
add_normalized_strings_token(strings, str, token, options);
}
@@ -826,45 +850,18 @@ static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_
char_array_terminate(temp_string);
char *tokenized_str = char_array_get_string(temp_string);
char *new_str = tokenized_str;
char *last_numex_str = NULL;
if (options.expand_numex) {
char *numex_replaced = NULL;
for (size_t i = 0; i < options.num_languages; i++) {
lang = options.languages[i];
numex_replaced = replace_numeric_expressions(new_str, lang);
if (numex_replaced != NULL) {
new_str = numex_replaced;
if (last_numex_str != NULL) {
free(last_numex_str);
}
last_numex_str = numex_replaced;
}
}
}
string_tree_t *alternatives;
int ret;
log_debug("new_str=%s\n", new_str);
log_debug("Adding alternatives for single normalization\n");
alternatives = add_string_alternatives(new_str, options);
if (last_numex_str != NULL) {
free(last_numex_str);
}
alternatives = add_string_alternatives(tokenized_str, options);
if (alternatives == NULL) {
log_debug("alternatives = NULL\n");
continue;
}
iter = string_tree_iterator_new(alternatives);
log_debug("iter->num_tokens=%d\n", iter->num_tokens);

View File

@@ -3,8 +3,24 @@
#define FULL_STOP_CODEPOINT 0x002e
#define APOSTROPHE_CODEPOINT 0x0027
/*
 * Applies replace_numeric_expressions to str once per entry in languages,
 * chaining the results: each language is run on the most recent replacement,
 * or on the original str if no language has produced one yet.
 *
 * str           - input string; never freed here
 * num_languages - number of entries in languages
 * languages     - language identifiers, tried in order
 *
 * Returns the last non-NULL replacement, or NULL when no language produced
 * one. Intermediate results are released with free(), so the returned string
 * is presumably heap-allocated and owned by the caller -- confirm against
 * replace_numeric_expressions.
 */
char *normalize_replace_numex(char *str, size_t num_languages, char **languages) {
// Most recent replacement; stays NULL until some language yields a result
char *numex_normalized = NULL;
char *normalize_string_utf8(char *str, uint64_t options) {
for (size_t i = 0; i < num_languages; i++) {
char *lang = languages[i];
// Feed the previous replacement (if any) into the next language
char *numex_replaced = replace_numeric_expressions(numex_normalized == NULL ? str : numex_normalized, lang);
if (numex_replaced != NULL) {
// Drop the superseded intermediate before keeping the new result
if (numex_normalized != NULL) {
free(numex_normalized);
}
numex_normalized = numex_replaced;
}
}
return numex_normalized;
}
char *normalize_string_utf8_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC;
uint8_t *utf8proc_normalized = NULL;
@@ -63,7 +79,7 @@ char *normalize_string_utf8(char *str, uint64_t options) {
}
}
if (options & NORMALIZE_STRING_REPLACE_HYPHENS && strchr(str, '-') != NULL) {
if (options & NORMALIZE_STRING_REPLACE_HYPHENS && string_contains_hyphen(str)) {
char *replaced = string_replace_char(str, '-', ' ');
if (replaced != NULL) {
if (normalized_allocated) {
@@ -76,11 +92,28 @@ char *normalize_string_utf8(char *str, uint64_t options) {
}
}
if (options & NORMALIZE_STRING_REPLACE_NUMEX && num_languages > 0) {
char *numex_normalized = normalize_replace_numex(str, num_languages, languages);
if (numex_normalized != NULL) {
if (normalized_allocated) {
free(normalized);
}
normalized = numex_normalized;
str = normalized;
normalized_allocated = true;
}
}
return normalized;
}
/*
 * Language-agnostic variant of normalize_string_utf8_languages: forwards str
 * and options unchanged together with an empty language list (count 0, NULL
 * array) and returns whatever that call returns.
 */
char *normalize_string_utf8(char *str, uint64_t options) {
    char **no_languages = NULL;
    size_t language_count = 0;

    char *normalized = normalize_string_utf8_languages(str, options, language_count, no_languages);
    return normalized;
}
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
char *normalize_string_latin_languages(char *str, size_t len, uint64_t options, size_t num_languages, char **languages) {
char *latin_transliterator = LATIN_ASCII;
if (options & NORMALIZE_STRING_SIMPLE_LATIN_ASCII) {
latin_transliterator = LATIN_ASCII_SIMPLE;
@@ -90,9 +123,9 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) {
char *utf8_normalized;
if (transliterated == NULL) {
utf8_normalized = normalize_string_utf8(str, options);
utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages);
} else {
utf8_normalized = normalize_string_utf8(transliterated, options);
utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages);
free(transliterated);
transliterated = NULL;
}
@@ -100,7 +133,11 @@ char *normalize_string_latin(char *str, size_t len, uint64_t options) {
return utf8_normalized;
}
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
/*
 * Language-agnostic variant of normalize_string_latin_languages: forwards
 * str, len and options unchanged together with an empty language list
 * (count 0, NULL array) and returns whatever that call returns.
 */
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
    char **no_languages = NULL;
    size_t language_count = 0;

    char *normalized = normalize_string_latin_languages(str, len, options, language_count, no_languages);
    return normalized;
}
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options, size_t num_languages, char **languages) {
char *transliterated = NULL;
char *utf8_normalized = NULL;
@@ -114,7 +151,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
if (options & NORMALIZE_STRING_LATIN_ASCII) {
transliterated = transliterate(latin_transliterator, str, len);
if (transliterated != NULL) {
utf8_normalized = normalize_string_utf8(transliterated, options);
utf8_normalized = normalize_string_utf8_languages(transliterated, options, num_languages, languages);
free(transliterated);
transliterated = NULL;
}
@@ -127,7 +164,7 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
}
char *str_copy = strndup(str, len);
utf8_normalized = normalize_string_utf8(str_copy, options);
utf8_normalized = normalize_string_utf8_languages(str_copy, options, num_languages, languages);
free(str_copy);
if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) {
@@ -150,9 +187,9 @@ void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t
if (prev_string != NULL) {
free(prev_string);
}
}
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
size_t len = strlen(str);
string_tree_t *tree = string_tree_new_size(len);
@@ -161,15 +198,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
khash_t(int_set) *scripts = kh_init(int_set);
char *utf8_normalized = NULL;
char *ptr = str;
char *numex_replaced = NULL;
script_t script;
char *trans_name = NULL;
char *lang;
bool transliterate_latin = false;
char *ptr = str;
bool have_latin_transliterator = false;
while (consumed < len) {
string_script_t script_span = get_string_script(ptr, len - consumed);
script = script_span.script;
@@ -182,12 +220,16 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
if (html_escaped != NULL) {
str = html_escaped;
}
utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE);
options ^= NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_STRIP_ACCENTS | NORMALIZE_STRING_LATIN_ASCII;
utf8_normalized = normalize_string_utf8_languages(str, options, num_languages, languages);
if (utf8_normalized != NULL) {
if (html_escaped != NULL) {
free(html_escaped);
html_escaped = NULL;
}
string_tree_add_string(tree, utf8_normalized);
string_tree_finalize_token(tree);
free(utf8_normalized);
@@ -200,22 +242,22 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
log_debug("script_len=%zu\n", script_len);
if (script == SCRIPT_LATIN && num_languages > 0 && !transliterate_latin) {
if (script == SCRIPT_LATIN && num_languages > 0 && !have_latin_transliterator) {
for (size_t i = 0; i < num_languages; i++) {
lang = languages[i];
foreach_transliterator(script, lang, trans_name, {
if (!string_equals(trans_name, LATIN_ASCII)) {
transliterate_latin = true;
have_latin_transliterator = true;
break;
}
})
if (transliterate_latin) break;
if (have_latin_transliterator) break;
}
}
if ((script != SCRIPT_LATIN || transliterate_latin) && script_len > 0) {
if ((script != SCRIPT_LATIN || have_latin_transliterator) && script_len > 0) {
int ret;
khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret);
if (ret < 0) {
@@ -230,8 +272,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
ptr += script_len;
}
if (!transliterate_latin) {
add_latin_alternatives(tree, str, len, options);
if (!have_latin_transliterator) {
add_latin_alternatives(tree, str, len, options, num_languages, languages);
}
size_t transliterate_scripts = kh_size(scripts);
@@ -276,7 +318,7 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
prev = transliterated;
})
add_latin_alternatives(tree, transliterated, strlen(transliterated), options);
add_latin_alternatives(tree, transliterated, strlen(transliterated), options, num_languages, languages);
if (transliterated != str) {
free(transliterated);
}
@@ -287,8 +329,8 @@ string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t nu
}
if (transliterate_latin) {
add_latin_alternatives(tree, str, len, options);
if (have_latin_transliterator) {
add_latin_alternatives(tree, str, len, options, num_languages, languages);
}
kh_destroy(int_set, scripts);

View File

@@ -33,6 +33,7 @@ As well as normalizations for individual string tokens:
#include "string_utils.h"
#include "utf8proc/utf8proc.h"
#include "unicode_scripts.h"
#include "numex.h"
#include "transliterate.h"
#include "trie.h"
#include "tokens.h"
@@ -47,6 +48,7 @@ As well as normalizations for individual string tokens:
/*
 * Single-bit option flags. The NORMALIZE_STRING_* values are OR-ed together
 * into a uint64_t options word and tested with `&`.
 *
 * Each replacement list is parenthesized so the shift cannot re-associate
 * with neighbouring operators at the point of use (for example
 * `FLAG + 1` would otherwise expand to `1 << (N + 1)`).
 */
#define NORMALIZE_STRING_REPLACE_HYPHENS (1 << 6)
#define NORMALIZE_STRING_COMPOSE (1 << 7)
#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII (1 << 8)
#define NORMALIZE_STRING_REPLACE_NUMEX (1 << 9)
#define NORMALIZE_TOKEN_REPLACE_HYPHENS (1 << 0)
#define NORMALIZE_TOKEN_DELETE_HYPHENS (1 << 1)