From c968dd4ecc0b1e7eddebb354c03f8c23335225f7 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 19 Apr 2017 19:22:28 -0400 Subject: [PATCH 1/3] =?UTF-8?q?[numex]=20adding=20"=C2=B0"=20as=20addition?= =?UTF-8?q?al=20ordinal=20suffix=20for=20Spanish,=20Italian,=20and=20Portu?= =?UTF-8?q?guese?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- resources/numex/es.yaml | 10 ++++++++++ resources/numex/it.yaml | 10 ++++++++++ resources/numex/pt.yaml | 10 ++++++++++ 3 files changed, 30 insertions(+) diff --git a/resources/numex/es.yaml b/resources/numex/es.yaml index c4c010b8..ecc088fa 100644 --- a/resources/numex/es.yaml +++ b/resources/numex/es.yaml @@ -722,6 +722,7 @@ - ".º" - "º" - "o" + - "°" "1": - "ᵉʳ" - "er" @@ -730,10 +731,12 @@ - ".º" - "º" - "o" + - "°" "2": - ".º" - "º" - "o" + - "°" "3": - "ᵉʳ" - "er" @@ -742,30 +745,37 @@ - ".º" - "º" - "o" + - "°" "4": - ".º" - "º" - "o" + - "°" "5": - ".º" - "º" - "o" + - "°" "6": - ".º" - "º" - "o" + - "°" "7": - ".º" - "º" - "o" + - "°" "8": - ".º" - "º" - "o" + - "°" "9": - ".º" - "º" - "o" + - "°" - gender: "f" suffixes: diff --git a/resources/numex/it.yaml b/resources/numex/it.yaml index 3b7d58b5..b0c5a2d4 100644 --- a/resources/numex/it.yaml +++ b/resources/numex/it.yaml @@ -749,33 +749,43 @@ "0": - "º" - "o" + - "°" "1": - "º" - "o" + - "°" "2": - "º" - "o" + - "°" "3": - "º" - "o" + - "°" "4": - "º" - "o" + - "°" "5": - "º" - "o" + - "°" "6": - "º" - "o" + - "°" "7": - "º" - "o" + - "°" "8": - "º" - "o" + - "°" "9": - "º" - "o" + - "°" - gender: "f" suffixes: diff --git a/resources/numex/pt.yaml b/resources/numex/pt.yaml index 9970abc9..05b3e88d 100644 --- a/resources/numex/pt.yaml +++ b/resources/numex/pt.yaml @@ -884,33 +884,43 @@ "0": - "º" - "o" + - "°" "1": - "º" - "o" + - "°" "2": - "º" - "o" + - "°" "3": - "º" - "o" + - "°" "4": - "º" - "o" + - "°" "5": - "º" - "o" + - "°" "6": - "º" - "o" + - "°" "7": - "º" - "o" + - "°" "8": - "º" - "o" + - "°" "9": - "º" - "o" + - 
"°" - gender: "f" suffixes: From 19899b2f7dca49b63f4cb2811ce203273418b5ee Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 19 Apr 2017 19:25:25 -0400 Subject: [PATCH 2/3] =?UTF-8?q?[dictionaries]=20adding=20degree=20symbol?= =?UTF-8?q?=20"=C2=B0"=20variant=20for=20any=20surface=20forms=20that=20ha?= =?UTF-8?q?ve=20"=C2=BA"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- resources/dictionaries/es/given_names.txt | 4 ++-- resources/dictionaries/es/house_numbers.txt | 2 +- .../dictionaries/es/level_types_standalone.txt | 2 +- resources/dictionaries/es/no_number.txt | 2 +- resources/dictionaries/es/number.txt | 2 +- resources/dictionaries/es/personal_titles.txt | 2 +- resources/dictionaries/es/qualifiers.txt | 2 +- resources/dictionaries/es/street_types.txt | 2 +- resources/dictionaries/es/unit_types_numbered.txt | 2 +- resources/dictionaries/fr/number.txt | 2 +- resources/dictionaries/it/level_types_numbered.txt | 2 +- resources/dictionaries/it/number.txt | 2 +- resources/dictionaries/pt/no_number.txt | 2 +- resources/dictionaries/pt/number.txt | 2 +- resources/dictionaries/pt/personal_titles.txt | 14 +++++++------- resources/dictionaries/pt/qualifiers.txt | 2 +- resources/dictionaries/pt/street_types.txt | 2 +- resources/dictionaries/ro/number.txt | 2 +- 18 files changed, 25 insertions(+), 25 deletions(-) diff --git a/resources/dictionaries/es/given_names.txt b/resources/dictionaries/es/given_names.txt index 590e4afb..5631d2b2 100644 --- a/resources/dictionaries/es/given_names.txt +++ b/resources/dictionaries/es/given_names.txt @@ -1,3 +1,3 @@ -federico|fco|fcº -francisco|fco|fcº|franc +federico|fco|fcº|fc° +francisco|fco|fcº|franc|fc° maria|mª|m.a|m.ª \ No newline at end of file diff --git a/resources/dictionaries/es/house_numbers.txt b/resources/dictionaries/es/house_numbers.txt index 056249f6..41b9cd84 100644 --- a/resources/dictionaries/es/house_numbers.txt +++ b/resources/dictionaries/es/house_numbers.txt @@ -1,2 +1,2 
@@ exterior|ext|ex -número exterior|no ext|# ext|nº ext|n.º ext|núm ext|númr ext|numr ext|númro ext|numro ext|núm.ro ext|nr ext|num.ro ext|nro ext|n.ro ext|no ex|# ex|nº ex|n.º ex|núm ex|númr ex|numr ex|númro ex|numro ex|núm.ro ex|nr ex|num.ro ex|nro ex|n.ro ex \ No newline at end of file +número exterior|no ext|# ext|nº ext|n° ext|n.º ext|n.° ext|núm ext|númr ext|numr ext|númro ext|numro ext|núm.ro ext|nr ext|num.ro ext|nro ext|n.ro ext|no ex|# ex|nº ex|n.º ex|n.° ex|núm ex|númr ex|numr ex|númro ex|numro ex|núm.ro ex|nr ex|num.ro ex|nro ex|n.ro ex \ No newline at end of file diff --git a/resources/dictionaries/es/level_types_standalone.txt b/resources/dictionaries/es/level_types_standalone.txt index e1235efe..a21acb90 100644 --- a/resources/dictionaries/es/level_types_standalone.txt +++ b/resources/dictionaries/es/level_types_standalone.txt @@ -1,4 +1,4 @@ -bajo|bjo|bo|bº +bajo|bjo|bo|bº|b° bajos|bjos|bjs planta baja|pb|pl baja planta primera|pl primera diff --git a/resources/dictionaries/es/no_number.txt b/resources/dictionaries/es/no_number.txt index 0b6190c9..ba442716 100644 --- a/resources/dictionaries/es/no_number.txt +++ b/resources/dictionaries/es/no_number.txt @@ -1 +1 @@ -sin número|s / n|sin numero|s.n.|s.n|s n|sn|s / nº|s.nº|snº|s / no \ No newline at end of file +sin número|s / n|sin numero|s.n.|s.n|s n|sn|s / nº|s / n°|s.nº|s.n°|snº|sn°|s / no \ No newline at end of file diff --git a/resources/dictionaries/es/number.txt b/resources/dictionaries/es/number.txt index 07202f94..1f0a0439 100644 --- a/resources/dictionaries/es/number.txt +++ b/resources/dictionaries/es/number.txt @@ -1 +1 @@ -número|#|num|núm|no|numero|№|nº|n.º|númr|numr|númro|numro|núm.ro|nr|num.ro|nro|n.ro|nr.º|nmro|nmr.o|nmrº|nmr.º \ No newline at end of file +número|#|num|núm|no|numero|№|nº|n°|n.º|n.°|númr|numr|númro|numro|núm.ro|nr|num.ro|nro|n.ro|nrº|nr°|nr.º|nr.°|nmro|nmr.o|nmrº|nmr°|nmr.º|nmr.° \ No newline at end of file diff --git a/resources/dictionaries/es/personal_titles.txt 
b/resources/dictionaries/es/personal_titles.txt index 0ddadcd5..85978b1a 100644 --- a/resources/dictionaries/es/personal_titles.txt +++ b/resources/dictionaries/es/personal_titles.txt @@ -105,7 +105,7 @@ reyes san|s santa|stª|st.ª|sta|st.a sant|st -santo|stº|sto|st.o|st.º +santo|stº|st°|sto|st.o|st.º|st.° sargento|sarg|sgto|sargto sargento ayudante|sarg ay|sgto ay|sargto ay sargento mayor|sm|s.m.|s.m|s m|sarg my|sgto my|sargto my diff --git a/resources/dictionaries/es/qualifiers.txt b/resources/dictionaries/es/qualifiers.txt index 699f0761..9baa25e7 100644 --- a/resources/dictionaries/es/qualifiers.txt +++ b/resources/dictionaries/es/qualifiers.txt @@ -2,7 +2,7 @@ aldea ampliación|amplicacion|ampl|amp auzoa barriada|barda -barrio|bo|brio|brrio|bº|b.º|b /|br +barrio|bo|brio|brrio|bº|b°|b.º|b.°|b /|br barrios|bos|brios|brrios bloque|blque|bloq|blq|bl|bq|blo colonia|col diff --git a/resources/dictionaries/es/street_types.txt b/resources/dictionaries/es/street_types.txt index 9c930dd0..654d8da1 100644 --- a/resources/dictionaries/es/street_types.txt +++ b/resources/dictionaries/es/street_types.txt @@ -71,7 +71,7 @@ particular|parti partida|ptda pasadizo|pzo pasaje|psaje|psj -paseo|pº|po|pso|pseo|pas|ps|p|p.o|p.º +paseo|pº|p°|po|pso|pseo|pas|ps|p|p.o|p.º|p.° paseo maritimo|psmar|ps mar paso pasillo|psllo diff --git a/resources/dictionaries/es/unit_types_numbered.txt b/resources/dictionaries/es/unit_types_numbered.txt index 05a7615a..205d0e78 100644 --- a/resources/dictionaries/es/unit_types_numbered.txt +++ b/resources/dictionaries/es/unit_types_numbered.txt @@ -4,7 +4,7 @@ casa consultorio|cn departamento|dpto|dept|dep|dto|depto interior|int|in -número interior|no int|# int|nº int|№ int|n.º int|núm int|númr int|numr int|númro int|numro int|núm.ro int|nr int|num.ro int|nro int|n.ro int|no in|# in|nº in|№ in|n.º in|núm in|númr in|numr in|númro in|numro in|núm.ro in|nr in|num.ro in|nro in|n.ro in +número interior|no int|# int|nº int|n° int|№ int|n.º int|n.° int|núm 
int|númr int|numr int|númro int|numro int|núm.ro int|nr int|num.ro int|nro int|n.ro int|no in|# in|nº in|n° in|№ in|n.º in|n.° in|núm in|númr in|numr in|númro in|numro in|núm.ro in|nr in|num.ro in|nro in|n.ro in letra lote|lt oficina|of|ofc diff --git a/resources/dictionaries/fr/number.txt b/resources/dictionaries/fr/number.txt index 65536e85..cc4f0405 100644 --- a/resources/dictionaries/fr/number.txt +++ b/resources/dictionaries/fr/number.txt @@ -1 +1 @@ -numéro|nº|#|№|n.º|no|num|numero|numr|num.ro|nr|nro|n.ro \ No newline at end of file +numéro|nº|n°|#|№|n.º|n.°|no|num|numero|numr|num.ro|nr|nro|n.ro \ No newline at end of file diff --git a/resources/dictionaries/it/level_types_numbered.txt b/resources/dictionaries/it/level_types_numbered.txt index 710e862d..750485f1 100644 --- a/resources/dictionaries/it/level_types_numbered.txt +++ b/resources/dictionaries/it/level_types_numbered.txt @@ -1,2 +1,2 @@ livello -piano|pº|p.º|p.o|p.nº|p.no|pnº|pno \ No newline at end of file +piano|pº|p°|p.º|p.°|p.o|p.nº|p.n°|p.no|pnº|pn°|pno \ No newline at end of file diff --git a/resources/dictionaries/it/number.txt b/resources/dictionaries/it/number.txt index 4a9beee6..08a908ce 100644 --- a/resources/dictionaries/it/number.txt +++ b/resources/dictionaries/it/number.txt @@ -1 +1 @@ -numero|nº|no|n.º|n.o|n|#|№ \ No newline at end of file +numero|nº|n°|no|n.º|n.°|n.o|n|#|№ \ No newline at end of file diff --git a/resources/dictionaries/pt/no_number.txt b/resources/dictionaries/pt/no_number.txt index 99965dac..7774569d 100644 --- a/resources/dictionaries/pt/no_number.txt +++ b/resources/dictionaries/pt/no_number.txt @@ -1 +1 @@ -sem número|sem numero|sn|s.n.|s.n|s / n|s n|s / nº|s.nº|snº|s / no \ No newline at end of file +sem número|sem numero|sn|s.n.|s.n|s / n|s n|s / nº|s / n°|s.nº|s.n°|snº|sn°|s / no \ No newline at end of file diff --git a/resources/dictionaries/pt/number.txt b/resources/dictionaries/pt/number.txt index 07202f94..88182ead 100644 --- 
a/resources/dictionaries/pt/number.txt +++ b/resources/dictionaries/pt/number.txt @@ -1 +1 @@ -número|#|num|núm|no|numero|№|nº|n.º|númr|numr|númro|numro|núm.ro|nr|num.ro|nro|n.ro|nr.º|nmro|nmr.o|nmrº|nmr.º \ No newline at end of file +número|#|num|núm|no|numero|№|nº|n°|n.º|n.°|númr|numr|númro|numro|núm.ro|nr|num.ro|nro|n.ro|nr.º|nr.°|nrº|nr°|nmro|nmr.o|nmrº|nmr°|nmr.º|nmr.° \ No newline at end of file diff --git a/resources/dictionaries/pt/personal_titles.txt b/resources/dictionaries/pt/personal_titles.txt index 19910fa6..44e33a3a 100644 --- a/resources/dictionaries/pt/personal_titles.txt +++ b/resources/dictionaries/pt/personal_titles.txt @@ -35,20 +35,20 @@ embaixador|emb eminencia|ema|em.a eminentissimo|emmo|em.mo enfermeira|enfª|enfa|enf.ª|enf.a -enfermeiro|enfº|enfo|enf|enf.º|enf.o +enfermeiro|enfº|enf°|enfo|enf|enf.º|enf.°|enf.o engenheira|engª|enga|eng.ª|eng.a -engenheiro|engº|eng.º|eng|engo|eng.o +engenheiro|engº|eng°|eng.º|eng.°|eng|engo|eng.o excelencia|exª|exa|ex.ª|ex.a excelentissima|exmª|exma|ex.mª|ex.ma -excelentissimo|exmº|exmo|ex.mº|ex.mo +excelentissimo|exmº|exm°|exmo|ex.mº|ex.m°|ex.mo frei|fr general|gen|gal|g.al governador|gov|govdor|govd.or|gov.dor ilustrissima|ilmª|ilma|il.mª|il.ma -ilustrissimo|ilmº|ilmo|il.mº|il.mo +ilustrissimo|ilmº|ilm°|ilmo|il.mº|il.m°|il.mo infante|inf irmã|irma|imª|ima|im -irmão|irmao|imº|imo|im +irmão|irmao|imº|im°|imo|im juiz|jz maestra|mta|mtra|mstra maestro|mto|mtro|mstro @@ -69,7 +69,7 @@ padre|pe|p.e pastor|pr prefeito|pref presidente|presid|pres -professor|prof|profº +professor|prof|profº|prof° professora|profa|prof.a|profª|prof.ª professoras|profas professores|profs @@ -78,7 +78,7 @@ reverendo|revdo|rev.do reverendissima|revma|rev.ma reverendissimo|revmo|rev.mo santa|stª|sta|st.ª|st.a -santo|stº|sto|st|st.º|st.o +santo|stº|st°|sto|st|st.º|st.°|st.o são|sao|s sargento|sarg|sgto|sargto sargento ajudante|sarg ajte|sarg aj.te diff --git a/resources/dictionaries/pt/qualifiers.txt 
b/resources/dictionaries/pt/qualifiers.txt index 3373e002..41b7258e 100644 --- a/resources/dictionaries/pt/qualifiers.txt +++ b/resources/dictionaries/pt/qualifiers.txt @@ -1,4 +1,4 @@ -bairro|b|bº|bo|brº|bro +bairro|b|bº|b°|bo|brº|br°|bro distrito|dtto|dist divisao|div quadra|q|qd diff --git a/resources/dictionaries/pt/street_types.txt b/resources/dictionaries/pt/street_types.txt index 841e61fe..b661dd05 100644 --- a/resources/dictionaries/pt/street_types.txt +++ b/resources/dictionaries/pt/street_types.txt @@ -4,7 +4,7 @@ autoestrada|auto estrada|auto estr|autoestr avenida|av|ava|ave avenida marginal|ave marg|av marg|ava marg azinhaga|az -bairro|b|bº|bo|brº|bro +bairro|b|bº|b°|bo|brº|br°|bro beco|bc|bco calçada|calcada|cc calçadinha|caclcadinha|ccnh diff --git a/resources/dictionaries/ro/number.txt b/resources/dictionaries/ro/number.txt index 113d7efe..1e3fad48 100644 --- a/resources/dictionaries/ro/number.txt +++ b/resources/dictionaries/ro/number.txt @@ -1 +1 @@ -număr|numar|nr|nº|#|№|no \ No newline at end of file +număr|numar|nr|nº|n°|#|№|no \ No newline at end of file From f3adde746e2e518c64e57c81d8b2acd3903abb07 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 19 Apr 2017 20:18:21 -0400 Subject: [PATCH 3/3] [numex] adding ability to handle the degree symbol in numex parsing since it's technically a separate token --- src/libpostal.c | 21 +++++++++++++++++---- src/numex.c | 11 +++-------- src/numex.h | 1 + 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/libpostal.c b/src/libpostal.c index d9b0f436..a38b5f31 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -764,10 +764,20 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to return add_affix_expansions(tree, str, lang, token, prefix, suffix, options); } -static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { +static inline bool 
normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { + size_t token_digit_len = possible_ordinal_digit_len(str + token.offset, token.len); size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); - if (len_ordinal_suffix == 0) return false; + bool ret = false; + + if (len_ordinal_suffix == 0 || token_digit_len + len_ordinal_suffix < token.len) { + return false; + } else if (len_ordinal_suffix == token.len && i > 0 && prev_token.len > 0) { + size_t prev_token_digit_len = possible_ordinal_digit_len(str + prev_token.offset, prev_token.len); + ret = prev_token_digit_len == prev_token.len; + } else { + ret = true; + } cstring_array *strings = tree->strings; // Add the original form first. When this function returns true, @@ -779,12 +789,14 @@ static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, ch char *expansion = char_array_get_string(key); cstring_array_add_string(strings, expansion); char_array_destroy(key); - return true; + return ret; } static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { cstring_array *strings = tree->strings; + token_t prev_token = (token_t){0, 0, 0}; + for (size_t i = 0; i < tokens->n; i++) { token_t token = tokens->a[i]; bool have_phrase = false; @@ -803,7 +815,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s break; } - if (normalize_ordinal_suffixes(tree, str, lang, token, options)) { + if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) { have_ordinal = true; break; } @@ -814,6 +826,7 @@ static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *s } string_tree_finalize_token(tree); + prev_token = token; } } diff --git a/src/numex.c b/src/numex.c index ff34fcfb..f2a0b156 100644 --- a/src/numex.c +++ b/src/numex.c 
@@ -1009,7 +1009,7 @@ static char *get_ordinal_suffix(char *numeric_string, size_t len, char *lang, ge } -static size_t possible_ordinal_digit_len(char *str, size_t len) { +size_t possible_ordinal_digit_len(char *str, size_t len) { uint8_t *ptr = (uint8_t *)str; size_t idx = 0; @@ -1053,11 +1053,6 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { return 0; } - size_t ordinal_digit_len = possible_ordinal_digit_len(str, len); - if (ordinal_digit_len == 0) { - return 0; - } - if (numex_table == NULL) { log_error(NUMEX_SETUP_ERROR); return 0; @@ -1081,8 +1076,8 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { phrase_t phrase = trie_search_suffixes_from_index(trie, str, len, prefix.node_id); - if (phrase.len == len - ordinal_digit_len) { - return len - ordinal_digit_len; + if (phrase.len + phrase.start == len) { + return phrase.len; } } } diff --git a/src/numex.h b/src/numex.h index c000ff9c..d80f96e1 100644 --- a/src/numex.h +++ b/src/numex.h @@ -149,6 +149,7 @@ VECTOR_INIT(numex_result_array, numex_result_t) char *replace_numeric_expressions(char *str, char *lang); numex_result_array *convert_numeric_expressions(char *str, char *lang); size_t ordinal_suffix_len(char *s, size_t len, char *lang); +size_t possible_ordinal_digit_len(char *str, size_t len); bool numex_table_write(FILE *file); bool numex_table_save(char *filename);