From 378459fde8f3b74be43a176d05748c8534751eae Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 29 Mar 2016 02:18:37 -0400 Subject: [PATCH 1/7] [fix] vignt => vingt in French numex --- README.md | 10 +++++----- resources/numex/fr.json | 34 +++++++++++++++++----------------- test/test_numex.c | 6 +++--- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 517eee6a..2955270c 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Here's a short list of some less straightforward normalizations in various langu | One-hundred twenty E 96th St | 120 east 96th street | | C/ Ocho, P.I. 4 | calle 8 polígono industrial 4 | | V XX Settembre, 20 | via 20 settembre 20 | -| Quatre vignt douze R. de l'Église | 92 rue de l' église | +| Quatre vingt douze R. de l'Église | 92 rue de l' église | | ул Каретный Ряд, д 4, строение 7 | улица каретныи ряд дом 4 строение 7 | | ул Каретный Ряд, д 4, строение 7 | ulitsa karetnyy ryad dom 4 stroyeniye 7 | | Marktstrasse 14 | markt straße 14 | @@ -50,7 +50,7 @@ Here's an example using the Python bindings for succinctness (most of the higher ```python from postal.expand import expand_address -expansions = expand_address('Quatre-vignt-douze Ave des Champs-Élysées') +expansions = expand_address('Quatre-vingt-douze Ave des Champs-Élysées') assert '92 avenue des champs-elysees' in set(expansions) ``` @@ -70,7 +70,7 @@ int main(int argc, char **argv) { size_t num_expansions; normalize_options_t options = get_libpostal_default_options(); - char **expansions = expand_address("Quatre-vignt-douze Ave des Champs-Élysées", options, &num_expansions); + char **expansions = expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions); for (size_t i = 0; i < num_expansions; i++) { printf("%s\n", expansions[i]); @@ -220,7 +220,7 @@ After building libpostal: ``` cd src/ -./libpostal "Quatre vignt douze Ave des Champs-Élysées" +./libpostal "Quatre vingt douze Ave des Champs-Élysées" ``` If you have a text file or stream with one address per line, the command-line interface also accepts input from stdin: @@ -310,7 +310,7 @@ The dictionaries are also used to abbreviate canonical phrases like "Calle" => " (performed on both the language classifier and the address parser training sets) - **Numeric expression parsing** ("twenty first" => 21st, -"quatre-vignt-douze" => 92, again using data provided in CLDR), supports > 30 +"quatre-vingt-douze" => 92, again using data provided in CLDR), supports > 30 languages. Handles languages with concatenated expressions e.g. milleottocento => 1800. Optionally normalizes Roman numerals regardless of the language (IX => 9) which occur in the names of many monarchs, popes, etc. diff --git a/resources/numex/fr.json b/resources/numex/fr.json index 7dd32243..ec47958e 100644 --- a/resources/numex/fr.json +++ b/resources/numex/fr.json @@ -173,28 +173,28 @@ "right": "add" }, { - "name": "quatre vignts", + "name": "quatre vingts", "value": 80, "type": "cardinal", "radix": 20, "right": "add" }, { - "name": "quatrevignts", + "name": "quatrevingts", "value": 80, "type": "cardinal", "radix": 20, "right": "add" }, { - "name": "quatre vignt", + "name": "quatre vingt", "value": 80, "type": "cardinal", "radix": 20, "right": "add" }, { - "name": "quatrevignt", + "name": "quatrevingt", "value": 80, "type": "cardinal", "radix": 20, @@ -814,23 +814,23 @@ "category": "plural" }, { - "name": "vigntieme", + "name": "vingtieme", "value": 20, "type": "ordinal" }, { - "name": "vigntième", + "name": "vingtième", "value": 20, "type": "ordinal" }, { - "name": "vigntiemes", + "name": "vingtiemes", "value": 20, "type": "ordinal", "category": "plural" }, { - "name": "vigntièmes", + "name": "vingtièmes", "value": 20, "type": "ordinal", "category": "plural" @@ -946,45 +946,45 @@ "category": "plural" }, { - "name": "quatre vigntieme", + "name": "quatre vingtieme", "value": 80, "type": "ordinal" }, { - "name": "quatre vigntième", + "name": "quatre vingtième", "value": 80, "type": "ordinal" }, { - "name": "quatre vigntiemes", + "name": "quatre vingtiemes", "value": 80, "type": "ordinal", "category": "plural" }, { - "name": "quatre vigntièmes", + "name": "quatre vingtièmes", "value": 80, "type": "ordinal", "category": "plural" }, { - "name": "quatrevigntieme", + "name": "quatrevingtieme", "value": 80, "type": "ordinal" }, { - "name": "quatrevigntième", + "name": "quatrevingtième", "value": 80, "type": "ordinal" }, { - "name": "quatrevigntiemes", + "name": "quatrevingtiemes", "value": 80, "type": "ordinal", "category": "plural" }, { - "name": "quatrevigntièmes", + "name": "quatrevingtièmes", "value": 80, "type": "ordinal", "category": "plural" @@ -1188,4 +1188,4 @@ "stopwords": [ "et" ] -} \ No newline at end of file +} diff --git a/test/test_numex.c b/test/test_numex.c index f01708cb..5f0c7639 100644 --- a/test/test_numex.c +++ b/test/test_numex.c @@ -35,9 +35,9 @@ TEST test_numeric_expressions(void) { CHECK_CALL(test_numex("ten and four", "10 and 4", "en")); // French (Celtic-style) numbers - CHECK_CALL(test_numex("quatre-vignt-douze", "92", "fr")); - CHECK_CALL(test_numex("quatre vignt douze", "92", "fr")); - CHECK_CALL(test_numex("quatre vignts", "80", "fr")); + CHECK_CALL(test_numex("quatre-vingt-douze", "92", "fr")); + CHECK_CALL(test_numex("quatre vingt douze", "92", "fr")); + CHECK_CALL(test_numex("quatre vingts", "80", "fr")); CHECK_CALL(test_numex("soixante-et-onze", "71", "fr")); CHECK_CALL(test_numex("soixante-cinq", "65", "fr")); From 2a2d1738a3f8a1ee2b75c10cbe967f0ff6e267b2 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 29 Mar 2016 11:15:24 -0400 Subject: [PATCH 2/7] [fix] path for running numex.py --- scripts/geodata/i18n/numex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/geodata/i18n/numex.py b/scripts/geodata/i18n/numex.py index 5cdc6f4f..32feb280 100644 --- a/scripts/geodata/i18n/numex.py +++ b/scripts/geodata/i18n/numex.py @@ -4,7 +4,7 @@ import sys import ujson as json this_dir = os.path.realpath(os.path.dirname(__file__)) -sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) +sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir))) from geodata.encoding import safe_encode from unicode_paths import DATA_DIR From 46b4cc7a5f1bbc9ac445c73b08f44004f3828a14 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 29 Mar 2016 11:20:25 -0400 Subject: [PATCH 3/7] [fix] altering file to test the numex build --- resources/numex/fr.json | 1 - 1 file changed, 1 deletion(-) diff --git a/resources/numex/fr.json b/resources/numex/fr.json index ec47958e..7bc3ff74 100644 --- a/resources/numex/fr.json +++ b/resources/numex/fr.json @@ -1181,7 +1181,6 @@ "7": ["es"], "8": ["es"], "9": ["es"] - } } ], From 1bc92d69957e192d3697947cb474b69e1e21c383 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 29 Mar 2016 11:25:33 -0400 Subject: [PATCH 4/7] [fix] output path in numex.py --- resources/numex/fr.json | 1 + scripts/geodata/i18n/numex.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/numex/fr.json b/resources/numex/fr.json index 7bc3ff74..29b62851 100644 --- a/resources/numex/fr.json +++ b/resources/numex/fr.json @@ -1187,4 +1187,5 @@ "stopwords": [ "et" ] + } diff --git a/scripts/geodata/i18n/numex.py b/scripts/geodata/i18n/numex.py index 32feb280..6a5d3207 100644 --- a/scripts/geodata/i18n/numex.py +++ b/scripts/geodata/i18n/numex.py @@ -15,7 +15,7 @@ class InvalidNumexRuleException(Exception): NUMEX_DATA_DIR = os.path.join(DATA_DIR, 'numex') -NUMEX_RULES_FILE = os.path.join(os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c') +NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c') GENDER_MASCULINE = 'GENDER_MASCULINE' GENDER_FEMININE = 'GENDER_FEMININE' From 8e16ad245f5d457b788b82cdb85f6f5221420c7f Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 29 Mar 2016 11:35:02 -0400 Subject: [PATCH 5/7] [fix] Building the dictionaries/numex table, etc. locally on clang, just not publishing unless it's gcc --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index c0f163be..d8cae97e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,9 +33,9 @@ install: script: - ./configure --datadir=$(pwd)/data - make - - if [[ "$CC" == gcc* && $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi; - - if [[ "$CC" == gcc* && $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi; - - if [[ "$CC" == gcc* && $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi; + - if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi; + - if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi; + - if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi; - make check after_success: - | From 08d873ac15c6eba70ffcaed24326a807909684d4 Mon Sep 17 00:00:00 2001 From: Travis Date: Tue, 29 Mar 2016 15:39:14 +0000 Subject: [PATCH 6/7] [auto][ci skip] Adding data files from Travis build #105 --- src/numex_data.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/numex_data.c b/src/numex_data.c index b2c622ba..a45e1e3c 100644 --- a/src/numex_data.c +++ b/src/numex_data.c @@ -1527,10 +1527,10 @@ char *numex_keys[] = { "septante", "huitante", "octante", - "quatre vignts", - "quatrevignts", - "quatre vignt", - "quatrevignt", + "quatre vingts", + "quatrevingts", + "quatre vingt", + "quatrevingt", "nonante", "cent", "cents", @@ -1639,10 +1639,10 @@ char *numex_keys[] = { "dixneuvième", "dixneuviemes", "dixneuvièmes", - "vigntieme", - "vigntième", - "vigntiemes", - "vigntièmes", + "vingtieme", + "vingtième", + "vingtiemes", + "vingtièmes", "trentieme", "trentième", "trentiemes", @@ -1663,14 +1663,14 @@ char *numex_keys[] = { "septantième", "septantiemes", "septantièmes", - "quatre vigntieme", - "quatre vigntième", - "quatre vigntiemes", - "quatre vigntièmes", - "quatrevigntieme", - "quatrevigntième", - "quatrevigntiemes", - "quatrevigntièmes", + "quatre vingtieme", + "quatre vingtième", + "quatre vingtiemes", + "quatre vingtièmes", + "quatrevingtieme", + "quatrevingtième", + "quatrevingtiemes", + "quatrevingtièmes", "huitantieme", "huitantième", "huitantiemes", From eff3ae0ff7dd916b2ea971d90ef6e638c0503716 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 29 Mar 2016 12:13:36 -0400 Subject: [PATCH 7/7] [fix][ci skip] Updating GIF to reflect French correction --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2955270c..01ec931d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ equivalents suitable for search indexing, hashing, etc. Here's an interactive example using the Python binding: -![expand](https://cloud.githubusercontent.com/assets/238455/13210468/b1676b72-d8fd-11e5-8557-ded418f6ffda.gif) +![expand](https://cloud.githubusercontent.com/assets/238455/14115012/52990d14-f5a7-11e5-9797-159dacdf8c5f.gif) libpostal contains an OSM-trained language classifier to detect which language(s) are used in a given address so it can apply the appropriate normalizations. The only input needed is the raw address string.