diff --git a/.travis.yml b/.travis.yml index 5c485cca..f8f0892a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,9 +33,9 @@ install: script: - ./configure --datadir=$(pwd)/data - make - - if [[ "$CC" == gcc* && $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi; - - if [[ "$CC" == gcc* && $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi; - - if [[ "$CC" == gcc* && $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi; + - if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi; + - if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi; + - if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi; - make check after_success: - | diff --git a/README.md b/README.md index 517eee6a..01ec931d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ equivalents suitable for search indexing, hashing, etc. Here's an interactive example using the Python binding: -![expand](https://cloud.githubusercontent.com/assets/238455/13210468/b1676b72-d8fd-11e5-8557-ded418f6ffda.gif) +![expand](https://cloud.githubusercontent.com/assets/238455/14115012/52990d14-f5a7-11e5-9797-159dacdf8c5f.gif) libpostal contains an OSM-trained language classifier to detect which language(s) are used in a given address so it can apply the appropriate normalizations. The only input needed is the raw address string. @@ -31,7 +31,7 @@ Here's a short list of some less straightforward normalizations in various langu | One-hundred twenty E 96th St | 120 east 96th street | | C/ Ocho, P.I. 4 | calle 8 polígono industrial 4 | | V XX Settembre, 20 | via 20 settembre 20 | -| Quatre vignt douze R. de l'Église | 92 rue de l' église | +| Quatre vingt douze R. de l'Église | 92 rue de l' église | | ул Каретный Ряд, д 4, строение 7 | улица каретныи ряд дом 4 строение 7 | | ул Каретный Ряд, д 4, строение 7 | ulitsa karetnyy ryad dom 4 stroyeniye 7 | | Marktstrasse 14 | markt straße 14 | @@ -50,7 +50,7 @@ Here's an example using the Python bindings for succinctness (most of the higher ```python from postal.expand import expand_address -expansions = expand_address('Quatre-vignt-douze Ave des Champs-Élysées') +expansions = expand_address('Quatre-vingt-douze Ave des Champs-Élysées') assert '92 avenue des champs-elysees' in set(expansions) ``` @@ -70,7 +70,7 @@ int main(int argc, char **argv) { size_t num_expansions; normalize_options_t options = get_libpostal_default_options(); - char **expansions = expand_address("Quatre-vignt-douze Ave des Champs-Élysées", options, &num_expansions); + char **expansions = expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions); for (size_t i = 0; i < num_expansions; i++) { printf("%s\n", expansions[i]); @@ -220,7 +220,7 @@ After building libpostal: ``` cd src/ -./libpostal "Quatre vignt douze Ave des Champs-Élysées" +./libpostal "Quatre vingt douze Ave des Champs-Élysées" ``` If you have a text file or stream with one address per line, the command-line interface also accepts input from stdin: @@ -310,7 +310,7 @@ The dictionaries are also used to abbreviate canonical phrases like "Calle" => " (performed on both the language classifier and the address parser training sets) - **Numeric expression parsing** ("twenty first" => 21st, -"quatre-vignt-douze" => 92, again using data provided in CLDR), supports > 30 +"quatre-vingt-douze" => 92, again using data provided in CLDR), supports > 30 languages. Handles languages with concatenated expressions e.g. milleottocento => 1800. Optionally normalizes Roman numerals regardless of the language (IX => 9) which occur in the names of many monarchs, popes, etc. diff --git a/resources/numex/fr.json b/resources/numex/fr.json index 7dd32243..29b62851 100644 --- a/resources/numex/fr.json +++ b/resources/numex/fr.json @@ -173,28 +173,28 @@ "right": "add" }, { - "name": "quatre vignts", + "name": "quatre vingts", "value": 80, "type": "cardinal", "radix": 20, "right": "add" }, { - "name": "quatrevignts", + "name": "quatrevingts", "value": 80, "type": "cardinal", "radix": 20, "right": "add" }, { - "name": "quatre vignt", + "name": "quatre vingt", "value": 80, "type": "cardinal", "radix": 20, "right": "add" }, { - "name": "quatrevignt", + "name": "quatrevingt", "value": 80, "type": "cardinal", "radix": 20, @@ -814,23 +814,23 @@ "category": "plural" }, { - "name": "vigntieme", + "name": "vingtieme", "value": 20, "type": "ordinal" }, { - "name": "vigntième", + "name": "vingtième", "value": 20, "type": "ordinal" }, { - "name": "vigntiemes", + "name": "vingtiemes", "value": 20, "type": "ordinal", "category": "plural" }, { - "name": "vigntièmes", + "name": "vingtièmes", "value": 20, "type": "ordinal", "category": "plural" @@ -946,45 +946,45 @@ "category": "plural" }, { - "name": "quatre vigntieme", + "name": "quatre vingtieme", "value": 80, "type": "ordinal" }, { - "name": "quatre vigntième", + "name": "quatre vingtième", "value": 80, "type": "ordinal" }, { - "name": "quatre vigntiemes", + "name": "quatre vingtiemes", "value": 80, "type": "ordinal", "category": "plural" }, { - "name": "quatre vigntièmes", + "name": "quatre vingtièmes", "value": 80, "type": "ordinal", "category": "plural" }, { - "name": "quatrevigntieme", + "name": "quatrevingtieme", "value": 80, "type": "ordinal" }, { - "name": "quatrevigntième", + "name": "quatrevingtième", "value": 80, "type": "ordinal" }, { - "name": "quatrevigntiemes", + "name": "quatrevingtiemes", "value": 80, "type": "ordinal", "category": "plural" }, { - "name": "quatrevigntièmes", + "name": "quatrevingtièmes", "value": 80, "type": "ordinal", "category": "plural" @@ -1181,11 +1181,11 @@ "7": ["es"], "8": ["es"], "9": ["es"] - } } ], "stopwords": [ "et" ] -} \ No newline at end of file + +} diff --git a/scripts/geodata/numbers/numex.py b/scripts/geodata/numbers/numex.py index dba7bb72..279ea3ba 100644 --- a/scripts/geodata/numbers/numex.py +++ b/scripts/geodata/numbers/numex.py @@ -4,7 +4,7 @@ import sys import ujson as json this_dir = os.path.realpath(os.path.dirname(__file__)) -sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) +sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir))) from geodata.encoding import safe_encode from geodata.i18n.unicode_paths import DATA_DIR @@ -16,7 +16,7 @@ class InvalidNumexRuleException(Exception): NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources', 'numex') -NUMEX_RULES_FILE = os.path.join(os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c') +NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c') GENDER_MASCULINE = 'GENDER_MASCULINE' GENDER_FEMININE = 'GENDER_FEMININE' diff --git a/src/numex_data.c b/src/numex_data.c index b2c622ba..a45e1e3c 100644 --- a/src/numex_data.c +++ b/src/numex_data.c @@ -1527,10 +1527,10 @@ char *numex_keys[] = { "septante", "huitante", "octante", - "quatre vignts", - "quatrevignts", - "quatre vignt", - "quatrevignt", + "quatre vingts", + "quatrevingts", + "quatre vingt", + "quatrevingt", "nonante", "cent", "cents", @@ -1639,10 +1639,10 @@ char *numex_keys[] = { "dixneuvième", "dixneuviemes", "dixneuvièmes", - "vigntieme", - "vigntième", - "vigntiemes", - "vigntièmes", + "vingtieme", + "vingtième", + "vingtiemes", + "vingtièmes", "trentieme", "trentième", "trentiemes", @@ -1663,14 +1663,14 @@ char *numex_keys[] = { "septantième", "septantiemes", "septantièmes", - "quatre vigntieme", - "quatre vigntième", - "quatre vigntiemes", - "quatre vigntièmes", - "quatrevigntieme", - "quatrevigntième", - "quatrevigntiemes", - "quatrevigntièmes", + "quatre vingtieme", + "quatre vingtième", + "quatre vingtiemes", + "quatre vingtièmes", + "quatrevingtieme", + "quatrevingtième", + "quatrevingtiemes", + "quatrevingtièmes", "huitantieme", "huitantième", "huitantiemes", diff --git a/test/test_numex.c b/test/test_numex.c index f01708cb..5f0c7639 100644 --- a/test/test_numex.c +++ b/test/test_numex.c @@ -35,9 +35,9 @@ TEST test_numeric_expressions(void) { CHECK_CALL(test_numex("ten and four", "10 and 4", "en")); // French (Celtic-style) numbers - CHECK_CALL(test_numex("quatre-vignt-douze", "92", "fr")); - CHECK_CALL(test_numex("quatre vignt douze", "92", "fr")); - CHECK_CALL(test_numex("quatre vignts", "80", "fr")); + CHECK_CALL(test_numex("quatre-vingt-douze", "92", "fr")); + CHECK_CALL(test_numex("quatre vingt douze", "92", "fr")); + CHECK_CALL(test_numex("quatre vingts", "80", "fr")); CHECK_CALL(test_numex("soixante-et-onze", "71", "fr")); CHECK_CALL(test_numex("soixante-cinq", "65", "fr"));