Merge branch 'master' into parser-data

2016-03-29 12:39:10 -04:00
parent 998c774405 ab63d18269
commit 2991772c36
6 changed files with 48 additions and 48 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -33,9 +33,9 @@ install:
 script:
    - ./configure --datadir=$(pwd)/data
    - make
-    - if [[ "$CC" == gcc* && $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
+    - if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
-    - if [[ "$CC" == gcc* && $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
+    - if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
-    - if [[ "$CC" == gcc* && $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
+    - if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
    - make check
 after_success:
    - |
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ equivalents suitable for search indexing, hashing, etc.
 Here's an interactive example using the Python binding:
-![expand](https://cloud.githubusercontent.com/assets/238455/13210468/b1676b72-d8fd-11e5-8557-ded418f6ffda.gif)
+![expand](https://cloud.githubusercontent.com/assets/238455/14115012/52990d14-f5a7-11e5-9797-159dacdf8c5f.gif)
 libpostal contains an OSM-trained language classifier to detect which language(s) are used in a given
 address so it can apply the appropriate normalizations. The only input needed is the raw address string. 
@@ -31,7 +31,7 @@ Here's a short list of some less straightforward normalizations in various langu
 | One-hundred twenty E 96th St        | 120 east 96th street                    |
 | C/ Ocho, P.I. 4                     | calle 8 polígono industrial 4           |
 | V XX Settembre, 20                  | via 20 settembre 20                     |
-| Quatre vignt douze R. de l'Église   | 92 rue de l' église                     |
+| Quatre vingt douze R. de l'Église   | 92 rue de l' église                     |
 | ул Каретный Ряд, д 4, строение 7    | улица каретныи ряд дом 4 строение 7     |
 | ул Каретный Ряд, д 4, строение 7    | ulitsa karetnyy ryad dom 4 stroyeniye 7 |
 | Marktstrasse 14                     | markt straße 14                         |
@@ -50,7 +50,7 @@ Here's an example using the Python bindings for succinctness (most of the higher
 ```python
 from postal.expand import expand_address
-expansions = expand_address('Quatre-vignt-douze Ave des Champs-Élysées')
+expansions = expand_address('Quatre-vingt-douze Ave des Champs-Élysées')
 assert '92 avenue des champs-elysees' in set(expansions)
 ```
@@ -70,7 +70,7 @@ int main(int argc, char **argv) {
    size_t num_expansions;
    normalize_options_t options = get_libpostal_default_options();
-    char **expansions = expand_address("Quatre-vignt-douze Ave des Champs-Élysées", options, &num_expansions);
+    char **expansions = expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions);
    for (size_t i = 0; i < num_expansions; i++) {
        printf("%s\n", expansions[i]);
@@ -220,7 +220,7 @@ After building libpostal:
 ```
 cd src/
-./libpostal "Quatre vignt douze Ave des Champs-Élysées"
+./libpostal "Quatre vingt douze Ave des Champs-Élysées"
 ```
 If you have a text file or stream with one address per line, the command-line interface also accepts input from stdin:
@@ -310,7 +310,7 @@ The dictionaries are also used to abbreviate canonical phrases like "Calle" => "
 (performed on both the language classifier and the address parser training sets)
 - **Numeric expression parsing** ("twenty first" => 21st, 
-"quatre-vignt-douze" => 92, again using data provided in CLDR), supports > 30
+"quatre-vingt-douze" => 92, again using data provided in CLDR), supports > 30
 languages. Handles languages with concatenated expressions e.g.
 milleottocento => 1800. Optionally normalizes Roman numerals regardless of the
 language (IX => 9) which occur in the names of many monarchs, popes, etc.
--- a/resources/numex/fr.json
+++ b/resources/numex/fr.json
@@ -173,28 +173,28 @@
            "right": "add"
        },
        {
-            "name": "quatre vignts",
+            "name": "quatre vingts",
            "value": 80,
            "type": "cardinal",
            "radix": 20,
            "right": "add"
        },
        {
-            "name": "quatrevignts",
+            "name": "quatrevingts",
            "value": 80,
            "type": "cardinal",
            "radix": 20,
            "right": "add"
        },
        {
-            "name": "quatre vignt",
+            "name": "quatre vingt",
            "value": 80,
            "type": "cardinal",
            "radix": 20,
            "right": "add"
        },
        {
-            "name": "quatrevignt",
+            "name": "quatrevingt",
            "value": 80,
            "type": "cardinal",
            "radix": 20,
@@ -814,23 +814,23 @@
            "category": "plural"
        }, 
        {
-            "name": "vigntieme", 
+            "name": "vingtieme", 
            "value": 20, 
            "type": "ordinal"
        }, 
        {
-            "name": "vigntième", 
+            "name": "vingtième", 
            "value": 20, 
            "type": "ordinal"
        }, 
        {
-            "name": "vigntiemes", 
+            "name": "vingtiemes", 
            "value": 20, 
            "type": "ordinal", 
            "category": "plural"
        }, 
        {
-            "name": "vigntièmes", 
+            "name": "vingtièmes", 
            "value": 20, 
            "type": "ordinal", 
            "category": "plural"
@@ -946,45 +946,45 @@
            "category": "plural"
        }, 
        {
-            "name": "quatre vigntieme", 
+            "name": "quatre vingtieme", 
            "value": 80, 
            "type": "ordinal"
        }, 
        {
-            "name": "quatre vigntième", 
+            "name": "quatre vingtième", 
            "value": 80, 
            "type": "ordinal"
        }, 
        {
-            "name": "quatre vigntiemes", 
+            "name": "quatre vingtiemes", 
            "value": 80, 
            "type": "ordinal", 
            "category": "plural"
        }, 
        {
-            "name": "quatre vigntièmes", 
+            "name": "quatre vingtièmes", 
            "value": 80, 
            "type": "ordinal", 
            "category": "plural"
        }, 
        {
-            "name": "quatrevigntieme", 
+            "name": "quatrevingtieme", 
            "value": 80, 
            "type": "ordinal"
        }, 
        {
-            "name": "quatrevigntième", 
+            "name": "quatrevingtième", 
            "value": 80, 
            "type": "ordinal"
        }, 
        {
-            "name": "quatrevigntiemes", 
+            "name": "quatrevingtiemes", 
            "value": 80, 
            "type": "ordinal", 
            "category": "plural"
        }, 
        {
-            "name": "quatrevigntièmes", 
+            "name": "quatrevingtièmes", 
            "value": 80, 
            "type": "ordinal", 
            "category": "plural"
@@ -1181,11 +1181,11 @@
                "7": ["es"],
                "8": ["es"],
                "9": ["es"]
            }
        }
    ],
    "stopwords": [
        "et"
    ]
-}
+
 }
--- a/scripts/geodata/numbers/numex.py
+++ b/scripts/geodata/numbers/numex.py
@@ -4,7 +4,7 @@ import sys
 import ujson as json
 this_dir = os.path.realpath(os.path.dirname(__file__))
-sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
 from geodata.encoding import safe_encode
 from geodata.i18n.unicode_paths import DATA_DIR
@@ -16,7 +16,7 @@ class InvalidNumexRuleException(Exception):
 NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                              'resources', 'numex')
-NUMEX_RULES_FILE = os.path.join(os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
+NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
 GENDER_MASCULINE = 'GENDER_MASCULINE'
 GENDER_FEMININE = 'GENDER_FEMININE'
--- a/src/numex_data.c
+++ b/src/numex_data.c
@@ -1527,10 +1527,10 @@ char *numex_keys[] = {
    "septante",
    "huitante",
    "octante",
-    "quatre vignts",
+    "quatre vingts",
-    "quatrevignts",
+    "quatrevingts",
-    "quatre vignt",
+    "quatre vingt",
-    "quatrevignt",
+    "quatrevingt",
    "nonante",
    "cent",
    "cents",
@@ -1639,10 +1639,10 @@ char *numex_keys[] = {
    "dixneuvième",
    "dixneuviemes",
    "dixneuvièmes",
-    "vigntieme",
+    "vingtieme",
-    "vigntième",
+    "vingtième",
-    "vigntiemes",
+    "vingtiemes",
-    "vigntièmes",
+    "vingtièmes",
    "trentieme",
    "trentième",
    "trentiemes",
@@ -1663,14 +1663,14 @@ char *numex_keys[] = {
    "septantième",
    "septantiemes",
    "septantièmes",
-    "quatre vigntieme",
+    "quatre vingtieme",
-    "quatre vigntième",
+    "quatre vingtième",
-    "quatre vigntiemes",
+    "quatre vingtiemes",
-    "quatre vigntièmes",
+    "quatre vingtièmes",
-    "quatrevigntieme",
+    "quatrevingtieme",
-    "quatrevigntième",
+    "quatrevingtième",
-    "quatrevigntiemes",
+    "quatrevingtiemes",
-    "quatrevigntièmes",
+    "quatrevingtièmes",
    "huitantieme",
    "huitantième",
    "huitantiemes",
--- a/test/test_numex.c
+++ b/test/test_numex.c
@@ -35,9 +35,9 @@ TEST test_numeric_expressions(void) {
    CHECK_CALL(test_numex("ten and four", "10 and 4", "en"));
    // French (Celtic-style) numbers
-    CHECK_CALL(test_numex("quatre-vignt-douze", "92", "fr"));
+    CHECK_CALL(test_numex("quatre-vingt-douze", "92", "fr"));
-    CHECK_CALL(test_numex("quatre vignt douze", "92", "fr"));
+    CHECK_CALL(test_numex("quatre vingt douze", "92", "fr"));
-    CHECK_CALL(test_numex("quatre vignts", "80", "fr"));
+    CHECK_CALL(test_numex("quatre vingts", "80", "fr"));
    CHECK_CALL(test_numex("soixante-et-onze", "71", "fr"));
    CHECK_CALL(test_numex("soixante-cinq", "65", "fr"));