Merge branch 'master' into parser-data
This commit is contained in:
@@ -33,9 +33,9 @@ install:
|
|||||||
script:
|
script:
|
||||||
- ./configure --datadir=$(pwd)/data
|
- ./configure --datadir=$(pwd)/data
|
||||||
- make
|
- make
|
||||||
- if [[ "$CC" == gcc* && $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
|
- if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
|
||||||
- if [[ "$CC" == gcc* && $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
|
- if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
|
||||||
- if [[ "$CC" == gcc* && $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
|
- if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
|
||||||
- make check
|
- make check
|
||||||
after_success:
|
after_success:
|
||||||
- |
|
- |
|
||||||
|
|||||||
12
README.md
12
README.md
@@ -20,7 +20,7 @@ equivalents suitable for search indexing, hashing, etc.
|
|||||||
|
|
||||||
Here's an interactive example using the Python binding:
|
Here's an interactive example using the Python binding:
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
libpostal contains an OSM-trained language classifier to detect which language(s) are used in a given
|
libpostal contains an OSM-trained language classifier to detect which language(s) are used in a given
|
||||||
address so it can apply the appropriate normalizations. The only input needed is the raw address string.
|
address so it can apply the appropriate normalizations. The only input needed is the raw address string.
|
||||||
@@ -31,7 +31,7 @@ Here's a short list of some less straightforward normalizations in various langu
|
|||||||
| One-hundred twenty E 96th St | 120 east 96th street |
|
| One-hundred twenty E 96th St | 120 east 96th street |
|
||||||
| C/ Ocho, P.I. 4 | calle 8 polígono industrial 4 |
|
| C/ Ocho, P.I. 4 | calle 8 polígono industrial 4 |
|
||||||
| V XX Settembre, 20 | via 20 settembre 20 |
|
| V XX Settembre, 20 | via 20 settembre 20 |
|
||||||
| Quatre vignt douze R. de l'Église | 92 rue de l' église |
|
| Quatre vingt douze R. de l'Église | 92 rue de l' église |
|
||||||
| ул Каретный Ряд, д 4, строение 7 | улица каретныи ряд дом 4 строение 7 |
|
| ул Каретный Ряд, д 4, строение 7 | улица каретныи ряд дом 4 строение 7 |
|
||||||
| ул Каретный Ряд, д 4, строение 7 | ulitsa karetnyy ryad dom 4 stroyeniye 7 |
|
| ул Каретный Ряд, д 4, строение 7 | ulitsa karetnyy ryad dom 4 stroyeniye 7 |
|
||||||
| Marktstrasse 14 | markt straße 14 |
|
| Marktstrasse 14 | markt straße 14 |
|
||||||
@@ -50,7 +50,7 @@ Here's an example using the Python bindings for succinctness (most of the higher
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from postal.expand import expand_address
|
from postal.expand import expand_address
|
||||||
expansions = expand_address('Quatre-vignt-douze Ave des Champs-Élysées')
|
expansions = expand_address('Quatre-vingt-douze Ave des Champs-Élysées')
|
||||||
|
|
||||||
assert '92 avenue des champs-elysees' in set(expansions)
|
assert '92 avenue des champs-elysees' in set(expansions)
|
||||||
```
|
```
|
||||||
@@ -70,7 +70,7 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
size_t num_expansions;
|
size_t num_expansions;
|
||||||
normalize_options_t options = get_libpostal_default_options();
|
normalize_options_t options = get_libpostal_default_options();
|
||||||
char **expansions = expand_address("Quatre-vignt-douze Ave des Champs-Élysées", options, &num_expansions);
|
char **expansions = expand_address("Quatre-vingt-douze Ave des Champs-Élysées", options, &num_expansions);
|
||||||
|
|
||||||
for (size_t i = 0; i < num_expansions; i++) {
|
for (size_t i = 0; i < num_expansions; i++) {
|
||||||
printf("%s\n", expansions[i]);
|
printf("%s\n", expansions[i]);
|
||||||
@@ -220,7 +220,7 @@ After building libpostal:
|
|||||||
```
|
```
|
||||||
cd src/
|
cd src/
|
||||||
|
|
||||||
./libpostal "Quatre vignt douze Ave des Champs-Élysées"
|
./libpostal "Quatre vingt douze Ave des Champs-Élysées"
|
||||||
```
|
```
|
||||||
|
|
||||||
If you have a text file or stream with one address per line, the command-line interface also accepts input from stdin:
|
If you have a text file or stream with one address per line, the command-line interface also accepts input from stdin:
|
||||||
@@ -310,7 +310,7 @@ The dictionaries are also used to abbreviate canonical phrases like "Calle" => "
|
|||||||
(performed on both the language classifier and the address parser training sets)
|
(performed on both the language classifier and the address parser training sets)
|
||||||
|
|
||||||
- **Numeric expression parsing** ("twenty first" => 21st,
|
- **Numeric expression parsing** ("twenty first" => 21st,
|
||||||
"quatre-vignt-douze" => 92, again using data provided in CLDR), supports > 30
|
"quatre-vingt-douze" => 92, again using data provided in CLDR), supports > 30
|
||||||
languages. Handles languages with concatenated expressions e.g.
|
languages. Handles languages with concatenated expressions e.g.
|
||||||
milleottocento => 1800. Optionally normalizes Roman numerals regardless of the
|
milleottocento => 1800. Optionally normalizes Roman numerals regardless of the
|
||||||
language (IX => 9) which occur in the names of many monarchs, popes, etc.
|
language (IX => 9) which occur in the names of many monarchs, popes, etc.
|
||||||
|
|||||||
@@ -173,28 +173,28 @@
|
|||||||
"right": "add"
|
"right": "add"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatre vignts",
|
"name": "quatre vingts",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "cardinal",
|
"type": "cardinal",
|
||||||
"radix": 20,
|
"radix": 20,
|
||||||
"right": "add"
|
"right": "add"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatrevignts",
|
"name": "quatrevingts",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "cardinal",
|
"type": "cardinal",
|
||||||
"radix": 20,
|
"radix": 20,
|
||||||
"right": "add"
|
"right": "add"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatre vignt",
|
"name": "quatre vingt",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "cardinal",
|
"type": "cardinal",
|
||||||
"radix": 20,
|
"radix": 20,
|
||||||
"right": "add"
|
"right": "add"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatrevignt",
|
"name": "quatrevingt",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "cardinal",
|
"type": "cardinal",
|
||||||
"radix": 20,
|
"radix": 20,
|
||||||
@@ -814,23 +814,23 @@
|
|||||||
"category": "plural"
|
"category": "plural"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "vigntieme",
|
"name": "vingtieme",
|
||||||
"value": 20,
|
"value": 20,
|
||||||
"type": "ordinal"
|
"type": "ordinal"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "vigntième",
|
"name": "vingtième",
|
||||||
"value": 20,
|
"value": 20,
|
||||||
"type": "ordinal"
|
"type": "ordinal"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "vigntiemes",
|
"name": "vingtiemes",
|
||||||
"value": 20,
|
"value": 20,
|
||||||
"type": "ordinal",
|
"type": "ordinal",
|
||||||
"category": "plural"
|
"category": "plural"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "vigntièmes",
|
"name": "vingtièmes",
|
||||||
"value": 20,
|
"value": 20,
|
||||||
"type": "ordinal",
|
"type": "ordinal",
|
||||||
"category": "plural"
|
"category": "plural"
|
||||||
@@ -946,45 +946,45 @@
|
|||||||
"category": "plural"
|
"category": "plural"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatre vigntieme",
|
"name": "quatre vingtieme",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "ordinal"
|
"type": "ordinal"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatre vigntième",
|
"name": "quatre vingtième",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "ordinal"
|
"type": "ordinal"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatre vigntiemes",
|
"name": "quatre vingtiemes",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "ordinal",
|
"type": "ordinal",
|
||||||
"category": "plural"
|
"category": "plural"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatre vigntièmes",
|
"name": "quatre vingtièmes",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "ordinal",
|
"type": "ordinal",
|
||||||
"category": "plural"
|
"category": "plural"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatrevigntieme",
|
"name": "quatrevingtieme",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "ordinal"
|
"type": "ordinal"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatrevigntième",
|
"name": "quatrevingtième",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "ordinal"
|
"type": "ordinal"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatrevigntiemes",
|
"name": "quatrevingtiemes",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "ordinal",
|
"type": "ordinal",
|
||||||
"category": "plural"
|
"category": "plural"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "quatrevigntièmes",
|
"name": "quatrevingtièmes",
|
||||||
"value": 80,
|
"value": 80,
|
||||||
"type": "ordinal",
|
"type": "ordinal",
|
||||||
"category": "plural"
|
"category": "plural"
|
||||||
@@ -1181,11 +1181,11 @@
|
|||||||
"7": ["es"],
|
"7": ["es"],
|
||||||
"8": ["es"],
|
"8": ["es"],
|
||||||
"9": ["es"]
|
"9": ["es"]
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"stopwords": [
|
"stopwords": [
|
||||||
"et"
|
"et"
|
||||||
]
|
]
|
||||||
}
|
|
||||||
|
}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import sys
|
|||||||
import ujson as json
|
import ujson as json
|
||||||
|
|
||||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||||
|
|
||||||
from geodata.encoding import safe_encode
|
from geodata.encoding import safe_encode
|
||||||
from geodata.i18n.unicode_paths import DATA_DIR
|
from geodata.i18n.unicode_paths import DATA_DIR
|
||||||
@@ -16,7 +16,7 @@ class InvalidNumexRuleException(Exception):
|
|||||||
NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
'resources', 'numex')
|
'resources', 'numex')
|
||||||
|
|
||||||
NUMEX_RULES_FILE = os.path.join(os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
|
NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
|
||||||
|
|
||||||
GENDER_MASCULINE = 'GENDER_MASCULINE'
|
GENDER_MASCULINE = 'GENDER_MASCULINE'
|
||||||
GENDER_FEMININE = 'GENDER_FEMININE'
|
GENDER_FEMININE = 'GENDER_FEMININE'
|
||||||
|
|||||||
@@ -1527,10 +1527,10 @@ char *numex_keys[] = {
|
|||||||
"septante",
|
"septante",
|
||||||
"huitante",
|
"huitante",
|
||||||
"octante",
|
"octante",
|
||||||
"quatre vignts",
|
"quatre vingts",
|
||||||
"quatrevignts",
|
"quatrevingts",
|
||||||
"quatre vignt",
|
"quatre vingt",
|
||||||
"quatrevignt",
|
"quatrevingt",
|
||||||
"nonante",
|
"nonante",
|
||||||
"cent",
|
"cent",
|
||||||
"cents",
|
"cents",
|
||||||
@@ -1639,10 +1639,10 @@ char *numex_keys[] = {
|
|||||||
"dixneuvième",
|
"dixneuvième",
|
||||||
"dixneuviemes",
|
"dixneuviemes",
|
||||||
"dixneuvièmes",
|
"dixneuvièmes",
|
||||||
"vigntieme",
|
"vingtieme",
|
||||||
"vigntième",
|
"vingtième",
|
||||||
"vigntiemes",
|
"vingtiemes",
|
||||||
"vigntièmes",
|
"vingtièmes",
|
||||||
"trentieme",
|
"trentieme",
|
||||||
"trentième",
|
"trentième",
|
||||||
"trentiemes",
|
"trentiemes",
|
||||||
@@ -1663,14 +1663,14 @@ char *numex_keys[] = {
|
|||||||
"septantième",
|
"septantième",
|
||||||
"septantiemes",
|
"septantiemes",
|
||||||
"septantièmes",
|
"septantièmes",
|
||||||
"quatre vigntieme",
|
"quatre vingtieme",
|
||||||
"quatre vigntième",
|
"quatre vingtième",
|
||||||
"quatre vigntiemes",
|
"quatre vingtiemes",
|
||||||
"quatre vigntièmes",
|
"quatre vingtièmes",
|
||||||
"quatrevigntieme",
|
"quatrevingtieme",
|
||||||
"quatrevigntième",
|
"quatrevingtième",
|
||||||
"quatrevigntiemes",
|
"quatrevingtiemes",
|
||||||
"quatrevigntièmes",
|
"quatrevingtièmes",
|
||||||
"huitantieme",
|
"huitantieme",
|
||||||
"huitantième",
|
"huitantième",
|
||||||
"huitantiemes",
|
"huitantiemes",
|
||||||
|
|||||||
@@ -35,9 +35,9 @@ TEST test_numeric_expressions(void) {
|
|||||||
CHECK_CALL(test_numex("ten and four", "10 and 4", "en"));
|
CHECK_CALL(test_numex("ten and four", "10 and 4", "en"));
|
||||||
|
|
||||||
// French (Celtic-style) numbers
|
// French (Celtic-style) numbers
|
||||||
CHECK_CALL(test_numex("quatre-vignt-douze", "92", "fr"));
|
CHECK_CALL(test_numex("quatre-vingt-douze", "92", "fr"));
|
||||||
CHECK_CALL(test_numex("quatre vignt douze", "92", "fr"));
|
CHECK_CALL(test_numex("quatre vingt douze", "92", "fr"));
|
||||||
CHECK_CALL(test_numex("quatre vignts", "80", "fr"));
|
CHECK_CALL(test_numex("quatre vingts", "80", "fr"));
|
||||||
CHECK_CALL(test_numex("soixante-et-onze", "71", "fr"));
|
CHECK_CALL(test_numex("soixante-et-onze", "71", "fr"));
|
||||||
CHECK_CALL(test_numex("soixante-cinq", "65", "fr"));
|
CHECK_CALL(test_numex("soixante-cinq", "65", "fr"));
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user