[merge] merging in the Ohio expansion numex changes from master

This commit is contained in:
Al
2017-11-29 11:51:25 -05:00
16 changed files with 3526 additions and 2886 deletions

View File

@@ -1,8 +1,8 @@
# libpostal: international street address NLP # libpostal: international street address NLP
[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) [![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE) [![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) [![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[![OpenCollective](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors) [![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors)
[![OpenCollective](https://opencollective.com/libpostal/backers/badge.svg)](#backers) [![OpenCollective Backers](https://opencollective.com/libpostal/backers/badge.svg)](#backers)
libpostal is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data. The goal of this project is to understand location-based strings in every language, everywhere. For a more comprehensive overview of the research behind libpostal, be sure to check out the (lengthy) introductory blog posts: libpostal is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data. The goal of this project is to understand location-based strings in every language, everywhere. For a more comprehensive overview of the research behind libpostal, be sure to check out the (lengthy) introductory blog posts:

View File

@@ -1,9 +1,10 @@
apartment|aparment|apartmen|apt apartment|aparment|apartmen|apt
bangunan|bgn bangunan|bgn
dewan
kondominium|kondo kondominium|kondo
pangsapuri|p/puri pangsapuri|p/puri
dewan
rumah|rmh
rumah pangsa
residensi|residen residensi|residen
rumah|rmh
rumah kediaman
rumah pangsa
menara menara

View File

@@ -1,6 +1,8 @@
bank bank
syarikat
kelab bola sepak
berhad|bhd berhad|bhd
sendirian berhad|sdn bhd kelab
kelab bola sepak
persatuan persatuan
sendirian berhad|sdn bhd
syarikat
yayasan

View File

@@ -1 +1,3 @@
mohammad|muhammad|mohd mohammad|muhammad|mohd
suleiman|sulaiman
yusuf|yusof

View File

@@ -1 +1,92 @@
puteri ahli dewan undangan negeri|adun
bintara kanan
bintara muda
brigedier jeneral
cik
datin
datin
datin paduka
datin paduka patinggi
datin paduka seri
datin paduka seri panglima
datin paduka seri utama
dato
dato paduka
dato sri|dato seri
dato wira
datuk
datuk seri
doktor|dr
encik|en
fil marsyal
flait sarjan
hakim
jeneral
kadet kanan
kapten
kolonel
komander
korporal
korporal udara
laksamana
laksamana armada
laksamana madya
laksamana muda
laksamana pertama
lans korporal
laskar kanan
laskar kelas kedua
laskar kelas pertama
laskar muda
laskar udara kanan
laskar udara kelas kedua
laskar udara kelas pertama
leftenan
leftenan jeneral
leftenan kolonel
leftenan komander
leftenan madya
leftenan muda
marsyal tentera udara
mejar
mejar jeneral
menteri
naib pengerusi
parajurit muda
pegawai
pegawai kadet
pegawai waran kelas kedua
pegawai waran kelas pertama
pengerusi
perdana menteri
prebet
presiden
puan|pn
puan sri
putera|putra
putera|putra
puteri|putri
puteri|putri
raja
rekrut
sarjan
sarjan udara
sharifah
staff sarjan
sultan
syed
tan sri
tengku
timbalan perdana menteri
timbalan perdana menteri
timbalan presiden
toh puan
to puan
tuan
tun
wakil
wakil-wakil
yang berhormat
yang di-pertuan agong|agong
yang di-pertuan besar
yang di-pertua negeri

View File

@@ -1,4 +1,145 @@
aiskrim
akademi
akuarium
arked
asrama
auditorium
balai bomba
balai polis
bangunan perbandaran
bank
berek
biara
bilik|blk
bilik kebal
dapur
dermaga
dewan
dewan bandar
dewan bandar
dewan konsert
dewan muzik
doktor-doktor
doktor haiwan|dr haiwan
empangan
farmasi
galeri
galeri seni
garaj
gereja
gim|gimnasium
hospital
hospital haiwan
hostel
ibu pejabat|ibu pej
institut
istana
jabatan bomba
jabatan polis
jawatankuasa|jawatan kuasa
jelapang jelapang
jeti
kafe
kampung|kampong|kg
kampus
kasino
kawasan perindustrian|kaw perindustri|kawasan industri
kedai
kedai buku
kedutaan
kejururawatan
kelab
kelab golf
kelab malam
kelab sosial
kilang
kiropraktik
kitar semula
klinik
kolam renang
kolam renang awam
kolej
kompleks
kompleks
komuniti
kuarters|kuarter
ladang pertanian|ladang
lapangan terbang|lpg terbang
liga
mahkamah
makmal
masjid masjid
mata air
menara
menara pejabat
misi
monumen
muzium
nurseri
padang golf
pagar
panggung
panggung wayang gambar|pawagam
pangkalan tentera udara
pantai
parkir
pasar pasar
sekolah pasaran
pasar raya|pasaraya
pasar tani|pasar petani
pawagam
pediatrik
pejabat
pejabat pos
pelabuhan
pelabuhan
pembangunan
pengedar
penjara
penjara
perpustakaan
perubatan
pintu gerbang
pintu pagar
politeknik
pra sekolah
presint
pusat
pusat bandar|pusat bandaraya
pusat belia
pusat kebudayaan
pusat kecergasan
pusat kejururawatan
pusat kesihatan
pusat komuniti
pusat membeli-belah|pusat beli-belah
pusat penjagaan
pusat seni
pusat seni persembahan
residen
restoran
rumah kediaman
rumah kelab
rumah orang tua
rumah pangsa|pangsapuri|p/puri
rumah|rmh
rumah sakit
salon kecantikan
sekolah menengah|sekolah men|sek menengah|sek men
sekolah rendah|sekolah ren|sek rendah|sek ren
sekolah|sek
stesen minyak
stor
studio tarian
surau
taman
taman didikan kanak-kanak|tadika
taman negara
taman perindustrian|taman industri|tmn ind|tmn perindustrian
tanah perkuburan
tapak letak kereta
teh
tempat letak kereta
teres
universiti
veterinar

View File

@@ -1,3 +1,6 @@
blok
kampung|kampong|kg
nusa nusa
penampang penampang
pulau pulau
seksyen

View File

@@ -1,17 +1,35 @@
awang|awg awang|awg
bulatan bulatan
cerunan|crn
changkat|ckt
dalaman|dlm
dataran|dtr
dayang|dyg dayang|dyg
denai denai
haji|hj haji|hj
halaman|hlm
hilir|hlr
jalan|jln|jl jalan|jln|jl
lapangan kampung|kampong|kg
laluan laluan
lebuhraya laman|lmn
lengkok langgak|lgk
linkaran lapangan
lorong lebuh|lbh|luh
kampong|kg lebuhraya|lebuh raya|lbh raya|l/raya|lbr
lengkok|lkk
lingkaran|lkr
lintang|ltg
lorong|lrg
medan|mdn
pengiran|pg pengiran|pg
persiaran|psn|psrn
rapat rapat
simpangan|spn
simpang|spg simpang|spg
sngai|sg sisiran|ssr
solok|slk
sungai|sngai|sg
taman|tmn
tepian|tpn
tingkat|tkt

View File

@@ -1,23 +1,27 @@
timur|timor
air|ayer air|ayer
kampung|kampong|kg bandar|bdr
itam|hitam
tanjung|tanjong
sri|seri
tasik|tasek
dwitasik|dwitasek
fasa|phasa
selasih|selaseh
putera|putra
puteri|putri
kecil|kechil
sungai|sungei|sg
teluk|telok
yusuf|yusof
batu|bt batu|bt
bukit|bkt bukit|bkt
buluh|buloh buluh|buloh
jelutung|jelutong
tanduk|tandok
chempaka|cempaka chempaka|cempaka
dwitasik|dwitasek
fasa|phasa
glugor|gelugor
itam|hitam
jelutung|jelutong
kampung|kampong|kg
kawasan|kaw
kecil|kechil
manjalara|menjalara
putera|putra
puteri|putri
selasih|selaseh
sri|seri
suleiman|sulaiman suleiman|sulaiman
sungai|sungei|sg
tanduk|tandok
tanjung|tanjong
tasik|tasek
teluk|telok
timur|timor
yusuf|yusof

View File

@@ -9,6 +9,7 @@
name: "oh" name: "oh"
value: 0 value: 0
type: "cardinal" type: "cardinal"
left: "concat_only_if_number"
- -
name: "one" name: "one"
value: 1 value: 1

View File

@@ -47,11 +47,13 @@ category_map = {
LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY' LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD' LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER'
LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE' LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
left_context_map = { left_context_map = {
'add': LEFT_CONTEXT_ADD, 'add': LEFT_CONTEXT_ADD,
'multiply': LEFT_CONTEXT_MULTIPLY, 'multiply': LEFT_CONTEXT_MULTIPLY,
'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER,
None: LEFT_CONTEXT_NONE, None: LEFT_CONTEXT_NONE,
} }

File diff suppressed because it is too large Load Diff

View File

@@ -133,7 +133,7 @@ download_file() {
for subdir in $subdirs; do for subdir in $subdirs; do
rm -rf $data_dir/$subdir; rm -rf $data_dir/$subdir;
done done
tar -xvzf $local_path -C $data_dir; tar -xvzf $local_path --no-same-owner -C $data_dir;
rm $local_path; rm $local_path;
else else
echo "libpostal $name up to date" echo "libpostal $name up to date"

View File

@@ -709,6 +709,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
bool possible_complete_token = false; bool possible_complete_token = false;
bool complete_token = false; bool complete_token = false;
bool prev_rule_was_number = false;
log_debug("Converting numex for str=%s, lang=%s\n", str, lang); log_debug("Converting numex for str=%s, lang=%s\n", str, lang);
while (idx < len) { while (idx < len) {
@@ -844,22 +846,41 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) { FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
result.value += rule.value; result.value += rule.value;
log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value); log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value);
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) { } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && (!whole_tokens_only || complete_token)) {
log_debug("Had previous token with no context, finishing previous rule before returning\n"); log_debug("Had previous token with no context, finishing previous rule before returning\n");
if (!whole_tokens_only || complete_token) { result.len = prev_result_len;
result.len = prev_result_len; number_finished = true;
number_finished = true; complete_token = false;
complete_token = false; advance_index = false;
advance_index = false; state = start_state;
state = start_state; prev_rule_was_number = true;
rule = prev_rule = NUMEX_NULL_RULE; rule = prev_rule = NUMEX_NULL_RULE;
prev_result_len = 0; prev_result_len = 0;
} else { } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && whole_tokens_only && !complete_token) {
rule = NUMEX_NULL_RULE; log_debug("whole_tokens_only = %d, complete_token = %d\n", whole_tokens_only, complete_token);
last_was_separator = false; rule = NUMEX_NULL_RULE;
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN; last_was_separator = false;
continue; prev_rule_was_number = false;
} state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
continue;
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && !prev_rule_was_number) {
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, no context\n");
prev_rule = rule;
last_was_separator = false;
rule = NUMEX_NULL_RULE;
prev_result_len = result.len;
result = NULL_NUMEX_RESULT;
stopword_phrase = NULL_PHRASE;
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
last_was_stopword = false;
continue;
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && prev_rule_was_number) {
last_was_separator = false;
number_finished = true;
state = start_state;
last_was_stopword = false;
prev_rule_was_number = true;
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, value = %" PRId64 "\n", result.value);
} else if (rule.rule_type != NUMEX_STOPWORD) { } else if (rule.rule_type != NUMEX_STOPWORD) {
result.value = rule.value; result.value = rule.value;
log_debug("Got number, result.value=%" PRId64 "\n", result.value); log_debug("Got number, result.value=%" PRId64 "\n", result.value);
@@ -871,6 +892,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
continue; continue;
} }
prev_rule_was_number = prev_rule_was_number || prev_rule.rule_type != NUMEX_NULL;
if (rule.rule_type != NUMEX_STOPWORD) { if (rule.rule_type != NUMEX_STOPWORD) {
prev_rule = rule; prev_rule = rule;
prev_result_len = result.len; prev_result_len = result.len;
@@ -903,7 +926,6 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
if (prev_rule.rule_type != NUMEX_NULL) { if (prev_rule.rule_type != NUMEX_NULL) {
number_finished = true; number_finished = true;
} }
} }
if (!set_rule) { if (!set_rule) {
@@ -926,6 +948,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
log_debug("Adding phrase, value=%" PRId64 "\n", result.value); log_debug("Adding phrase, value=%" PRId64 "\n", result.value);
result = NULL_NUMEX_RESULT; result = NULL_NUMEX_RESULT;
number_finished = false; number_finished = false;
rule = prev_rule = NUMEX_NULL_RULE;
} }
prev_state = state; prev_state = state;
@@ -1150,7 +1173,6 @@ char *replace_numeric_expressions(char *str, char *lang) {
char_array_append(replacement, ordinal_suffix); char_array_append(replacement, ordinal_suffix);
} }
} }
start = result.start + result.len; start = result.start + result.len;
} }

View File

@@ -50,7 +50,8 @@ typedef enum {
typedef enum { typedef enum {
NUMEX_LEFT_CONTEXT_NONE, NUMEX_LEFT_CONTEXT_NONE,
NUMEX_LEFT_CONTEXT_ADD, NUMEX_LEFT_CONTEXT_ADD,
NUMEX_LEFT_CONTEXT_MULTIPLY NUMEX_LEFT_CONTEXT_MULTIPLY,
NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER
} numex_left_context; } numex_left_context;
typedef enum { typedef enum {

View File

@@ -82,6 +82,8 @@ TEST test_expansions(void) {
CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de")); CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de"));
CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl")); CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));