[merge] merging in the Ohio expansion numex changes from master

This commit is contained in:
Al
2017-11-29 11:51:25 -05:00
16 changed files with 3526 additions and 2886 deletions

View File

@@ -1,8 +1,8 @@
# libpostal: international street address NLP
[![Build Status](https://travis-ci.org/openvenues/libpostal.svg?branch=master)](https://travis-ci.org/openvenues/libpostal) [![License](https://img.shields.io/github/license/openvenues/libpostal.svg)](https://github.com/openvenues/libpostal/blob/master/LICENSE)
[![OpenCollective](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors)
[![OpenCollective](https://opencollective.com/libpostal/backers/badge.svg)](#backers)
[![OpenCollective Sponsors](https://opencollective.com/libpostal/sponsors/badge.svg)](#sponsors)
[![OpenCollective Backers](https://opencollective.com/libpostal/backers/badge.svg)](#backers)
libpostal is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data. The goal of this project is to understand location-based strings in every language, everywhere. For a more comprehensive overview of the research behind libpostal, be sure to check out the (lengthy) introductory blog posts:

View File

@@ -1,9 +1,10 @@
apartment|aparment|apartmen|apt
bangunan|bgn
dewan
kondominium|kondo
pangsapuri|p/puri
dewan
rumah|rmh
rumah pangsa
residensi|residen
rumah|rmh
rumah kediaman
rumah pangsa
menara

View File

@@ -1,6 +1,8 @@
bank
syarikat
kelab bola sepak
berhad|bhd
sendirian berhad|sdn bhd
kelab
kelab bola sepak
persatuan
sendirian berhad|sdn bhd
syarikat
yayasan

View File

@@ -1 +1,3 @@
mohammad|muhammad|mohd
suleiman|sulaiman
yusuf|yusof

View File

@@ -1 +1,92 @@
puteri
ahli dewan undangan negeri|adun
bintara kanan
bintara muda
brigedier jeneral
cik
datin
datin
datin paduka
datin paduka patinggi
datin paduka seri
datin paduka seri panglima
datin paduka seri utama
dato
dato paduka
dato sri|dato seri
dato wira
datuk
datuk seri
doktor|dr
encik|en
fil marsyal
flait sarjan
hakim
jeneral
kadet kanan
kapten
kolonel
komander
korporal
korporal udara
laksamana
laksamana armada
laksamana madya
laksamana muda
laksamana pertama
lans korporal
laskar kanan
laskar kelas kedua
laskar kelas pertama
laskar muda
laskar udara kanan
laskar udara kelas kedua
laskar udara kelas pertama
leftenan
leftenan jeneral
leftenan kolonel
leftenan komander
leftenan madya
leftenan muda
marsyal tentera udara
mejar
mejar jeneral
menteri
naib pengerusi
parajurit muda
pegawai
pegawai kadet
pegawai waran kelas kedua
pegawai waran kelas pertama
pengerusi
perdana menteri
prebet
presiden
puan|pn
puan sri
putera|putra
putera|putra
puteri|putri
puteri|putri
raja
rekrut
sarjan
sarjan udara
sharifah
staff sarjan
sultan
syed
tan sri
tengku
timbalan perdana menteri
timbalan perdana menteri
timbalan presiden
toh puan
to puan
tuan
tun
wakil
wakil-wakil
yang berhormat
yang di-pertuan agong|agong
yang di-pertuan besar
yang di-pertua negeri

View File

@@ -1,4 +1,145 @@
aiskrim
akademi
akuarium
arked
asrama
auditorium|auditorum
balai bomba
balai polis
bangunan perbandaran
bank
berek
biara
bilik|blk
bilik kebal
dapur
dermaga
dewan
dewan bandar
dewan bandar
dewan konsert
dewan muzik
doktor-doktor
doktor haiwan|dr haiwan
empangan
farmasi
galeri
galeri seni
garaj
gereja
gim|gimnasium
hospital
hospital haiwan
hostel
ibu pejabat|ibu pej
institut
istana
jabatan bomba
jabatan polis
jawatankuasa|jawatan kuasa
jelapang
jeti
kafe
kampung|kampong|kg
kampus
kasino
kawasan perindustrian|kaw perindustri|kawasan industri
kedai
kedai buku
kedutaan
kejururawatan
kelab
kelab golf
kelab malam
kelab sosial
kilang
kiropraktik
kitar semula
klinik
kolam renang
kolam renang awam
kolej
kompleks
kompleks
komuniti
kuarters|kuarter
ladang pertanian|ladang
lapangan terbang|lpg terbang
liga
mahkamah
makmal
masjid
mata air
menara
menara pejabat
misi
monumen
muzium
nurseri
padang golf
pagar
panggung
panggung wayang gambar|pawagam
pangkalan tentera udara
pantai
parkir
pasar
sekolah
pasaran
pasar raya|pasaraya
pasar tani|pasar petani
pawagam
pediatrik
pejabat
pejabat pos
pelabuhan
pelabuhan
pembangunan
pengedar
penjara
penjara
perpustakaan
perubatan
pintu gerbang
pintu pagar
politeknik
pra sekolah
presint
pusat
pusat bandar|pusat bandaraya
pusat belia
pusat kebudayaan
pusat kecergasan
pusat kejururawatan
pusat kesihatan
pusat komuniti
pusat membeli-belah|pusat beli-belah
pusat penjagaan
pusat seni
pusat seni persembahan
residen
restoran
rumah kediaman
rumah kelab
rumah orang tua
rumah pangsa|pangsapuri|p/puri
rumah|rmh
rumah sakit
salon kecantikan
sekolah menengah|sekolah menegah|sekolah men|sek menengah|sek men
sekolah rendah|sekolah ren|sek rendah|sek ren
sekolah|sek
stesen minyak
stor
studio tarian
surau
taman
taman didikan kanak-kanak|tadika
taman negara
taman perindustrian|taman industri|tmn ind|tmn perindustrian
tanah perkuburan
tapak letak kereta
teh
tempat letak kereta
teres
universiti
veterinar

View File

@@ -1,3 +1,6 @@
blok
kampung|kampong|kg
nusa
penampang
pulau
seksyen

View File

@@ -1,17 +1,35 @@
awang|awg
bulatan
cerunan|crn
changkat|ckt
dalaman|dlm
dataran|dtr
dayang|dyg
denai
haji|hj
halaman|hlm
hilir|hlr
jalan|jln|jl
lapangan
kampung|kampong|kg
laluan
lebuhraya
lengkok
linkaran
lorong
kampong|kg
laman|lmn
langgak|lgk
lapangan
lebuh|lbh|luh
lebuhraya|lebuh raya|lbh raya|l/raya|lbr
lengkok|lkk
lingkaran|lkr
lintang|ltg
lorong|lrg
medan|mdn
pengiran|pg
persiaran|psn|psrn
rapat
simpangan|spn
simpang|spg
sngai|sg
sisiran|ssr
solok|slk
sungai|sngai|sg
taman|tmn
tepian|tpn
tingkat|tkt

View File

@@ -1,23 +1,27 @@
timur|timor
air|ayer
kampung|kampong|kg
itam|hitam
tanjung|tanjong
sri|seri
tasik|tasek
dwitasik|dwitasek
fasa|phasa
selasih|selaseh
putera|putra
puteri|putri
kecil|kechil
sungai|sungei|sg
teluk|telok
yusuf|yusof
bandar|bdr
batu|bt
bukit|bkt
buluh|buloh
jelutung|jelutong
tanduk|tandok
chempaka|cempaka
dwitasik|dwitasek
fasa|phasa
glugor|gelugor
itam|hitam
jelutung|jelutong
kampung|kampong|kg
kawasan|kaw
kecil|kechil
manjalara|menjalara
putera|putra
puteri|putri
selasih|selaseh
sri|seri
suleiman|sulaiman
sungai|sungei|sg
tanduk|tandok
tanjung|tanjong
tasik|tasek
teluk|telok
timur|timor
yusuf|yusof

View File

@@ -9,6 +9,7 @@
name: "oh"
value: 0
type: "cardinal"
left: "concat_only_if_number"
-
name: "one"
value: 1

View File

@@ -47,11 +47,13 @@ category_map = {
LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER'
LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
left_context_map = {
'add': LEFT_CONTEXT_ADD,
'multiply': LEFT_CONTEXT_MULTIPLY,
'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER,
None: LEFT_CONTEXT_NONE,
}

File diff suppressed because it is too large Load Diff

View File

@@ -133,7 +133,7 @@ download_file() {
for subdir in $subdirs; do
rm -rf $data_dir/$subdir;
done
tar -xvzf $local_path -C $data_dir;
tar -xvzf $local_path --no-same-owner -C $data_dir;
rm $local_path;
else
echo "libpostal $name up to date"

View File

@@ -709,6 +709,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
bool possible_complete_token = false;
bool complete_token = false;
bool prev_rule_was_number = false;
log_debug("Converting numex for str=%s, lang=%s\n", str, lang);
while (idx < len) {
@@ -844,22 +846,41 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
result.value += rule.value;
log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value);
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) {
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && (!whole_tokens_only || complete_token)) {
log_debug("Had previous token with no context, finishing previous rule before returning\n");
if (!whole_tokens_only || complete_token) {
result.len = prev_result_len;
number_finished = true;
complete_token = false;
advance_index = false;
state = start_state;
prev_rule_was_number = true;
rule = prev_rule = NUMEX_NULL_RULE;
prev_result_len = 0;
} else {
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && whole_tokens_only && !complete_token) {
log_debug("whole_tokens_only = %d, complete_token = %d\n", whole_tokens_only, complete_token);
rule = NUMEX_NULL_RULE;
last_was_separator = false;
prev_rule_was_number = false;
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
continue;
}
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && !prev_rule_was_number) {
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, no context\n");
prev_rule = rule;
last_was_separator = false;
rule = NUMEX_NULL_RULE;
prev_result_len = result.len;
result = NULL_NUMEX_RESULT;
stopword_phrase = NULL_PHRASE;
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
last_was_stopword = false;
continue;
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && prev_rule_was_number) {
last_was_separator = false;
number_finished = true;
state = start_state;
last_was_stopword = false;
prev_rule_was_number = true;
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, value = %" PRId64 "\n", result.value);
} else if (rule.rule_type != NUMEX_STOPWORD) {
result.value = rule.value;
log_debug("Got number, result.value=%" PRId64 "\n", result.value);
@@ -871,6 +892,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
continue;
}
prev_rule_was_number = prev_rule_was_number || prev_rule.rule_type != NUMEX_NULL;
if (rule.rule_type != NUMEX_STOPWORD) {
prev_rule = rule;
prev_result_len = result.len;
@@ -903,7 +926,6 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
if (prev_rule.rule_type != NUMEX_NULL) {
number_finished = true;
}
}
if (!set_rule) {
@@ -926,6 +948,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
log_debug("Adding phrase, value=%" PRId64 "\n", result.value);
result = NULL_NUMEX_RESULT;
number_finished = false;
rule = prev_rule = NUMEX_NULL_RULE;
}
prev_state = state;
@@ -1150,7 +1173,6 @@ char *replace_numeric_expressions(char *str, char *lang) {
char_array_append(replacement, ordinal_suffix);
}
}
start = result.start + result.len;
}

View File

@@ -50,7 +50,8 @@ typedef enum {
typedef enum {
NUMEX_LEFT_CONTEXT_NONE,
NUMEX_LEFT_CONTEXT_ADD,
NUMEX_LEFT_CONTEXT_MULTIPLY
NUMEX_LEFT_CONTEXT_MULTIPLY,
NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER
} numex_left_context;
typedef enum {

View File

@@ -82,6 +82,8 @@ TEST test_expansions(void) {
CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de"));
CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));