[merge] merging in the Ohio expansion numex changes from master
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
# libpostal: international street address NLP
|
||||
|
||||
[](https://travis-ci.org/openvenues/libpostal) [](https://github.com/openvenues/libpostal/blob/master/LICENSE)
|
||||
[](#sponsors)
|
||||
[](#backers)
|
||||
[](#sponsors)
|
||||
[](#backers)
|
||||
|
||||
libpostal is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data. The goal of this project is to understand location-based strings in every language, everywhere. For a more comprehensive overview of the research behind libpostal, be sure to check out the (lengthy) introductory blog posts:
|
||||
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
apartment|aparment|apartmen|apt
|
||||
bangunan|bgn
|
||||
dewan
|
||||
kondominium|kondo
|
||||
pangsapuri|p/puri
|
||||
dewan
|
||||
rumah|rmh
|
||||
rumah pangsa
|
||||
residensi|residen
|
||||
menara
|
||||
rumah|rmh
|
||||
rumah kediaman
|
||||
rumah pangsa
|
||||
menara
|
||||
@@ -1,6 +1,8 @@
|
||||
bank
|
||||
syarikat
|
||||
kelab bola sepak
|
||||
berhad|bhd
|
||||
sendirian berhad|sdn bhd
|
||||
kelab
|
||||
kelab bola sepak
|
||||
persatuan
|
||||
sendirian berhad|sdn bhd
|
||||
syarikat
|
||||
yayasan
|
||||
@@ -1 +1,3 @@
|
||||
mohammad|muhammad|mohd
|
||||
suleiman|sulaiman
|
||||
yusuf|yusof
|
||||
|
||||
@@ -1 +1,92 @@
|
||||
puteri
|
||||
ahli dewan undangan negeri|adun
|
||||
bintara kanan
|
||||
bintara muda
|
||||
brigedier jeneral
|
||||
cik
|
||||
datin
|
||||
datin
|
||||
datin paduka
|
||||
datin paduka patinggi
|
||||
datin paduka seri
|
||||
datin paduka seri panglima
|
||||
datin paduka seri utama
|
||||
dato
|
||||
dato paduka
|
||||
dato sri|dato seri
|
||||
dato wira
|
||||
datuk
|
||||
datuk seri
|
||||
doktor|dr
|
||||
encik|en
|
||||
fil marsyal
|
||||
flait sarjan
|
||||
hakim
|
||||
jeneral
|
||||
kadet kanan
|
||||
kapten
|
||||
kolonel
|
||||
komander
|
||||
korporal
|
||||
korporal udara
|
||||
laksamana
|
||||
laksamana armada
|
||||
laksamana madya
|
||||
laksamana muda
|
||||
laksamana pertama
|
||||
lans korporal
|
||||
laskar kanan
|
||||
laskar kelas kedua
|
||||
laskar kelas pertama
|
||||
laskar muda
|
||||
laskar udara kanan
|
||||
laskar udara kelas kedua
|
||||
laskar udara kelas pertama
|
||||
leftenan
|
||||
leftenan jeneral
|
||||
leftenan kolonel
|
||||
leftenan komander
|
||||
leftenan madya
|
||||
leftenan muda
|
||||
marsyal tentera udara
|
||||
mejar
|
||||
mejar jeneral
|
||||
menteri
|
||||
naib pengerusi
|
||||
parajurit muda
|
||||
pegawai
|
||||
pegawai kadet
|
||||
pegawai waran kelas kedua
|
||||
pegawai waran kelas pertama
|
||||
pengerusi
|
||||
perdana menteri
|
||||
prebet
|
||||
presiden
|
||||
puan|pn
|
||||
puan sri
|
||||
putera|putra
|
||||
putera|putra
|
||||
puteri|putri
|
||||
puteri|putri
|
||||
raja
|
||||
rekrut
|
||||
sarjan
|
||||
sarjan udara
|
||||
sharifah
|
||||
staff sarjan
|
||||
sultan
|
||||
syed
|
||||
tan sri
|
||||
tengku
|
||||
timbalan perdana menteri
|
||||
timbalan perdana menteri
|
||||
timbalan presiden
|
||||
toh puan
|
||||
to puan
|
||||
tuan
|
||||
tun
|
||||
wakil
|
||||
wakil-wakil
|
||||
yang berhormat
|
||||
yang di-pertuan agong|agong
|
||||
yang di-pertuan besar
|
||||
yang di-pertua negeri
|
||||
|
||||
@@ -1,4 +1,145 @@
|
||||
aiskrim
|
||||
akademi
|
||||
akuarium
|
||||
arked
|
||||
asrama
|
||||
auditorium
|
||||
balai bomba
|
||||
balai polis
|
||||
bangunan perbandaran
|
||||
bank
|
||||
berek
|
||||
biara
|
||||
bilik|blk
|
||||
bilik kebal
|
||||
dapur
|
||||
dermaga
|
||||
dewan
|
||||
dewan bandar
|
||||
dewan bandar
|
||||
dewan konsert
|
||||
dewan muzik
|
||||
doktor-doktor
|
||||
doktor haiwan|dr haiwan
|
||||
empangan
|
||||
farmasi
|
||||
galeri
|
||||
galeri seni
|
||||
garaj
|
||||
gereja
|
||||
gim|gimnasium
|
||||
hospital
|
||||
hospital haiwan
|
||||
hostel
|
||||
ibu pejabat|ibu pej
|
||||
institut
|
||||
istana
|
||||
jabatan bomba
|
||||
jabatan polis
|
||||
jawatankuasa|jawatan kuasa
|
||||
jelapang
|
||||
jeti
|
||||
kafe
|
||||
kampung|kampong|kg
|
||||
kampus
|
||||
kasino
|
||||
kawasan perindustrian|kaw perindustri|kawasan industri
|
||||
kedai
|
||||
kedai buku
|
||||
kedutaan
|
||||
kejururawatan
|
||||
kelab
|
||||
kelab golf
|
||||
kelab malam
|
||||
kelab sosial
|
||||
kilang
|
||||
kiropraktik
|
||||
kitar semula
|
||||
klinik
|
||||
kolam renang
|
||||
kolam renang awam
|
||||
kolej
|
||||
kompleks
|
||||
kompleks
|
||||
komuniti
|
||||
kuarters|kuarter
|
||||
ladang pertanian|ladang
|
||||
lapangan terbang|lpg terbang
|
||||
liga
|
||||
mahkamah
|
||||
makmal
|
||||
masjid
|
||||
mata air
|
||||
menara
|
||||
menara pejabat
|
||||
misi
|
||||
monumen
|
||||
muzium
|
||||
nurseri
|
||||
padang golf
|
||||
pagar
|
||||
panggung
|
||||
panggung wayang gambar|pawagam
|
||||
pangkalan tentera udara
|
||||
pantai
|
||||
parkir
|
||||
pasar
|
||||
sekolah
|
||||
pasaran
|
||||
pasar raya|pasaraya
|
||||
pasar tani|pasar petani
|
||||
pawagam
|
||||
pediatrik
|
||||
pejabat
|
||||
pejabat pos
|
||||
pelabuhan
|
||||
pelabuhan
|
||||
pembangunan
|
||||
pengedar
|
||||
penjara
|
||||
penjara
|
||||
perpustakaan
|
||||
perubatan
|
||||
pintu gerbang
|
||||
pintu pagar
|
||||
politeknik
|
||||
pra sekolah
|
||||
presint
|
||||
pusat
|
||||
pusat bandar|pusat bandaraya
|
||||
pusat belia
|
||||
pusat kebudayaan
|
||||
pusat kecergasan
|
||||
pusat kejururawatan
|
||||
pusat kesihatan
|
||||
pusat komuniti
|
||||
pusat membeli-belah|pusat beli-belah
|
||||
pusat penjagaan
|
||||
pusat seni
|
||||
pusat seni persembahan
|
||||
residen
|
||||
restoran
|
||||
rumah kediaman
|
||||
rumah kelab
|
||||
rumah orang tua
|
||||
rumah pangsa|pangsapuri|p/puri
|
||||
rumah|rmh
|
||||
rumah sakit
|
||||
salon kecantikan
|
||||
sekolah menengah|sekolah men|sek menengah|sek men
|
||||
sekolah rendah|sekolah ren|sek rendah|sek ren
|
||||
sekolah|sek
|
||||
stesen minyak
|
||||
stor
|
||||
studio tarian
|
||||
surau
|
||||
taman
|
||||
taman didikan kanak-kanak|tadika
|
||||
taman negara
|
||||
taman perindustrian|taman industri|tmn ind|tmn perindustrian
|
||||
tanah perkuburan
|
||||
tapak letak kereta
|
||||
teh
|
||||
tempat letak kereta
|
||||
teres
|
||||
universiti
|
||||
veterinar
|
||||
@@ -1,3 +1,6 @@
|
||||
blok
|
||||
kampung|kampong|kg
|
||||
nusa
|
||||
penampang
|
||||
pulau
|
||||
pulau
|
||||
seksyen
|
||||
|
||||
@@ -1,17 +1,35 @@
|
||||
awang|awg
|
||||
bulatan
|
||||
cerunan|crn
|
||||
changkat|ckt
|
||||
dalaman|dlm
|
||||
dataran|dtr
|
||||
dayang|dyg
|
||||
denai
|
||||
haji|hj
|
||||
halaman|hlm
|
||||
hilir|hlr
|
||||
jalan|jln|jl
|
||||
lapangan
|
||||
kampung|kampong|kg
|
||||
laluan
|
||||
lebuhraya
|
||||
lengkok
|
||||
linkaran
|
||||
lorong
|
||||
kampong|kg
|
||||
laman|lmn
|
||||
langgak|lgk
|
||||
lapangan
|
||||
lebuh|lbh|luh
|
||||
lebuhraya|lebuh raya|lbh raya|l/raya|lbr
|
||||
lengkok|lkk
|
||||
lingkaran|lkr
|
||||
lintang|ltg
|
||||
lorong|lrg
|
||||
medan|mdn
|
||||
pengiran|pg
|
||||
persiaran|psn|psrn
|
||||
rapat
|
||||
simpangan|spn
|
||||
simpang|spg
|
||||
sngai|sg
|
||||
sisiran|ssr
|
||||
solok|slk
|
||||
sungai|sngai|sg
|
||||
taman|tmn
|
||||
tepian|tpn
|
||||
tingkat|tkt
|
||||
|
||||
@@ -1,23 +1,27 @@
|
||||
timur|timor
|
||||
air|ayer
|
||||
kampung|kampong|kg
|
||||
itam|hitam
|
||||
tanjung|tanjong
|
||||
sri|seri
|
||||
tasik|tasek
|
||||
dwitasik|dwitasek
|
||||
fasa|phasa
|
||||
selasih|selaseh
|
||||
putera|putra
|
||||
puteri|putri
|
||||
kecil|kechil
|
||||
sungai|sungei|sg
|
||||
teluk|telok
|
||||
yusuf|yusof
|
||||
bandar|bdr
|
||||
batu|bt
|
||||
bukit|bkt
|
||||
buluh|buloh
|
||||
jelutung|jelutong
|
||||
tanduk|tandok
|
||||
chempaka|cempaka
|
||||
dwitasik|dwitasek
|
||||
fasa|phasa
|
||||
glugor|gelugor
|
||||
itam|hitam
|
||||
jelutung|jelutong
|
||||
kampung|kampong|kg
|
||||
kawasan|kaw
|
||||
kecil|kechil
|
||||
manjalara|menjalara
|
||||
putera|putra
|
||||
puteri|putri
|
||||
selasih|selaseh
|
||||
sri|seri
|
||||
suleiman|sulaiman
|
||||
sungai|sungei|sg
|
||||
tanduk|tandok
|
||||
tanjung|tanjong
|
||||
tasik|tasek
|
||||
teluk|telok
|
||||
timur|timor
|
||||
yusuf|yusof
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
name: "oh"
|
||||
value: 0
|
||||
type: "cardinal"
|
||||
left: "concat_only_if_number"
|
||||
-
|
||||
name: "one"
|
||||
value: 1
|
||||
|
||||
@@ -47,11 +47,13 @@ category_map = {
|
||||
|
||||
LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
|
||||
LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
|
||||
LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER'
|
||||
LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
|
||||
|
||||
left_context_map = {
|
||||
'add': LEFT_CONTEXT_ADD,
|
||||
'multiply': LEFT_CONTEXT_MULTIPLY,
|
||||
'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER,
|
||||
None: LEFT_CONTEXT_NONE,
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -133,7 +133,7 @@ download_file() {
|
||||
for subdir in $subdirs; do
|
||||
rm -rf $data_dir/$subdir;
|
||||
done
|
||||
tar -xvzf $local_path -C $data_dir;
|
||||
tar -xvzf $local_path --no-same-owner -C $data_dir;
|
||||
rm $local_path;
|
||||
else
|
||||
echo "libpostal $name up to date"
|
||||
|
||||
56
src/numex.c
56
src/numex.c
@@ -709,6 +709,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
bool possible_complete_token = false;
|
||||
bool complete_token = false;
|
||||
|
||||
bool prev_rule_was_number = false;
|
||||
|
||||
log_debug("Converting numex for str=%s, lang=%s\n", str, lang);
|
||||
|
||||
while (idx < len) {
|
||||
@@ -844,22 +846,41 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) {
|
||||
result.value += rule.value;
|
||||
log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value);
|
||||
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) {
|
||||
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && (!whole_tokens_only || complete_token)) {
|
||||
log_debug("Had previous token with no context, finishing previous rule before returning\n");
|
||||
if (!whole_tokens_only || complete_token) {
|
||||
result.len = prev_result_len;
|
||||
number_finished = true;
|
||||
complete_token = false;
|
||||
advance_index = false;
|
||||
state = start_state;
|
||||
rule = prev_rule = NUMEX_NULL_RULE;
|
||||
prev_result_len = 0;
|
||||
} else {
|
||||
rule = NUMEX_NULL_RULE;
|
||||
last_was_separator = false;
|
||||
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
|
||||
continue;
|
||||
}
|
||||
result.len = prev_result_len;
|
||||
number_finished = true;
|
||||
complete_token = false;
|
||||
advance_index = false;
|
||||
state = start_state;
|
||||
prev_rule_was_number = true;
|
||||
rule = prev_rule = NUMEX_NULL_RULE;
|
||||
prev_result_len = 0;
|
||||
} else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && whole_tokens_only && !complete_token) {
|
||||
log_debug("whole_tokens_only = %d, complete_token = %d\n", whole_tokens_only, complete_token);
|
||||
rule = NUMEX_NULL_RULE;
|
||||
last_was_separator = false;
|
||||
prev_rule_was_number = false;
|
||||
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
|
||||
continue;
|
||||
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && !prev_rule_was_number) {
|
||||
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, no context\n");
|
||||
prev_rule = rule;
|
||||
last_was_separator = false;
|
||||
rule = NUMEX_NULL_RULE;
|
||||
prev_result_len = result.len;
|
||||
result = NULL_NUMEX_RESULT;
|
||||
stopword_phrase = NULL_PHRASE;
|
||||
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
|
||||
last_was_stopword = false;
|
||||
continue;
|
||||
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && prev_rule_was_number) {
|
||||
last_was_separator = false;
|
||||
number_finished = true;
|
||||
state = start_state;
|
||||
last_was_stopword = false;
|
||||
prev_rule_was_number = true;
|
||||
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, value = %" PRId64 "\n", result.value);
|
||||
} else if (rule.rule_type != NUMEX_STOPWORD) {
|
||||
result.value = rule.value;
|
||||
log_debug("Got number, result.value=%" PRId64 "\n", result.value);
|
||||
@@ -871,6 +892,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
continue;
|
||||
}
|
||||
|
||||
prev_rule_was_number = prev_rule_was_number || prev_rule.rule_type != NUMEX_NULL;
|
||||
|
||||
if (rule.rule_type != NUMEX_STOPWORD) {
|
||||
prev_rule = rule;
|
||||
prev_result_len = result.len;
|
||||
@@ -903,7 +926,6 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
if (prev_rule.rule_type != NUMEX_NULL) {
|
||||
number_finished = true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (!set_rule) {
|
||||
@@ -926,6 +948,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
log_debug("Adding phrase, value=%" PRId64 "\n", result.value);
|
||||
result = NULL_NUMEX_RESULT;
|
||||
number_finished = false;
|
||||
rule = prev_rule = NUMEX_NULL_RULE;
|
||||
}
|
||||
|
||||
prev_state = state;
|
||||
@@ -1150,7 +1173,6 @@ char *replace_numeric_expressions(char *str, char *lang) {
|
||||
char_array_append(replacement, ordinal_suffix);
|
||||
}
|
||||
}
|
||||
|
||||
start = result.start + result.len;
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,8 @@ typedef enum {
|
||||
typedef enum {
|
||||
NUMEX_LEFT_CONTEXT_NONE,
|
||||
NUMEX_LEFT_CONTEXT_ADD,
|
||||
NUMEX_LEFT_CONTEXT_MULTIPLY
|
||||
NUMEX_LEFT_CONTEXT_MULTIPLY,
|
||||
NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER
|
||||
} numex_left_context;
|
||||
|
||||
typedef enum {
|
||||
|
||||
@@ -82,6 +82,8 @@ TEST test_expansions(void) {
|
||||
CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));
|
||||
|
||||
Reference in New Issue
Block a user