From 60cde05c3d813fe40a9f3959231bb29292392463 Mon Sep 17 00:00:00 2001 From: Yanuar Budi Baskoro Date: Fri, 19 May 2017 18:39:48 +0700 Subject: [PATCH 01/19] [dictionaries] Indonesian dictionaries to support new config --- .../dictionaries/id/academic_degrees.txt | 82 +++++++ resources/dictionaries/id/building_types.txt | 8 + resources/dictionaries/id/chains.txt | 9 + resources/dictionaries/id/company_types.txt | 29 +++ resources/dictionaries/id/cross_streets.txt | 6 + resources/dictionaries/id/directionals.txt | 18 +- resources/dictionaries/id/entrances.txt | 3 + .../dictionaries/id/level_types_basement.txt | 1 + .../dictionaries/id/level_types_mezzanine.txt | 5 + .../dictionaries/id/level_types_numbered.txt | 3 + .../id/level_types_standalone.txt | 14 ++ .../id/level_types_sub_basement.txt | 1 + resources/dictionaries/id/near.txt | 3 + resources/dictionaries/id/no_number.txt | 1 + resources/dictionaries/id/nulls.txt | 2 + resources/dictionaries/id/number.txt | 1 + resources/dictionaries/id/personal_titles.txt | 39 ++- resources/dictionaries/id/place_names.txt | 225 +++++++++++++++++- resources/dictionaries/id/qualifiers.txt | 19 +- resources/dictionaries/id/stopwords.txt | 9 + resources/dictionaries/id/street_types.txt | 9 +- resources/dictionaries/id/synonyms.txt | 23 ++ resources/dictionaries/id/toponyms.txt | 34 +++ resources/dictionaries/id/unit_directions.txt | 4 + .../dictionaries/id/unit_types_numbered.txt | 33 +++ .../dictionaries/id/unit_types_standalone.txt | 11 + 26 files changed, 564 insertions(+), 28 deletions(-) create mode 100644 resources/dictionaries/id/academic_degrees.txt create mode 100644 resources/dictionaries/id/building_types.txt create mode 100644 resources/dictionaries/id/chains.txt create mode 100644 resources/dictionaries/id/company_types.txt create mode 100644 resources/dictionaries/id/cross_streets.txt create mode 100644 resources/dictionaries/id/entrances.txt create mode 100644 resources/dictionaries/id/level_types_basement.txt create mode 100644 resources/dictionaries/id/level_types_mezzanine.txt create mode 100644 resources/dictionaries/id/level_types_numbered.txt create mode 100644 resources/dictionaries/id/level_types_standalone.txt create mode 100644 resources/dictionaries/id/level_types_sub_basement.txt create mode 100644 resources/dictionaries/id/near.txt create mode 100644 resources/dictionaries/id/no_number.txt create mode 100644 resources/dictionaries/id/nulls.txt create mode 100644 resources/dictionaries/id/number.txt create mode 100644 resources/dictionaries/id/stopwords.txt create mode 100644 resources/dictionaries/id/synonyms.txt create mode 100644 resources/dictionaries/id/toponyms.txt create mode 100644 resources/dictionaries/id/unit_directions.txt create mode 100644 resources/dictionaries/id/unit_types_numbered.txt create mode 100644 resources/dictionaries/id/unit_types_standalone.txt diff --git a/resources/dictionaries/id/academic_degrees.txt b/resources/dictionaries/id/academic_degrees.txt new file mode 100644 index 00000000..a6db8c35 --- /dev/null +++ b/resources/dictionaries/id/academic_degrees.txt @@ -0,0 +1,82 @@ +doktor|dr +magister administrasi bisnis|mab|m ab +magister administrasi publi|map|m ap +magister administrasi rumah sakit|mars|m a r s +magister agama|mag|m ag +magister akuntansi|mak|m ak +magister epidemiolog|mepid|m epid +magister farmasi|mfarm|m farmasi +magister hukum|m h| +magister humaniora|mhum|m hum +magister ilmu biomedik|mbiomedik|m biomedik +magister ilmu komputer|mkom|m kom +magister kehutanan|mhut|m hut +magister kenotariatan|mkn|m kn +magister keperawatan|mkep|m kep +magister kesehatan|mkes|m kes +magister manajemen|mm|m m +magister manajemen pendidikan|mmpd|m mpd +magister manajemen sistem informasi|mmsi|m msi +magister marine|mmar|m mar +magister pendidikan|mpd|m pd +magister psikologi|mpsi|m psi +magister sains bidang ilmu pertahanan|msihan|m si han +magister seni|msn|m sn +magister statistik|mstat|m stat +magister teknik|mt|m t +magister teknologi informasi|mti|m ti +magister veteriner|mvet|m vet +master of arts|ma|m a +master of public health|mph|m p h +master of scienc|msc|m sc +sarjana administrasi bisnis|sab|s ab +sarjana administrasi publik|sap|s ap +sarjana agama|s ag|sag|sa g|s a g +sarjana agroteknologi|sagr|s agr +sarjana antropologi|sant|s ant +sarjana arsitektur|sars|s ars +sarjana desain|sds|s ds +sarjana ekonomi islam|sei|s e i +sarjana ekonomi|se|s e +sarjana farmasi|sfarm|s farm +sarjana hubungan internasional|shint|s h int|sh int +sarjana hukum islam|shi|s h i|sh i|s hi +sarjana hukum|sh|s h +sarjana humaniora|shum|s hum +sarjana ilmu gizi|sgz|s gz +sarjana ilmu kelautan|skel|s kel +sarjana ilmu kepolisian|s ik|sik|si k +sarjana ilmu perpustakaan|sip|s ip +sarjana ilmu politik|sarjana ilmu pemerintahan|sip|s i p +sarjana ilmu psikologi|sikom|s i kom|si kom +sarjana intelijen|sin|s in +sarjana kedokteran gigi|skg|s kg +sarjana kedokteran hewan|skh|s kh +sarjana kedokteran|s ked|sked +sarjana kehutanan|shut|s hut +sarjana komputer|skom|s kom +sarjana komunikasi dan pengembangan masyarakat|skpm|s kpm +sarjana manajemen bisnis|smb|s mb +sarjana pariwisata|spar|s par +sarjana pendidikan islam|spdi|s pd i|s pdi +sarjana pendidikan sekolah dasar|spdsd|s pd sd +sarjana pendidikan|spd|s pd|sp d|s p d +sarjana pertahanan|shan|s han +sarjana pertanian|sp|s p +sarjana peternakan|spt|s pt +sarjana psikologi|s psi|spsi +sarjana sains|s si|ssi +sarjana sains|ssi|s si +sarjana sains terapan pemerintahan|sstp|s stp +sarjana sastra|ss|s s +sarjana seni|s sn|ssn +sarjana sistem informasi|ssi|s si +sarjana sosial|ssos|s sos +sarjana syari ah|s sy|ssy +sarjana teknik|st|s t +sarjana teknologi informasi|sti|s ti|s t i +sarjana teknologi pertanian|stp|s tp|st p|s t p +sarjana teologi islam|s thi|s th i|sthi +sarjana teologi kristen|sthk|s th k|s thk +sarjana teologi|s teol|steol +sarjana terapan kepolisian|strk|s trk diff --git a/resources/dictionaries/id/building_types.txt b/resources/dictionaries/id/building_types.txt new file mode 100644 index 00000000..0aab3bcb --- /dev/null +++ b/resources/dictionaries/id/building_types.txt @@ -0,0 +1,8 @@ +apartemen|apart|aprtmn +garasi|grs +gedung|gd|gdg +gudang|gdang +kebun|kebon|kbn +rumah|rmah|rmh +tower|twr +villa|vlla|vl diff --git a/resources/dictionaries/id/chains.txt b/resources/dictionaries/id/chains.txt new file mode 100644 index 00000000..59781fe0 --- /dev/null +++ b/resources/dictionaries/id/chains.txt @@ -0,0 +1,9 @@ +alfamart|alfamar +indomart|indomaret +circle k|circlek +seven eleven|sevel +carrefour +superindo +lottemart +bonjour +bright diff --git a/resources/dictionaries/id/company_types.txt b/resources/dictionaries/id/company_types.txt new file mode 100644 index 00000000..48b8364f --- /dev/null +++ b/resources/dictionaries/id/company_types.txt @@ -0,0 +1,29 @@ +bank +company|co +cooperative|coop|co op +corporation|corp +dan rekan|& rekan +enterprise|ent +firma|fa|f a +foundation +general patnership|gp|g p +incorporated|inc +intermediary|nt|n t +international business company|ibc|i b c +koperasi|kop|kprs +koperasi usaha dagang|kud|k ud +limited company|lc|l c|ltd co +limited liability company|llc|l l c|ltd liability company|ltd liability co +limited|ltd +national association|na|n a +nonprofit|non profit +perseroan komanditer|comanditaire venotschap|cv|c v +perseroan terbatas|pt|p t +perusahaan daerah|pd +perusahaan dagang|pd|p d +perusahaan jawatan|pj|pjaw|p jaw +perusahaan umum|perum|p u +trust +unlimited|ultd|unltd +usaha dagang|ud|u d +yayasan diff --git a/resources/dictionaries/id/cross_streets.txt b/resources/dictionaries/id/cross_streets.txt new file mode 100644 index 00000000..023bd4a1 --- /dev/null +++ b/resources/dictionaries/id/cross_streets.txt @@ -0,0 +1,6 @@ +& +dan +di +di pojok|d pjk +pojok|pjk +di antara|d antr diff --git a/resources/dictionaries/id/directionals.txt b/resources/dictionaries/id/directionals.txt index f704bd7f..6ccb8652 100644 --- a/resources/dictionaries/id/directionals.txt +++ b/resources/dictionaries/id/directionals.txt @@ -1,9 +1,9 @@ -barat -barat daya -barat laut -selatan -tengah -tenggara -timur -timur laut -utara \ No newline at end of file +barat|bar|brt|brat|kulon|kln|klon +barat daya|brt dy|bar day|brat dya +barat laut|brt lt|bar laut +selatan|sel|sltn|slatan|kidul|kdl|kdul +tengah|teng|tngh +tenggara|teng|tnggra +timur|tim|tmur|wetan|wtan +timur laut|tim laut|tmur laut +utara|ut|utra|utr|lor diff --git a/resources/dictionaries/id/entrances.txt b/resources/dictionaries/id/entrances.txt new file mode 100644 index 00000000..76402a38 --- /dev/null +++ b/resources/dictionaries/id/entrances.txt @@ -0,0 +1,3 @@ +masuk|msk +gerbang masuk|grbg msk +pintu masuk|pntu msk|pntu msuk diff --git a/resources/dictionaries/id/level_types_basement.txt b/resources/dictionaries/id/level_types_basement.txt new file mode 100644 index 00000000..34d55d03 --- /dev/null +++ b/resources/dictionaries/id/level_types_basement.txt @@ -0,0 +1 @@ +basement|bsm|bsmt|bsmnt|basement|bsment \ No newline at end of file diff --git a/resources/dictionaries/id/level_types_mezzanine.txt b/resources/dictionaries/id/level_types_mezzanine.txt new file mode 100644 index 00000000..aa8cbaa1 --- /dev/null +++ b/resources/dictionaries/id/level_types_mezzanine.txt @@ -0,0 +1,5 @@ +mezzanine|mezz +mezzanine floor|mezz fl|mezz floor +mezzanine level|mezz lvl|mezz level +lower mezzanine|lower mezz|lwr mezz +upper mezzanine|upper mezz|uppr mezz|upr mezz \ No newline at end of file diff --git a/resources/dictionaries/id/level_types_numbered.txt b/resources/dictionaries/id/level_types_numbered.txt new file mode 100644 index 00000000..8ec779f4 --- /dev/null +++ b/resources/dictionaries/id/level_types_numbered.txt @@ -0,0 +1,3 @@ +lantai|lt|ltai|lt. +level|lev|levl|lvel|lvl|l|/ l +platform|pf diff --git a/resources/dictionaries/id/level_types_standalone.txt b/resources/dictionaries/id/level_types_standalone.txt new file mode 100644 index 00000000..a044b9c8 --- /dev/null +++ b/resources/dictionaries/id/level_types_standalone.txt @@ -0,0 +1,14 @@ +ground|g|gd +ground floor|gdfl|gd fl|gd/fl|gd / fl|gf|g / f +ground level|gd lvl|g lvl|g level|gd level|ground lvl|gd / lvl|gl|g / l +lantai atas|lantai ats|lt ats|lt. ats|lt. atas +lantai dasar|lt dsr|lt dsar|lt. dsr|lt. dasar +lobby +lower ground floor|lg|lgf|lgfl|l / g|l / gf|l / g / f|l / g / fl +lower level|lwr level|lower lvl|lwr lvl +podium|pd +podium level|podium lev|podium levl|podium lvel|podium lvl|podium l|pd level|pd lev|pd levl|pd lvel|pd lvl|pd l +rooftop|rt|rf|r / t +top floor|top fl|top / f|tf|t.f|t f|t / f +upper ground floor|ug|ugf|ugfl|ug / f|ug / fl +upper|uppr|upr diff --git a/resources/dictionaries/id/level_types_sub_basement.txt b/resources/dictionaries/id/level_types_sub_basement.txt new file mode 100644 index 00000000..db2c66d2 --- /dev/null +++ b/resources/dictionaries/id/level_types_sub_basement.txt @@ -0,0 +1 @@ +sub basement|sub-basement|subbasement|sb|s.b \ No newline at end of file diff --git a/resources/dictionaries/id/near.txt b/resources/dictionaries/id/near.txt new file mode 100644 index 00000000..003ec4e1 --- /dev/null +++ b/resources/dictionaries/id/near.txt @@ -0,0 +1,3 @@ +sekitar +dalam|dalem|dlm|dlam +dekat|dkt|dkat diff --git a/resources/dictionaries/id/no_number.txt b/resources/dictionaries/id/no_number.txt new file mode 100644 index 00000000..8ef9c661 --- /dev/null +++ b/resources/dictionaries/id/no_number.txt @@ -0,0 +1 @@ +no fixed address|nfa|n f a|n / f / a \ No newline at end of file diff --git a/resources/dictionaries/id/nulls.txt b/resources/dictionaries/id/nulls.txt new file mode 100644 index 00000000..661b09b0 --- /dev/null +++ b/resources/dictionaries/id/nulls.txt @@ -0,0 +1,2 @@ +not applicable|n / a|na|n a +null \ No newline at end of file diff --git a/resources/dictionaries/id/number.txt b/resources/dictionaries/id/number.txt new file mode 100644 index 00000000..657d45e2 --- /dev/null +++ b/resources/dictionaries/id/number.txt @@ -0,0 +1 @@ +nomor|nomr|nmr|#|no|№|nr| diff --git a/resources/dictionaries/id/personal_titles.txt b/resources/dictionaries/id/personal_titles.txt index 812b53cf..1a9b9460 100644 --- a/resources/dictionaries/id/personal_titles.txt +++ b/resources/dictionaries/id/personal_titles.txt @@ -1,5 +1,34 @@ -imam -jenderal -pangeran -raja -sultan \ No newline at end of file +brigadir jendral|brig jen|brigjen +bapak|pak +doktor|dr|dok +doktorandus|drs +ibu|bu|ibuk|buk +jenderal|jend|jnd|jen|jendral +kapten|kpt|kapt +kolonel|kol +komandan|kmndn|kmd +kopral|koprl|kpl +komandan letnan|kmndn let|kmd lt +letnan|letn|lt +letnan kolonel|letkol|lt kol|lt kl +letnan jenderal|let jen|letjen|lt jn +mas|bang|ms|bg +mayor|may|myr +mayor jenderal|may jen|mayjen +mbak +menteri|mentri|mntr +pangeran|pngrn +pastor +pendeta|pdt +ustadz|ust +prajurit|prjrt +presiden|pres +profesor|prof +raden|rdn|r +raden rara|rdn rara|rr +raja|rja +ratu +reverend|rev +saint|st +sersan|sers|sersn +sultan|sltn diff --git a/resources/dictionaries/id/place_names.txt b/resources/dictionaries/id/place_names.txt index 75a53866..cc21778f 100644 --- a/resources/dictionaries/id/place_names.txt +++ b/resources/dictionaries/id/place_names.txt @@ -1,5 +1,222 @@ +air mancur +akademi|akdm +akuarium|akrium +amfiteater +apartmen|apt +arkade +auditorium +bakery +balai kota +ballroom +bandar udara|bandara +banjar|bjr +bank +bank perkreditan rakyat|bpr +bar +barak +bazar +bendungan +benteng +benteng +biara +bioskop +bistro +business park|bus pk|biz pk|bus prk|biz prk +cafe|café +cagar alam +child care|childcare +cinema +cineplex +club|clb +clubhouse|club house +clubrooms +cottage|cott|cottg +cuci mobil +daerah +danau +dance studio +day care +dental +dentist +departmen|dep +dermaga +desa|ds +developmen +distributor|dstr|distrib|dstrb +dojo +dokter hewan +dormitory|dorm +dusun|dsn +embassy +farmasi +farm|frm +fitness center|fitness centre +flat|flt +galeri +galeri seni +gampong|gp +garasi +gedung rekreasi +gerbang +gereja +golf club +gym|gymnasium +hall +headquarter +health center|health centre +hostel +hostel|host|hostl|hstel|hstl +hotel|hot|hotl|htel +hub +ice cream|icecream +institut|inst +istana jembatan -mesjid -perkebunan -pondok -puri \ No newline at end of file +kabupaten|kab +kafe|kafé +kali +kampung|kp|kpg|kmpg +kampus +kandang +kantor bupati +kantor|kntr +kantor pos +kantor pusat +kapel +karang taruan +kawasan industri +kebun binatang|bonbin +kecamatan|kec +kedai +kedutaan +kelompok bermain|kb +kelurahan|kel +kennel +kepolisian daerah|polda +kepolisian resor|polres +kepolisian sektor|polsek +kindergarten +kios +kitchen +klinik +kolam renang +komite +kompleks +komunitas +konservatori +kopi +kos|kosan +kota +krematorium +kuburan +lab +laboratorium +lapangan +lapangan golf +lembaga pemasyarakatan +lounge +mall|mll +mansion +marina +markas besar|mabes +market|mkt|mrkt +medik|med +memorial|mem +mesjid|masjid|msjd +monumen +motel|mot|motl|mtel +museum|mus +night club|nightclub +office|ofc +office tower|ofc twr|office twr|ofc tower +pabrik|pbrk +pandai besi +pangkalan udara|lanud|lanud +pantai|pante +panti jompo|pnt jmp|pnti jmpo +parking +parking lot +parkiran +park|pk|prk +paroki +pasar +pavilion +paviliun|pav +pediatric +pelabuhan +pemadam kebakaran +pemakaman +penampungan +penampungan hewan +pengadilan +penginapan +penitipan anak +penjara +perkebunan|kebun|kbn|kebon +perpustakaan +perserikatan +playgroup +plaza +politeknik +pom bensin +pondok|pndok +pos polisi +provinsi|prov +puri +pusat kebudayaan +pusat kesehatan +pusat pemulihan +pusat perawatan +pusat seni +reservation|res|resrv|resv|rsrv|rserv|rs +resort +restoran +ruangan +rumah makan|rm +rumah|rmh|rmah|rumh +rumah sakit hewan|rs hewan +rumah sakit|rs|r s +rumah sakit wanita|rs wanita|r s wanita +salon +saloon +sauna +sekolah dasar neger|sdn|sd n +sekolah menengah atas negeri|sma n|sma +sekolah menengah pertama negeri|smp n|smpn +shopping center|shoppingcenter|shoppingcentre|shopping|shoppingtown|shopping town|shopping centre|shctr|sh ctr|s / centre|shp / centre +shop|shp +showground +sirkuit +sirkus +situ +social club +spa +stadium +stasiun +steakhouse +studio +suaka margasatwa nasional +sungai +supermarket|super market +tahanan +taman +taman kanak|tk +taman nasional +teater +tempat pembuangan umum|tpu +tempat pertunjukan +tepi pantai +terminal|term +toko +toko +toko buku +tower|twr +townhouse|town house +unit +unit gawat darurat|ugd +unit geriatrik +universitas|univ|uni +velodrome +vila|vl|vla +warung +yoga diff --git a/resources/dictionaries/id/qualifiers.txt b/resources/dictionaries/id/qualifiers.txt index b9d14020..37fc2612 100644 --- a/resources/dictionaries/id/qualifiers.txt +++ b/resources/dictionaries/id/qualifiers.txt @@ -1,7 +1,14 @@ -blok -gedung -kampung -kampong -kompleks +blok|blk +gedung|gdg|gd +kampung|kmpg|kpg|kp +kampong|kmpg|kpg|kp +gampong|gmpg|gpg +banjar|bjr +kabupaten|kab|kbptn|kbp +kecamatan|kec|kcmntn|kcm +kelurahan|kel|klrhn|klh +desa|ds +dusun|dsn +kompleks|komp|kmplk kota -pulau \ No newline at end of file +pulau diff --git a/resources/dictionaries/id/stopwords.txt b/resources/dictionaries/id/stopwords.txt new file mode 100644 index 00000000..1703f14d --- /dev/null +++ b/resources/dictionaries/id/stopwords.txt @@ -0,0 +1,9 @@ +berlawanan|lawanan +dan|dn|n +dari|dr +dekat|dkt +di +ke +lewat|lwt +seberang|sebrang|sbrg +semua|semoa|smua|smoa diff --git a/resources/dictionaries/id/street_types.txt b/resources/dictionaries/id/street_types.txt index 4bbadef8..a3ef2a02 100644 --- a/resources/dictionaries/id/street_types.txt +++ b/resources/dictionaries/id/street_types.txt @@ -1,6 +1,6 @@ alun-alun|alun alun|alunalun -gang -jalan|jln|jl +gang|gg +jalan|jln|jl|jl. jalan besar|jl besar|jl.besar|jln besar|jln.besar jalan desa|jl desa|jl.desa|jln desa|jln.desa jalan tol lingkar|jl tol lingkar|jl.tol lingkar|jln tol lingkar|jln.tol lingkar @@ -12,7 +12,8 @@ jalan poros|jl poros|jl.poros|jln poros|jln.poros jalan raya|jl raya|jl.raya|jln raya|jln.raya jalan tol|jl tol|jl.tol|jln tol|jln.tol jalan utama|jl utama|jl.utama|jln utama|jln.utama +jalan raya|jl raya|jl. raya|jln raya|jln. raya jalur jembatan -lorong -terowongan \ No newline at end of file +lorong|lrong +terowongan diff --git a/resources/dictionaries/id/synonyms.txt b/resources/dictionaries/id/synonyms.txt new file mode 100644 index 00000000..557e679f --- /dev/null +++ b/resources/dictionaries/id/synonyms.txt @@ -0,0 +1,23 @@ +banjar|bnjr|bjr +daerah|drh +danau|dano +desa|dsa|ds +dusun|dsun|dsn +flat|flt +gampong|gpg +gunung|gn +internasional|int|int'l +kabupaten|kbptn +kali|kli +kampung|kpg|kmpg +kecamatan|kcmtn +kelurahan|klrhn +mas|ms +mbak|mba|mb +medikal|med +memorial|mem +militer|mil +national|nasl|nas'l +sungai|sngai +tanjung|tjg +utama|utm diff --git a/resources/dictionaries/id/toponyms.txt b/resources/dictionaries/id/toponyms.txt new file mode 100644 index 00000000..ddba1aef --- /dev/null +++ b/resources/dictionaries/id/toponyms.txt @@ -0,0 +1,34 @@ +bali +bangka belitung|babel +banten +bengkulu +daerah istimewa yogyakarta|di yogyakarta|d.i. yogyakarta|diy +dki jakarta|jakarta +gorontalo +jambi +jawa barat|jabar +jawa tengah|jateng +jawa timur|jatim +kalimantan barat|kalbar +kalimantan selatan|kalsel +kalimantan tengah|kalteng +kalimantan timur|kaltim +kalimantan utara|kalut +kepulauan riau|kepri +lampung +maluku +maluku utara|malut +nanggroe aceh darussalam|nanggro aceh darussalam|nanggroe aceh darusalam|nad|n a d +nusa tenggara barat|ntb +nusa tenggara timur|ntt +papua +papua barat +riau +sulawesi barat|sulbar +sulawesi selatan|sulsel +sulawesi tengah|sulteng +sulawesi tenggara|sultra +sulawesi utara|sulut +sumatera barat|sumatra barat|sumbar +sumatera selatan|sumatra selatan|sumsel +sumatera utara|sumatra utara|sumut diff --git a/resources/dictionaries/id/unit_directions.txt b/resources/dictionaries/id/unit_directions.txt new file mode 100644 index 00000000..3648c709 --- /dev/null +++ b/resources/dictionaries/id/unit_directions.txt @@ -0,0 +1,4 @@ +belakang|blkg|blkang +depan|dpn +kanan|knan|knn +kiri|kri|kr diff --git a/resources/dictionaries/id/unit_types_numbered.txt b/resources/dictionaries/id/unit_types_numbered.txt new file mode 100644 index 00000000..a430fae7 --- /dev/null +++ b/resources/dictionaries/id/unit_types_numbered.txt @@ -0,0 +1,33 @@ +flat|flt +garasi|grsi +gedung|gdg|gd +gudang|gdng +hangar|hngr +kamar|kmr +kantor +kavling|kav|kv +kebun|kebon|kbn +kios +pabrik|pabrk|pbrk +parcel +parkiran|park +penthouse|pths|ph|pent house +reserve|rsve|rsrv|rsv|resv +room|rm +rukun tetangga|rt +rukun warga|rw +shed|shd +shop|shp +showroom|shrm +stasiun +stop +studio|stu +suite|ste|se +tenancy|tncy +toko|tko +tower|twr +townhouse|tnhs|twnhs|tnhse|twnhse +unit|un|unt|u +vila|vla|vl +warehouse|we|whs|wrhs +workshop|wshp|wkshp|wksp diff --git a/resources/dictionaries/id/unit_types_standalone.txt b/resources/dictionaries/id/unit_types_standalone.txt new file mode 100644 index 00000000..b012acde --- /dev/null +++ b/resources/dictionaries/id/unit_types_standalone.txt @@ -0,0 +1,11 @@ +depan|dpn +garasi|grsi +lobby|lby|lbby +kiri bawah|kiri bwh|kri bwh +kanan bawah|kanan bwh|knan bwh +penthouse|pths|ph|pent house +belakang|blkg +rear lower|rear lwr +belakang bawah|belakang bwh|blkg bwh +sisi + From 251458061180f51e53fae65ee08f8a967955aea9 Mon Sep 17 00:00:00 2001 From: Yanuar Budi Baskoro Date: Fri, 19 May 2017 18:44:32 +0700 Subject: [PATCH 02/19] [dictionaries] Indonesian dictionaries to support new config --- resources/dictionaries/id/personal_suffixes.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 resources/dictionaries/id/personal_suffixes.txt diff --git a/resources/dictionaries/id/personal_suffixes.txt b/resources/dictionaries/id/personal_suffixes.txt deleted file mode 100644 index 427730e5..00000000 --- a/resources/dictionaries/id/personal_suffixes.txt +++ /dev/null @@ -1 +0,0 @@ -utama \ No newline at end of file From 7f14dafd211ce66194c619ad3c00cba2a6457ba0 Mon Sep 17 00:00:00 2001 From: Yanuar Budi Baskoro Date: Sat, 20 May 2017 01:00:28 +0700 Subject: [PATCH 03/19] [dictionaries] Fix blank synonym in academic degrees --- resources/dictionaries/id/academic_degrees.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/dictionaries/id/academic_degrees.txt b/resources/dictionaries/id/academic_degrees.txt index a6db8c35..de9908d3 100644 --- a/resources/dictionaries/id/academic_degrees.txt +++ b/resources/dictionaries/id/academic_degrees.txt @@ -6,7 +6,7 @@ magister agama|mag|m ag magister akuntansi|mak|m ak magister epidemiolog|mepid|m epid magister farmasi|mfarm|m farmasi -magister hukum|m h| +magister hukum|m h|mh magister humaniora|mhum|m hum magister ilmu biomedik|mbiomedik|m biomedik magister ilmu komputer|mkom|m kom From 3b2fb597fe67ed8a5f0e5a461e317eee73227dbe Mon Sep 17 00:00:00 2001 From: Yanuar Budi Baskoro Date: Sat, 20 May 2017 01:04:12 +0700 Subject: [PATCH 04/19] [dictionaries] Fix blank synonym in numbers --- resources/dictionaries/id/number.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/dictionaries/id/number.txt b/resources/dictionaries/id/number.txt index 657d45e2..0c61787b 100644 --- a/resources/dictionaries/id/number.txt +++ b/resources/dictionaries/id/number.txt @@ -1 +1 @@ -nomor|nomr|nmr|#|no|№|nr| +nomor|nomr|nmr|#|no|№|nr From 09cb28cb14bc6e421aca13093ffbcc183878ea0b Mon Sep 17 00:00:00 2001 From: Yanuar Budi Baskoro Date: Sun, 21 May 2017 15:39:47 +0700 Subject: [PATCH 05/19] [dictionaries] Remove english words from ID dictionary --- resources/dictionaries/id/company_types.txt | 9 +-- .../dictionaries/id/level_types_basement.txt | 1 - .../dictionaries/id/level_types_mezzanine.txt | 5 -- .../id/level_types_standalone.txt | 12 +--- .../id/level_types_sub_basement.txt | 1 - resources/dictionaries/id/near.txt | 2 +- resources/dictionaries/id/nulls.txt | 2 - resources/dictionaries/id/personal_titles.txt | 2 + resources/dictionaries/id/place_names.txt | 62 ------------------- .../dictionaries/id/unit_types_numbered.txt | 15 ----- .../dictionaries/id/unit_types_standalone.txt | 14 ++--- 11 files changed, 12 insertions(+), 113 deletions(-) delete mode 100644 resources/dictionaries/id/level_types_basement.txt delete mode 100644 resources/dictionaries/id/level_types_mezzanine.txt delete mode 100644 resources/dictionaries/id/level_types_sub_basement.txt delete mode 100644 resources/dictionaries/id/nulls.txt diff --git a/resources/dictionaries/id/company_types.txt b/resources/dictionaries/id/company_types.txt index 48b8364f..511b124d 100644 --- a/resources/dictionaries/id/company_types.txt +++ b/resources/dictionaries/id/company_types.txt @@ -1,3 +1,4 @@ +asosiasi bank company|co cooperative|coop|co op @@ -7,16 +8,10 @@ enterprise|ent firma|fa|f a foundation general patnership|gp|g p -incorporated|inc -intermediary|nt|n t -international business company|ibc|i b c koperasi|kop|kprs koperasi usaha dagang|kud|k ud -limited company|lc|l c|ltd co -limited liability company|llc|l l c|ltd liability company|ltd liability co -limited|ltd -national association|na|n a nonprofit|non profit +persekutuan perseroan komanditer|comanditaire venotschap|cv|c v perseroan terbatas|pt|p t perusahaan daerah|pd diff --git a/resources/dictionaries/id/level_types_basement.txt b/resources/dictionaries/id/level_types_basement.txt deleted file mode 100644 index 34d55d03..00000000 --- a/resources/dictionaries/id/level_types_basement.txt +++ /dev/null @@ -1 +0,0 @@ -basement|bsm|bsmt|bsmnt|basement|bsment \ No newline at end of file diff --git a/resources/dictionaries/id/level_types_mezzanine.txt b/resources/dictionaries/id/level_types_mezzanine.txt deleted file mode 100644 index aa8cbaa1..00000000 --- a/resources/dictionaries/id/level_types_mezzanine.txt +++ /dev/null @@ -1,5 +0,0 @@ -mezzanine|mezz -mezzanine floor|mezz fl|mezz floor -mezzanine level|mezz lvl|mezz level -lower mezzanine|lower mezz|lwr mezz -upper mezzanine|upper mezz|uppr mezz|upr mezz \ No newline at end of file diff --git a/resources/dictionaries/id/level_types_standalone.txt b/resources/dictionaries/id/level_types_standalone.txt index a044b9c8..668b6039 100644 --- a/resources/dictionaries/id/level_types_standalone.txt +++ b/resources/dictionaries/id/level_types_standalone.txt @@ -1,14 +1,4 @@ -ground|g|gd -ground floor|gdfl|gd fl|gd/fl|gd / fl|gf|g / f -ground level|gd lvl|g lvl|g level|gd level|ground lvl|gd / lvl|gl|g / l lantai atas|lantai ats|lt ats|lt. ats|lt. atas lantai dasar|lt dsr|lt dsar|lt. dsr|lt. dasar -lobby -lower ground floor|lg|lgf|lgfl|l / g|l / gf|l / g / f|l / g / fl -lower level|lwr level|lower lvl|lwr lvl +lobi podium|pd -podium level|podium lev|podium levl|podium lvel|podium lvl|podium l|pd level|pd lev|pd levl|pd lvel|pd lvl|pd l -rooftop|rt|rf|r / t -top floor|top fl|top / f|tf|t.f|t f|t / f -upper ground floor|ug|ugf|ugfl|ug / f|ug / fl -upper|uppr|upr diff --git a/resources/dictionaries/id/level_types_sub_basement.txt b/resources/dictionaries/id/level_types_sub_basement.txt deleted file mode 100644 index db2c66d2..00000000 --- a/resources/dictionaries/id/level_types_sub_basement.txt +++ /dev/null @@ -1 +0,0 @@ -sub basement|sub-basement|subbasement|sb|s.b \ No newline at end of file diff --git a/resources/dictionaries/id/near.txt b/resources/dictionaries/id/near.txt index 003ec4e1..38c659f9 100644 --- a/resources/dictionaries/id/near.txt +++ b/resources/dictionaries/id/near.txt @@ -1,3 +1,3 @@ -sekitar +sekitar|sktr dalam|dalem|dlm|dlam dekat|dkt|dkat diff --git a/resources/dictionaries/id/nulls.txt b/resources/dictionaries/id/nulls.txt deleted file mode 100644 index 661b09b0..00000000 --- a/resources/dictionaries/id/nulls.txt +++ /dev/null @@ -1,2 +0,0 @@ -not applicable|n / a|na|n a -null \ No newline at end of file diff --git a/resources/dictionaries/id/personal_titles.txt b/resources/dictionaries/id/personal_titles.txt index 1a9b9460..09bee7fe 100644 --- a/resources/dictionaries/id/personal_titles.txt +++ b/resources/dictionaries/id/personal_titles.txt @@ -15,6 +15,7 @@ letnan jenderal|let jen|letjen|lt jn mas|bang|ms|bg mayor|may|myr mayor jenderal|may jen|mayjen +mas mbak menteri|mentri|mntr pangeran|pngrn @@ -25,6 +26,7 @@ prajurit|prjrt presiden|pres profesor|prof raden|rdn|r +raden mas|rm|r m raden rara|rdn rara|rr raja|rja ratu diff --git a/resources/dictionaries/id/place_names.txt b/resources/dictionaries/id/place_names.txt index cc21778f..ce7882ca 100644 --- a/resources/dictionaries/id/place_names.txt +++ b/resources/dictionaries/id/place_names.txt @@ -7,10 +7,8 @@ arkade auditorium bakery balai kota -ballroom bandar udara|bandara banjar|bjr -bank bank perkreditan rakyat|bpr bar barak @@ -21,37 +19,19 @@ benteng biara bioskop bistro -business park|bus pk|biz pk|bus prk|biz prk -cafe|café cagar alam -child care|childcare -cinema -cineplex -club|clb -clubhouse|club house -clubrooms -cottage|cott|cottg cuci mobil daerah danau -dance studio -day care dental dentist departmen|dep dermaga desa|ds -developmen distributor|dstr|distrib|dstrb -dojo dokter hewan -dormitory|dorm dusun|dsn -embassy farmasi -farm|frm -fitness center|fitness centre -flat|flt galeri galeri seni gampong|gp @@ -59,21 +39,10 @@ garasi gedung rekreasi gerbang gereja -golf club -gym|gymnasium -hall -headquarter -health center|health centre -hostel -hostel|host|hostl|hstel|hstl -hotel|hot|hotl|htel -hub -ice cream|icecream institut|inst istana jembatan kabupaten|kab -kafe|kafé kali kampung|kp|kpg|kmpg kampus @@ -91,7 +60,6 @@ kedai kedutaan kelompok bermain|kb kelurahan|kel -kennel kepolisian daerah|polda kepolisian resor|polres kepolisian sektor|polsek @@ -114,8 +82,6 @@ laboratorium lapangan lapangan golf lembaga pemasyarakatan -lounge -mall|mll mansion marina markas besar|mabes @@ -125,22 +91,14 @@ memorial|mem mesjid|masjid|msjd monumen motel|mot|motl|mtel -museum|mus -night club|nightclub -office|ofc -office tower|ofc twr|office twr|ofc tower pabrik|pbrk pandai besi pangkalan udara|lanud|lanud pantai|pante panti jompo|pnt jmp|pnti jmpo -parking -parking lot parkiran -park|pk|prk paroki pasar -pavilion paviliun|pav pediatric pelabuhan @@ -155,8 +113,6 @@ penjara perkebunan|kebun|kbn|kebon perpustakaan perserikatan -playgroup -plaza politeknik pom bensin pondok|pndok @@ -168,8 +124,6 @@ pusat kesehatan pusat pemulihan pusat perawatan pusat seni -reservation|res|resrv|resv|rsrv|rserv|rs -resort restoran ruangan rumah makan|rm @@ -177,27 +131,15 @@ rumah|rmh|rmah|rumh rumah sakit hewan|rs hewan rumah sakit|rs|r s rumah sakit wanita|rs wanita|r s wanita -salon -saloon -sauna sekolah dasar neger|sdn|sd n sekolah menengah atas negeri|sma n|sma sekolah menengah pertama negeri|smp n|smpn -shopping center|shoppingcenter|shoppingcentre|shopping|shoppingtown|shopping town|shopping centre|shctr|sh ctr|s / centre|shp / centre -shop|shp -showground sirkuit sirkus situ -social club -spa -stadium stasiun -steakhouse -studio suaka margasatwa nasional sungai -supermarket|super market tahanan taman taman kanak|tk @@ -210,13 +152,9 @@ terminal|term toko toko toko buku -tower|twr -townhouse|town house -unit unit gawat darurat|ugd unit geriatrik universitas|univ|uni -velodrome vila|vl|vla warung yoga diff --git a/resources/dictionaries/id/unit_types_numbered.txt b/resources/dictionaries/id/unit_types_numbered.txt index a430fae7..ebcab23c 100644 --- a/resources/dictionaries/id/unit_types_numbered.txt +++ b/resources/dictionaries/id/unit_types_numbered.txt @@ -1,4 +1,3 @@ -flat|flt garasi|grsi gedung|gdg|gd gudang|gdng @@ -11,23 +10,9 @@ kios pabrik|pabrk|pbrk parcel parkiran|park -penthouse|pths|ph|pent house -reserve|rsve|rsrv|rsv|resv -room|rm rukun tetangga|rt rukun warga|rw -shed|shd -shop|shp -showroom|shrm stasiun stop -studio|stu -suite|ste|se -tenancy|tncy toko|tko -tower|twr -townhouse|tnhs|twnhs|tnhse|twnhse -unit|un|unt|u vila|vla|vl -warehouse|we|whs|wrhs -workshop|wshp|wkshp|wksp diff --git a/resources/dictionaries/id/unit_types_standalone.txt b/resources/dictionaries/id/unit_types_standalone.txt index b012acde..1b40e282 100644 --- a/resources/dictionaries/id/unit_types_standalone.txt +++ b/resources/dictionaries/id/unit_types_standalone.txt @@ -1,11 +1,9 @@ +belakang bawah|belakang bwh|blkg bwh +belakang bawah|belakang bwh|blkg bwh +belakang|blkg depan|dpn garasi|grsi -lobby|lby|lbby -kiri bawah|kiri bwh|kri bwh kanan bawah|kanan bwh|knan bwh -penthouse|pths|ph|pent house -belakang|blkg -rear lower|rear lwr -belakang bawah|belakang bwh|blkg bwh -sisi - +kiri bawah|kiri bwh|kri bwh +lobi|lbi|lbbi +sisi|ssi From 03be9eea4938c5b0e46a772eb5eb8982a381a88f Mon Sep 17 00:00:00 2001 From: Yanuar Budi Baskoro Date: Sun, 21 May 2017 15:58:02 +0700 Subject: [PATCH 06/19] [dictionaries] Remove additional english words from ID dictionary --- resources/dictionaries/id/academic_degrees.txt | 3 --- resources/dictionaries/id/building_types.txt | 3 +-- resources/dictionaries/id/company_types.txt | 8 -------- resources/dictionaries/id/personal_titles.txt | 3 --- resources/dictionaries/id/place_names.txt | 12 ++---------- resources/dictionaries/id/qualifiers.txt | 18 +++++++++--------- resources/dictionaries/id/street_types.txt | 12 ------------ resources/dictionaries/id/synonyms.txt | 1 - 8 files changed, 12 insertions(+), 48 deletions(-) diff --git a/resources/dictionaries/id/academic_degrees.txt b/resources/dictionaries/id/academic_degrees.txt index de9908d3..e7b1aa2f 100644 --- a/resources/dictionaries/id/academic_degrees.txt +++ b/resources/dictionaries/id/academic_degrees.txt @@ -26,9 +26,6 @@ magister statistik|mstat|m stat magister teknik|mt|m t magister teknologi informasi|mti|m ti magister veteriner|mvet|m vet -master of arts|ma|m a -master of public health|mph|m p h -master of scienc|msc|m sc sarjana administrasi bisnis|sab|s ab sarjana administrasi publik|sap|s ap sarjana agama|s ag|sag|sa g|s a g diff --git a/resources/dictionaries/id/building_types.txt b/resources/dictionaries/id/building_types.txt index 0aab3bcb..72c28d78 100644 --- a/resources/dictionaries/id/building_types.txt +++ b/resources/dictionaries/id/building_types.txt @@ -4,5 +4,4 @@ gedung|gd|gdg gudang|gdang kebun|kebon|kbn rumah|rmah|rmh -tower|twr -villa|vlla|vl +vila|vla|vl diff --git a/resources/dictionaries/id/company_types.txt b/resources/dictionaries/id/company_types.txt index 511b124d..79b8bb61 100644 --- a/resources/dictionaries/id/company_types.txt +++ b/resources/dictionaries/id/company_types.txt @@ -1,13 +1,7 @@ asosiasi bank -company|co -cooperative|coop|co op -corporation|corp dan rekan|& rekan -enterprise|ent firma|fa|f a -foundation -general patnership|gp|g p koperasi|kop|kprs koperasi usaha dagang|kud|k ud nonprofit|non profit @@ -18,7 +12,5 @@ perusahaan daerah|pd perusahaan dagang|pd|p d perusahaan jawatan|pj|pjaw|p jaw perusahaan umum|perum|p u -trust -unlimited|ultd|unltd usaha dagang|ud|u d yayasan diff --git a/resources/dictionaries/id/personal_titles.txt b/resources/dictionaries/id/personal_titles.txt index 09bee7fe..2ae5643c 100644 --- a/resources/dictionaries/id/personal_titles.txt +++ b/resources/dictionaries/id/personal_titles.txt @@ -15,7 +15,6 @@ letnan jenderal|let jen|letjen|lt jn mas|bang|ms|bg mayor|may|myr mayor jenderal|may jen|mayjen -mas mbak menteri|mentri|mntr pangeran|pngrn @@ -30,7 +29,5 @@ raden mas|rm|r m raden rara|rdn rara|rr raja|rja ratu -reverend|rev -saint|st sersan|sers|sersn sultan|sltn diff --git a/resources/dictionaries/id/place_names.txt b/resources/dictionaries/id/place_names.txt index ce7882ca..bc39c334 100644 --- a/resources/dictionaries/id/place_names.txt +++ b/resources/dictionaries/id/place_names.txt @@ -5,14 +5,12 @@ amfiteater apartmen|apt arkade auditorium -bakery balai kota bandar udara|bandara banjar|bjr bank perkreditan rakyat|bpr bar barak -bazar bendungan benteng benteng @@ -23,8 +21,6 @@ cagar alam cuci mobil daerah danau -dental -dentist departmen|dep dermaga desa|ds @@ -34,7 +30,7 @@ dusun|dsn farmasi galeri galeri seni -gampong|gp +gampong|gp|gpg|gmpg garasi gedung rekreasi gerbang @@ -52,7 +48,7 @@ kantor|kntr kantor pos kantor pusat kapel -karang taruan +karang taruna kawasan industri kebun binatang|bonbin kecamatan|kec @@ -85,12 +81,9 @@ lembaga pemasyarakatan mansion marina markas besar|mabes -market|mkt|mrkt medik|med -memorial|mem mesjid|masjid|msjd monumen -motel|mot|motl|mtel pabrik|pbrk pandai besi pangkalan udara|lanud|lanud @@ -100,7 +93,6 @@ parkiran paroki pasar paviliun|pav -pediatric pelabuhan pemadam kebakaran pemakaman diff --git a/resources/dictionaries/id/qualifiers.txt b/resources/dictionaries/id/qualifiers.txt index 37fc2612..910dfd59 100644 --- a/resources/dictionaries/id/qualifiers.txt +++ b/resources/dictionaries/id/qualifiers.txt @@ -1,14 +1,14 @@ -blok|blk -gedung|gdg|gd -kampung|kmpg|kpg|kp -kampong|kmpg|kpg|kp -gampong|gmpg|gpg banjar|bjr +blok|blk +desa|ds|dsa +dusun|dsn|dsun +gampong|gmpg|gpg|gp +gedung|gdg|gd kabupaten|kab|kbptn|kbp +kampong|kmpg|kpg|kp +kampung|kmpg|kpg|kp kecamatan|kec|kcmntn|kcm kelurahan|kel|klrhn|klh -desa|ds -dusun|dsn kompleks|komp|kmplk -kota -pulau +kota|kta +pulau|plau|pl diff --git a/resources/dictionaries/id/street_types.txt b/resources/dictionaries/id/street_types.txt index a3ef2a02..98dc1498 100644 --- a/resources/dictionaries/id/street_types.txt +++ b/resources/dictionaries/id/street_types.txt @@ -1,18 +1,6 @@ alun-alun|alun alun|alunalun gang|gg jalan|jln|jl|jl. -jalan besar|jl besar|jl.besar|jln besar|jln.besar -jalan desa|jl desa|jl.desa|jln desa|jln.desa -jalan tol lingkar|jl tol lingkar|jl.tol lingkar|jln tol lingkar|jln.tol lingkar -jalan lingkar|jl lingkar|jl.lingkar|jln lingkar|jln.lingkar -jalan lintas|jl lintas|jl.lintas|jln lintas|jln.lintas -jalan pedesaan|jl pedesaan|jl.pedesaan|jln pedesaan|jln.pedesaan -jalan pemukiman|jl pemukiman|jl.pemukiman|jln pemukiman|jln.pemukiman -jalan poros|jl poros|jl.poros|jln poros|jln.poros -jalan raya|jl raya|jl.raya|jln raya|jln.raya -jalan tol|jl tol|jl.tol|jln tol|jln.tol -jalan utama|jl utama|jl.utama|jln utama|jln.utama -jalan raya|jl raya|jl. raya|jln raya|jln. raya jalur jembatan lorong|lrong diff --git a/resources/dictionaries/id/synonyms.txt b/resources/dictionaries/id/synonyms.txt index 557e679f..279aa8ed 100644 --- a/resources/dictionaries/id/synonyms.txt +++ b/resources/dictionaries/id/synonyms.txt @@ -15,7 +15,6 @@ kelurahan|klrhn mas|ms mbak|mba|mb medikal|med -memorial|mem militer|mil national|nasl|nas'l sungai|sngai From 695756d48421e1029f7b5e26e4682cd1e6463cc5 Mon Sep 17 00:00:00 2001 From: Yanuar Budi Baskoro Date: Sun, 21 May 2017 16:56:14 +0700 Subject: [PATCH 07/19] [dictionaries] add more option on toponyms --- resources/dictionaries/id/chains.txt | 6 +-- resources/dictionaries/id/company_types.txt | 4 +- resources/dictionaries/id/place_names.txt | 1 - resources/dictionaries/id/stopwords.txt | 2 +- resources/dictionaries/id/toponyms.txt | 44 ++++++++++----------- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/resources/dictionaries/id/chains.txt b/resources/dictionaries/id/chains.txt index 59781fe0..83d935eb 100644 --- a/resources/dictionaries/id/chains.txt +++ b/resources/dictionaries/id/chains.txt @@ -1,9 +1,9 @@ alfamart|alfamar indomart|indomaret -circle k|circlek +circle k|circlek|ck seven eleven|sevel carrefour -superindo -lottemart +superindo|super indo +lottemart|lotte mart bonjour bright diff --git a/resources/dictionaries/id/company_types.txt b/resources/dictionaries/id/company_types.txt index 79b8bb61..ba9c8b72 100644 --- a/resources/dictionaries/id/company_types.txt +++ b/resources/dictionaries/id/company_types.txt @@ -1,5 +1,4 @@ asosiasi -bank dan rekan|& rekan firma|fa|f a koperasi|kop|kprs @@ -10,7 +9,8 @@ perseroan komanditer|comanditaire venotschap|cv|c v perseroan terbatas|pt|p t perusahaan daerah|pd perusahaan dagang|pd|p d -perusahaan jawatan|pj|pjaw|p jaw +perusahaan jawatan|pj|pjaw|p jaw|p j +perusahaan otobus|po|p o perusahaan umum|perum|p u usaha dagang|ud|u d yayasan diff --git a/resources/dictionaries/id/place_names.txt b/resources/dictionaries/id/place_names.txt index bc39c334..9e2056d0 100644 --- a/resources/dictionaries/id/place_names.txt +++ b/resources/dictionaries/id/place_names.txt @@ -16,7 +16,6 @@ benteng benteng biara bioskop -bistro cagar alam cuci mobil daerah diff --git a/resources/dictionaries/id/stopwords.txt b/resources/dictionaries/id/stopwords.txt index 1703f14d..efa1719b 100644 --- a/resources/dictionaries/id/stopwords.txt +++ b/resources/dictionaries/id/stopwords.txt @@ -1,4 +1,4 @@ -berlawanan|lawanan +berlawanan|lawanan|lwnn dan|dn|n dari|dr dekat|dkt diff --git a/resources/dictionaries/id/toponyms.txt b/resources/dictionaries/id/toponyms.txt index ddba1aef..14236e5f 100644 --- a/resources/dictionaries/id/toponyms.txt +++ b/resources/dictionaries/id/toponyms.txt @@ -1,34 +1,34 @@ bali -bangka belitung|babel +bangka belitung|babel|ba bel banten bengkulu -daerah istimewa yogyakarta|di yogyakarta|d.i. yogyakarta|diy +daerah istimewa yogyakarta|di yogyakarta|d.i. yogyakarta|diy|d i y dki jakarta|jakarta gorontalo jambi -jawa barat|jabar -jawa tengah|jateng -jawa timur|jatim -kalimantan barat|kalbar -kalimantan selatan|kalsel -kalimantan tengah|kalteng -kalimantan timur|kaltim -kalimantan utara|kalut -kepulauan riau|kepri +jawa barat|jabar|ja bar +jawa tengah|jateng|ja teng +jawa timur|jatim|ja tim +kalimantan barat|kalbar|kal bar +kalimantan selatan|kalsel|kal sel +kalimantan tengah|kalteng|kal teng +kalimantan timur|kaltim|kal tim +kalimantan utara|kalut|kal ut +kepulauan riau|kepri|kep ri lampung maluku -maluku utara|malut +maluku utara|malut|mal ut nanggroe aceh darussalam|nanggro aceh darussalam|nanggroe aceh darusalam|nad|n a d -nusa tenggara barat|ntb -nusa tenggara timur|ntt +nusa tenggara barat|ntb|n t b +nusa tenggara timur|ntt|n t t papua papua barat riau -sulawesi barat|sulbar -sulawesi selatan|sulsel -sulawesi tengah|sulteng -sulawesi tenggara|sultra -sulawesi utara|sulut -sumatera barat|sumatra barat|sumbar -sumatera selatan|sumatra selatan|sumsel -sumatera utara|sumatra utara|sumut +sulawesi barat|sulbar|sul bar +sulawesi selatan|sulsel|sul sel +sulawesi tengah|sulteng|sul teng +sulawesi tenggara|sultra|sul tra +sulawesi utara|sulut|sul ut +sumatera barat|sumatra barat|sumbar|sum bar +sumatera selatan|sumatra selatan|sumsel|sum sel +sumatera utara|sumatra utara|sumut|sum ut From 08524f4b0708d934115117ab420355f42de7ba18 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 17:25:59 -0400 Subject: [PATCH 08/19] [dictionaries] moving some of the existing chain stores for Indonesia to the all/chains.txt dictionary --- resources/dictionaries/all/ambiguous_expansions.txt | 1 + resources/dictionaries/all/chains.txt | 6 +++--- resources/dictionaries/id/chains.txt | 4 ---- 3 files changed, 4 insertions(+), 7 deletions(-) create mode 100644 resources/dictionaries/all/ambiguous_expansions.txt diff --git a/resources/dictionaries/all/ambiguous_expansions.txt b/resources/dictionaries/all/ambiguous_expansions.txt new file mode 100644 index 00000000..1f630bb0 --- /dev/null +++ b/resources/dictionaries/all/ambiguous_expansions.txt @@ -0,0 +1 @@ +ck diff --git a/resources/dictionaries/all/chains.txt b/resources/dictionaries/all/chains.txt index 8725646b..5974e524 100644 --- a/resources/dictionaries/all/chains.txt +++ b/resources/dictionaries/all/chains.txt @@ -1,4 +1,4 @@ -7-eleven|7 eleven|7-11|seven-eleven|seven eleven|seveneleven|seven-11|seven 11|7-elevens|7 elevens|7-11s|seven-elevens|seven elevens|sevenelevens|seven-11s|seven 11s +7-eleven|7 eleven|7-11|seven-eleven|seven eleven|seveneleven|seven-11|seven 11|7-elevens|7 elevens|7-11s|seven-elevens|seven elevens|sevenelevens|seven-11s|seven 11s|sevel a&w|a & w|a and w|a&ws|a & ws|a and ws|a&w restaurants|a & w restaurants|a and w restaurants ace hardware|ace hardwares adidas @@ -9,7 +9,7 @@ albertsons|albertson's aldi aldi nord aldi süd|aldi sued -alfamart +alfamart|alfamar allianz alpha bank anz @@ -108,7 +108,7 @@ chuck e. cheese's|chuck e cheese's|chuck e. cheeses|chuck e cheeses|chuck e. che church's chicken|churchs chicken cibc|canadian imperial bank of commerce cici's pizza|ci ci's pizza|cici's|cicis pizza|cicis -circle k|circle-k +circle k|circle-k|circlek|ck citgo citibank|citi bank|citibanks|citi banks citroën|citroen diff --git a/resources/dictionaries/id/chains.txt b/resources/dictionaries/id/chains.txt index 83d935eb..013777b4 100644 --- a/resources/dictionaries/id/chains.txt +++ b/resources/dictionaries/id/chains.txt @@ -1,8 +1,4 @@ -alfamart|alfamar indomart|indomaret -circle k|circlek|ck -seven eleven|sevel -carrefour superindo|super indo lottemart|lotte mart bonjour From 52593c6374d89225fbf4176158fea5223b3a47c2 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 17:27:11 -0400 Subject: [PATCH 09/19] [dictionaries] remove nonprofit from Indonesian company types --- resources/dictionaries/id/company_types.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/resources/dictionaries/id/company_types.txt b/resources/dictionaries/id/company_types.txt index ba9c8b72..61f017ff 100644 --- a/resources/dictionaries/id/company_types.txt +++ b/resources/dictionaries/id/company_types.txt @@ -3,7 +3,6 @@ dan rekan|& rekan firma|fa|f a koperasi|kop|kprs koperasi usaha dagang|kud|k ud -nonprofit|non profit persekutuan perseroan komanditer|comanditaire venotschap|cv|c v perseroan terbatas|pt|p t From 83378049ee461fd13e1396685079767840ef0f99 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 17:35:53 -0400 Subject: [PATCH 10/19] [dictionaries] remove Doktor from academic degrees in Indonesian dictionaries --- resources/dictionaries/id/academic_degrees.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/resources/dictionaries/id/academic_degrees.txt b/resources/dictionaries/id/academic_degrees.txt index e7b1aa2f..bf61a2c2 100644 --- a/resources/dictionaries/id/academic_degrees.txt +++ b/resources/dictionaries/id/academic_degrees.txt @@ -1,4 +1,3 @@ -doktor|dr magister administrasi bisnis|mab|m ab magister administrasi publi|map|m ap magister administrasi rumah sakit|mars|m a r s From 364b00da01f14157aa3d185e0dbd43acf500991f Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 17:46:45 -0400 Subject: [PATCH 11/19] [dictionaries] separating Mas and Abang --- resources/dictionaries/id/personal_titles.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/dictionaries/id/personal_titles.txt b/resources/dictionaries/id/personal_titles.txt index 2ae5643c..f1695011 100644 --- a/resources/dictionaries/id/personal_titles.txt +++ b/resources/dictionaries/id/personal_titles.txt @@ -1,3 +1,4 @@ +abang|bang|bg brigadir jendral|brig jen|brigjen bapak|pak doktor|dr|dok @@ -12,7 +13,7 @@ komandan letnan|kmndn let|kmd lt letnan|letn|lt letnan kolonel|letkol|lt kol|lt kl letnan jenderal|let jen|letjen|lt jn -mas|bang|ms|bg +mas|ms mayor|may|myr mayor jenderal|may jen|mayjen mbak From 8a35cfcd80fed268ecbb742e5fa8eb1ea956e037 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 17:50:25 -0400 Subject: [PATCH 12/19] [dictionaries] removing level/platform/podium from Indonesian level types --- resources/dictionaries/id/level_types_numbered.txt | 2 -- resources/dictionaries/id/level_types_standalone.txt | 1 - 2 files changed, 3 deletions(-) diff --git a/resources/dictionaries/id/level_types_numbered.txt b/resources/dictionaries/id/level_types_numbered.txt index 8ec779f4..943aad24 100644 --- a/resources/dictionaries/id/level_types_numbered.txt +++ b/resources/dictionaries/id/level_types_numbered.txt @@ -1,3 +1 @@ lantai|lt|ltai|lt. -level|lev|levl|lvel|lvl|l|/ l -platform|pf diff --git a/resources/dictionaries/id/level_types_standalone.txt b/resources/dictionaries/id/level_types_standalone.txt index 668b6039..0b8e8120 100644 --- a/resources/dictionaries/id/level_types_standalone.txt +++ b/resources/dictionaries/id/level_types_standalone.txt @@ -1,4 +1,3 @@ lantai atas|lantai ats|lt ats|lt. ats|lt. atas lantai dasar|lt dsr|lt dsar|lt. dsr|lt. dasar lobi -podium|pd From 77365a56a53e5679a8bb540c0c752343908fcfea Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 17:51:15 -0400 Subject: [PATCH 13/19] [dictionaries] removing no fixed address from Indonesian dictionaries --- resources/dictionaries/id/no_number.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 resources/dictionaries/id/no_number.txt diff --git a/resources/dictionaries/id/no_number.txt b/resources/dictionaries/id/no_number.txt deleted file mode 100644 index 8ef9c661..00000000 --- a/resources/dictionaries/id/no_number.txt +++ /dev/null @@ -1 +0,0 @@ -no fixed address|nfa|n f a|n / f / a \ No newline at end of file From ec79c610ebce297c09ee79327e4e49764de6b538 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 17:55:59 -0400 Subject: [PATCH 14/19] [dictionaries] removing a few English words and dupes from Indonesian place names --- resources/dictionaries/id/place_names.txt | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/resources/dictionaries/id/place_names.txt b/resources/dictionaries/id/place_names.txt index 9e2056d0..809eead6 100644 --- a/resources/dictionaries/id/place_names.txt +++ b/resources/dictionaries/id/place_names.txt @@ -39,7 +39,6 @@ istana jembatan kabupaten|kab kali -kampung|kp|kpg|kmpg kampus kandang kantor bupati @@ -58,9 +57,7 @@ kelurahan|kel kepolisian daerah|polda kepolisian resor|polres kepolisian sektor|polsek -kindergarten kios -kitchen klinik kolam renang komite @@ -72,12 +69,10 @@ kos|kosan kota krematorium kuburan -lab -laboratorium +laboratorium|lab lapangan lapangan golf lembaga pemasyarakatan -mansion marina markas besar|mabes medik|med @@ -141,7 +136,6 @@ tempat pertunjukan tepi pantai terminal|term toko -toko toko buku unit gawat darurat|ugd unit geriatrik From 4df48fb412adaa5df7a80a6a670fb9e0bf1b302b Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 17:57:34 -0400 Subject: [PATCH 15/19] [dictionaries] moving Kampong to normalize to Kampung in Indonesian, better if there's one canonical form --- resources/dictionaries/id/qualifiers.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/resources/dictionaries/id/qualifiers.txt b/resources/dictionaries/id/qualifiers.txt index 910dfd59..6074d89c 100644 --- a/resources/dictionaries/id/qualifiers.txt +++ b/resources/dictionaries/id/qualifiers.txt @@ -5,8 +5,7 @@ dusun|dsn|dsun gampong|gmpg|gpg|gp gedung|gdg|gd kabupaten|kab|kbptn|kbp -kampong|kmpg|kpg|kp -kampung|kmpg|kpg|kp +kampung|kmpg|kpg|kp|kampong kecamatan|kec|kcmntn|kcm kelurahan|kel|klrhn|klh kompleks|komp|kmplk From 4b24699e1f8bf3b5be05a6e1855ff922b6c682b4 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 18:00:20 -0400 Subject: [PATCH 16/19] [fix] changing national to nasional in Indonesian --- resources/dictionaries/id/synonyms.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/dictionaries/id/synonyms.txt b/resources/dictionaries/id/synonyms.txt index 279aa8ed..9490c227 100644 --- a/resources/dictionaries/id/synonyms.txt +++ b/resources/dictionaries/id/synonyms.txt @@ -16,7 +16,7 @@ mas|ms mbak|mba|mb medikal|med militer|mil -national|nasl|nas'l +nasional|nasl|nas'l sungai|sngai tanjung|tjg utama|utm From f5071024571aaa95ea759664b8ed4b85d0d13b41 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 18:01:38 -0400 Subject: [PATCH 17/19] [dictionaries] removing English words from Indonesian unit types --- resources/dictionaries/id/unit_types_numbered.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/resources/dictionaries/id/unit_types_numbered.txt b/resources/dictionaries/id/unit_types_numbered.txt index ebcab23c..e2f9b444 100644 --- a/resources/dictionaries/id/unit_types_numbered.txt +++ b/resources/dictionaries/id/unit_types_numbered.txt @@ -8,11 +8,9 @@ kavling|kav|kv kebun|kebon|kbn kios pabrik|pabrk|pbrk -parcel parkiran|park rukun tetangga|rt rukun warga|rw stasiun -stop toko|tko vila|vla|vl From 3b5b5d8baa0edd992c7bf50fc0b7e0bfa819e31a Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 23 May 2017 18:04:09 -0400 Subject: [PATCH 18/19] [dictionaries] adding ambiguous expansions for all Indonesian abbreviations 1-2 characters as they could also be initials, etc. --- .../dictionaries/id/ambiguous_expansions.txt | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 resources/dictionaries/id/ambiguous_expansions.txt diff --git a/resources/dictionaries/id/ambiguous_expansions.txt b/resources/dictionaries/id/ambiguous_expansions.txt new file mode 100644 index 00000000..02700b38 --- /dev/null +++ b/resources/dictionaries/id/ambiguous_expansions.txt @@ -0,0 +1,63 @@ +bg +bu +di +dn +dr +ds +fa +f a +gd +gg +gn +gp +jl +kb +ke +kp +kv +l +lt +mb +mh +m h +mm +m m +ms +mt +m t +n +no +nr +pd +p d +pj +p j +pl +po +p o +pt +p t +pu +p u +r +rm +r m +rr +r r +rs +r s +rt +rw +se +s e +sh +s h +sp +s p +ss +s s +st +s t +ud +u d +vl From 1948634bf3ad1a305698336832592996f56eb9b2 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 24 May 2017 00:26:32 -0400 Subject: [PATCH 19/19] [dictionaries] adding a separable prefix for Jl. and Jln. so things like Jl.Utara get separated and expanded --- resources/dictionaries/id/concatenated_prefixes_separable.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 resources/dictionaries/id/concatenated_prefixes_separable.txt diff --git a/resources/dictionaries/id/concatenated_prefixes_separable.txt b/resources/dictionaries/id/concatenated_prefixes_separable.txt new file mode 100644 index 00000000..3f4d6c59 --- /dev/null +++ b/resources/dictionaries/id/concatenated_prefixes_separable.txt @@ -0,0 +1,2 @@ +jl. +jln.