diff --git a/resources/dictionaries/all/chains.txt b/resources/dictionaries/all/chains.txt index 5974e524..8eeca1b4 100644 --- a/resources/dictionaries/all/chains.txt +++ b/resources/dictionaries/all/chains.txt @@ -1,4 +1,4 @@ -7-eleven|7 eleven|7-11|seven-eleven|seven eleven|seveneleven|seven-11|seven 11|7-elevens|7 elevens|7-11s|seven-elevens|seven elevens|sevenelevens|seven-11s|seven 11s|sevel +7-eleven|7 eleven|7-11|seven-eleven|seven eleven|seveneleven|seven-11|seven 11|7-elevens|7 elevens|7-11s|seven-elevens|seven elevens|sevenelevens|seven-11s|seven 11s|sevel|7 11 a&w|a & w|a and w|a&ws|a & ws|a and ws|a&w restaurants|a & w restaurants|a and w restaurants ace hardware|ace hardwares adidas diff --git a/resources/dictionaries/ca/ambiguous_expansions.txt b/resources/dictionaries/ca/ambiguous_expansions.txt index 60685bd0..669e46b8 100644 --- a/resources/dictionaries/ca/ambiguous_expansions.txt +++ b/resources/dictionaries/ca/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& b d e diff --git a/resources/dictionaries/ca/stopwords.txt b/resources/dictionaries/ca/stopwords.txt index fbc06ff6..eeb8905a 100644 --- a/resources/dictionaries/ca/stopwords.txt +++ b/resources/dictionaries/ca/stopwords.txt @@ -13,7 +13,7 @@ el els es entre -i +i|& l' la les diff --git a/resources/dictionaries/cs/ambiguous_expansions.txt b/resources/dictionaries/cs/ambiguous_expansions.txt index 9eae731c..157fc657 100644 --- a/resources/dictionaries/cs/ambiguous_expansions.txt +++ b/resources/dictionaries/cs/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c j s diff --git a/resources/dictionaries/cs/stopwords.txt b/resources/dictionaries/cs/stopwords.txt index 2e65efe2..21d17436 100644 --- a/resources/dictionaries/cs/stopwords.txt +++ b/resources/dictionaries/cs/stopwords.txt @@ -1 +1 @@ -a \ No newline at end of file +a|& \ No newline at end of file diff --git a/resources/dictionaries/da/ambiguous_expansions.txt b/resources/dictionaries/da/ambiguous_expansions.txt index d0990461..cb32539b 100644 --- 
a/resources/dictionaries/da/ambiguous_expansions.txt +++ b/resources/dictionaries/da/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c n o diff --git a/resources/dictionaries/da/stopwords.txt b/resources/dictionaries/da/stopwords.txt new file mode 100644 index 00000000..fbda6bfa --- /dev/null +++ b/resources/dictionaries/da/stopwords.txt @@ -0,0 +1 @@ +og|& \ No newline at end of file diff --git a/resources/dictionaries/de/ambiguous_expansions.txt b/resources/dictionaries/de/ambiguous_expansions.txt index eaf4cbbf..63700fcc 100644 --- a/resources/dictionaries/de/ambiguous_expansions.txt +++ b/resources/dictionaries/de/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& a b ch diff --git a/resources/dictionaries/en/ambiguous_expansions.txt b/resources/dictionaries/en/ambiguous_expansions.txt index dad6c2b7..5f4ad757 100644 --- a/resources/dictionaries/en/ambiguous_expansions.txt +++ b/resources/dictionaries/en/ambiguous_expansions.txt @@ -1,45 +1,92 @@ +& aat act ab -al +abby ak +al +alee +ally +aly ar az +ant +app +apt arc +art +arty +ba bc +bot +byu c ca +carp +cause +ce co +col +con +coop +cor +cowy ct de dc +div +divers +d +doc +dup e +elb +ex f +fit fl +form +fry g ga +gen +gra +h hi +hon i id il +imp in ia +is j jbt +k ks ky l la +lit +low +lynn +m ma me mb md +mem mi +miss +mid +mil +mun mn +mr ms mo mt -m n nb nc @@ -58,35 +105,71 @@ nw nwt nv ny +o oh on ok or +out +p pa +pass pe pei +plat +pur +q qc qld +quad r +ra +ran +rep +reps +rev ri ro +row +rowy s +sa sc sd se +sec +sect +sen +sh +shun sk sw +t tas +thick +thro tn +tri tx +tun +u +up ut un -vic -vt +v va +via +vic +vill +vis +vt w wa wv wi +wy +wyn +x +y yt -wy \ No newline at end of file +z \ No newline at end of file diff --git a/resources/dictionaries/en/company_types.txt b/resources/dictionaries/en/company_types.txt index 3af64e20..5d2f36fa 100644 --- a/resources/dictionaries/en/company_types.txt +++ b/resources/dictionaries/en/company_types.txt @@ -1,3 +1,5 @@ +associates|assoc +association|assoc bank b 
corporation|b corp|bcorp charitable incorporated organization|cio|c i o @@ -34,7 +36,7 @@ limited liability limited partnership|lllp|l l l p limited liability partnership|llp|l l p limited partnership|lp|l p look through company|look through co|lookthrough company|lookthrough co|ltc -national association|na|n a +national association|na|n a|nat assoc|natl assoc national trust and savings association|national trust & savings association|nt & sa|nt and sa|nt sa|ntsa no liability|nl|n l nonprofit|non profit diff --git a/resources/dictionaries/en/place_names.txt b/resources/dictionaries/en/place_names.txt index 004adcc6..58baf28b 100644 --- a/resources/dictionaries/en/place_names.txt +++ b/resources/dictionaries/en/place_names.txt @@ -252,6 +252,8 @@ salon sanctuary|sanct sauna secondary school +service|svc +services|svcs|svc shelter sheriff's department|sherrifs department|sheriff's dept|sherrifs dept sherrif's office|sherffis office|sheriff's ofc|sheriffs ofc @@ -267,6 +269,7 @@ stadium station|sta|stn steakhouse store|stor +stores studio studios subdivision diff --git a/resources/dictionaries/en/stopwords.txt b/resources/dictionaries/en/stopwords.txt index 812b21f1..c88da481 100644 --- a/resources/dictionaries/en/stopwords.txt +++ b/resources/dictionaries/en/stopwords.txt @@ -1,10 +1,14 @@ -and +a +and|& all at between|betw|btwn|btw|btween|b / t by +for +in of +on the to via -opposite \ No newline at end of file +opposite|opp \ No newline at end of file diff --git a/resources/dictionaries/en/synonyms.txt b/resources/dictionaries/en/synonyms.txt index 98f09a4a..b6ac8907 100644 --- a/resources/dictionaries/en/synonyms.txt +++ b/resources/dictionaries/en/synonyms.txt @@ -18,6 +18,8 @@ greater|grtr|gtr greens|grns groves|grvs heights|hghts|hgts|hieghts|ht|hts|hgths +hill|hl +hills|hls international|intl|int'l lake|lk lakes|lks diff --git a/resources/dictionaries/es/ambiguous_expansions.txt b/resources/dictionaries/es/ambiguous_expansions.txt index 8b443427..0ca210eb 
100644 --- a/resources/dictionaries/es/ambiguous_expansions.txt +++ b/resources/dictionaries/es/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c cr d diff --git a/resources/dictionaries/es/stopwords.txt b/resources/dictionaries/es/stopwords.txt index 206d8773..4c309c30 100644 --- a/resources/dictionaries/es/stopwords.txt +++ b/resources/dictionaries/es/stopwords.txt @@ -26,4 +26,4 @@ por sin un una -y \ No newline at end of file +y|& \ No newline at end of file diff --git a/resources/dictionaries/et/ambiguous_expansions.txt b/resources/dictionaries/et/ambiguous_expansions.txt index d97bc2d4..538bfca6 100644 --- a/resources/dictionaries/et/ambiguous_expansions.txt +++ b/resources/dictionaries/et/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& k l p diff --git a/resources/dictionaries/et/stopwords.txt b/resources/dictionaries/et/stopwords.txt new file mode 100644 index 00000000..fa41c60f --- /dev/null +++ b/resources/dictionaries/et/stopwords.txt @@ -0,0 +1 @@ +ja|& \ No newline at end of file diff --git a/resources/dictionaries/eu/ambiguous_expansions.txt b/resources/dictionaries/eu/ambiguous_expansions.txt index 23fa7d31..553961dd 100644 --- a/resources/dictionaries/eu/ambiguous_expansions.txt +++ b/resources/dictionaries/eu/ambiguous_expansions.txt @@ -1 +1,2 @@ +& k \ No newline at end of file diff --git a/resources/dictionaries/eu/stopwords.txt b/resources/dictionaries/eu/stopwords.txt new file mode 100644 index 00000000..6aafbc3a --- /dev/null +++ b/resources/dictionaries/eu/stopwords.txt @@ -0,0 +1 @@ +eta|& \ No newline at end of file diff --git a/resources/dictionaries/fi/ambiguous_expansions.txt b/resources/dictionaries/fi/ambiguous_expansions.txt index 48370ef9..6dc2a2f1 100644 --- a/resources/dictionaries/fi/ambiguous_expansions.txt +++ b/resources/dictionaries/fi/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& k p r diff --git a/resources/dictionaries/fi/stopwords.txt b/resources/dictionaries/fi/stopwords.txt new file mode 100644 index 00000000..fa41c60f --- 
/dev/null +++ b/resources/dictionaries/fi/stopwords.txt @@ -0,0 +1 @@ +ja|& \ No newline at end of file diff --git a/resources/dictionaries/fr/ambiguous_expansions.txt b/resources/dictionaries/fr/ambiguous_expansions.txt index ca1ae415..e4ebf822 100644 --- a/resources/dictionaries/fr/ambiguous_expansions.txt +++ b/resources/dictionaries/fr/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& a ab bc diff --git a/resources/dictionaries/fr/stopwords.txt b/resources/dictionaries/fr/stopwords.txt index 7d323c41..5d19da08 100644 --- a/resources/dictionaries/fr/stopwords.txt +++ b/resources/dictionaries/fr/stopwords.txt @@ -15,7 +15,7 @@ du en en face de entre -et +et|& l' la le diff --git a/resources/dictionaries/gl/ambiguous_expansions.txt b/resources/dictionaries/gl/ambiguous_expansions.txt index 4634c792..127bded4 100644 --- a/resources/dictionaries/gl/ambiguous_expansions.txt +++ b/resources/dictionaries/gl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& e n o diff --git a/resources/dictionaries/gl/stopwords.txt b/resources/dictionaries/gl/stopwords.txt index 65cd1a9d..dfaea074 100644 --- a/resources/dictionaries/gl/stopwords.txt +++ b/resources/dictionaries/gl/stopwords.txt @@ -15,7 +15,7 @@ deles delas detras do -é +e|& en encima enfronte diff --git a/resources/dictionaries/hr/ambiguous_expansions.txt b/resources/dictionaries/hr/ambiguous_expansions.txt index 9e1e58ce..dcfd416e 100644 --- a/resources/dictionaries/hr/ambiguous_expansions.txt +++ b/resources/dictionaries/hr/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c i j diff --git a/resources/dictionaries/hr/stopwords.txt b/resources/dictionaries/hr/stopwords.txt new file mode 100644 index 00000000..34661a71 --- /dev/null +++ b/resources/dictionaries/hr/stopwords.txt @@ -0,0 +1 @@ +i|& \ No newline at end of file diff --git a/resources/dictionaries/hu/ambiguous_expansions.txt b/resources/dictionaries/hu/ambiguous_expansions.txt index 367eb051..88190f16 100644 --- a/resources/dictionaries/hu/ambiguous_expansions.txt +++ 
b/resources/dictionaries/hu/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& d e k diff --git a/resources/dictionaries/hu/stopwords.txt b/resources/dictionaries/hu/stopwords.txt index b426abeb..6545dae0 100644 --- a/resources/dictionaries/hu/stopwords.txt +++ b/resources/dictionaries/hu/stopwords.txt @@ -1,4 +1,4 @@ a az egy -és|es \ No newline at end of file +és|es|& \ No newline at end of file diff --git a/resources/dictionaries/id/ambiguous_expansions.txt b/resources/dictionaries/id/ambiguous_expansions.txt index 02700b38..fba1c684 100644 --- a/resources/dictionaries/id/ambiguous_expansions.txt +++ b/resources/dictionaries/id/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& bg bu di diff --git a/resources/dictionaries/id/concatenated_prefixes_separable.txt b/resources/dictionaries/id/concatenated_prefixes_separable.txt deleted file mode 100644 index 3f4d6c59..00000000 --- a/resources/dictionaries/id/concatenated_prefixes_separable.txt +++ /dev/null @@ -1,2 +0,0 @@ -jl. -jln. diff --git a/resources/dictionaries/id/stopwords.txt b/resources/dictionaries/id/stopwords.txt index efa1719b..ee774698 100644 --- a/resources/dictionaries/id/stopwords.txt +++ b/resources/dictionaries/id/stopwords.txt @@ -1,5 +1,5 @@ berlawanan|lawanan|lwnn -dan|dn|n +dan|dn|n|en|& dari|dr dekat|dkt di diff --git a/resources/dictionaries/is/ambiguous_expansions.txt b/resources/dictionaries/is/ambiguous_expansions.txt index e2e18f37..65a14de0 100644 --- a/resources/dictionaries/is/ambiguous_expansions.txt +++ b/resources/dictionaries/is/ambiguous_expansions.txt @@ -1,4 +1,6 @@ +& a n +og s v \ No newline at end of file diff --git a/resources/dictionaries/is/stopwords.txt b/resources/dictionaries/is/stopwords.txt new file mode 100644 index 00000000..fbda6bfa --- /dev/null +++ b/resources/dictionaries/is/stopwords.txt @@ -0,0 +1 @@ +og|& \ No newline at end of file diff --git a/resources/dictionaries/it/ambiguous_expansions.txt b/resources/dictionaries/it/ambiguous_expansions.txt index 
1bcea6fb..a2723dc9 100644 --- a/resources/dictionaries/it/ambiguous_expansions.txt +++ b/resources/dictionaries/it/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c e l diff --git a/resources/dictionaries/it/stopwords.txt b/resources/dictionaries/it/stopwords.txt index 756c3ef7..1df9448f 100644 --- a/resources/dictionaries/it/stopwords.txt +++ b/resources/dictionaries/it/stopwords.txt @@ -24,6 +24,7 @@ dell' dentro|d.tro|dtro di d' +e|& fuori gli i diff --git a/resources/dictionaries/ka/ambiguous_expansions.txt b/resources/dictionaries/ka/ambiguous_expansions.txt index 926a453d..ea2bb480 100644 --- a/resources/dictionaries/ka/ambiguous_expansions.txt +++ b/resources/dictionaries/ka/ambiguous_expansions.txt @@ -1 +1,2 @@ +& ქ \ No newline at end of file diff --git a/resources/dictionaries/ka/stopwords.txt b/resources/dictionaries/ka/stopwords.txt index 01648f46..217f7039 100644 --- a/resources/dictionaries/ka/stopwords.txt +++ b/resources/dictionaries/ka/stopwords.txt @@ -1 +1 @@ -და \ No newline at end of file +და|& \ No newline at end of file diff --git a/resources/dictionaries/lt/ambiguous_expansions.txt b/resources/dictionaries/lt/ambiguous_expansions.txt index bd110234..04b336c5 100644 --- a/resources/dictionaries/lt/ambiguous_expansions.txt +++ b/resources/dictionaries/lt/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& a g k diff --git a/resources/dictionaries/lt/stopwords.txt b/resources/dictionaries/lt/stopwords.txt new file mode 100644 index 00000000..06eeff5e --- /dev/null +++ b/resources/dictionaries/lt/stopwords.txt @@ -0,0 +1 @@ +ir|& \ No newline at end of file diff --git a/resources/dictionaries/lv/ambiguous_expansions.txt b/resources/dictionaries/lv/ambiguous_expansions.txt index 9cae39b7..3dd31f68 100644 --- a/resources/dictionaries/lv/ambiguous_expansions.txt +++ b/resources/dictionaries/lv/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& a d g diff --git a/resources/dictionaries/lv/stopwords.txt b/resources/dictionaries/lv/stopwords.txt new file mode 100644 
index 00000000..c4409fcb --- /dev/null +++ b/resources/dictionaries/lv/stopwords.txt @@ -0,0 +1 @@ +un|& \ No newline at end of file diff --git a/resources/dictionaries/ms/ambiguous_expansions.txt b/resources/dictionaries/ms/ambiguous_expansions.txt new file mode 100644 index 00000000..00b15c0a --- /dev/null +++ b/resources/dictionaries/ms/ambiguous_expansions.txt @@ -0,0 +1 @@ +& \ No newline at end of file diff --git a/resources/dictionaries/ms/stopwords.txt b/resources/dictionaries/ms/stopwords.txt new file mode 100644 index 00000000..ff6fa2f8 --- /dev/null +++ b/resources/dictionaries/ms/stopwords.txt @@ -0,0 +1 @@ +dan|& \ No newline at end of file diff --git a/resources/dictionaries/mt/ambiguous_expansions.txt b/resources/dictionaries/mt/ambiguous_expansions.txt new file mode 100644 index 00000000..00b15c0a --- /dev/null +++ b/resources/dictionaries/mt/ambiguous_expansions.txt @@ -0,0 +1 @@ +& \ No newline at end of file diff --git a/resources/dictionaries/mt/stopwords.txt b/resources/dictionaries/mt/stopwords.txt index bc46bf88..059d1b8d 100644 --- a/resources/dictionaries/mt/stopwords.txt +++ b/resources/dictionaries/mt/stopwords.txt @@ -1,4 +1,5 @@ il is ta -tar \ No newline at end of file +tar +u|& \ No newline at end of file diff --git a/resources/dictionaries/nb/ambiguous_expansions.txt b/resources/dictionaries/nb/ambiguous_expansions.txt index f959e6b2..58ef95fb 100644 --- a/resources/dictionaries/nb/ambiguous_expansions.txt +++ b/resources/dictionaries/nb/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& g h k diff --git a/resources/dictionaries/nb/stopwords.txt b/resources/dictionaries/nb/stopwords.txt index 519fb87f..c19c7b3b 100644 --- a/resources/dictionaries/nb/stopwords.txt +++ b/resources/dictionaries/nb/stopwords.txt @@ -23,7 +23,7 @@ naer nærmest naermest nest -og +og|& overfor over på diff --git a/resources/dictionaries/nl/ambiguous_expansions.txt b/resources/dictionaries/nl/ambiguous_expansions.txt index 27dd82a3..a1123e76 100644 --- 
a/resources/dictionaries/nl/ambiguous_expansions.txt +++ b/resources/dictionaries/nl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& b h k diff --git a/resources/dictionaries/nl/stopwords.txt b/resources/dictionaries/nl/stopwords.txt index 3f5ff96b..b4c04a01 100644 --- a/resources/dictionaries/nl/stopwords.txt +++ b/resources/dictionaries/nl/stopwords.txt @@ -9,7 +9,7 @@ der die dit een -en +en|& hem het hoe diff --git a/resources/dictionaries/pl/ambiguous_expansions.txt b/resources/dictionaries/pl/ambiguous_expansions.txt index 72a9695d..8e4c26a3 100644 --- a/resources/dictionaries/pl/ambiguous_expansions.txt +++ b/resources/dictionaries/pl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& d g k diff --git a/resources/dictionaries/pl/stopwords.txt b/resources/dictionaries/pl/stopwords.txt index 7d40e787..bd5a4769 100644 --- a/resources/dictionaries/pl/stopwords.txt +++ b/resources/dictionaries/pl/stopwords.txt @@ -1,3 +1,3 @@ -i +i|& na w \ No newline at end of file diff --git a/resources/dictionaries/pt/ambiguous_expansions.txt b/resources/dictionaries/pt/ambiguous_expansions.txt index bdf461fa..c6bbdd08 100644 --- a/resources/dictionaries/pt/ambiguous_expansions.txt +++ b/resources/dictionaries/pt/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& b d e diff --git a/resources/dictionaries/pt/stopwords.txt b/resources/dictionaries/pt/stopwords.txt index 7e0d5fe8..0bbaec62 100644 --- a/resources/dictionaries/pt/stopwords.txt +++ b/resources/dictionaries/pt/stopwords.txt @@ -14,7 +14,7 @@ debaixo defronte do dos -e +e|& em em frente de|em ft de entre diff --git a/resources/dictionaries/ro/ambiguous_expansions.txt b/resources/dictionaries/ro/ambiguous_expansions.txt index b9c9a54f..b663b501 100644 --- a/resources/dictionaries/ro/ambiguous_expansions.txt +++ b/resources/dictionaries/ro/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& e n s diff --git a/resources/dictionaries/ro/stopwords.txt b/resources/dictionaries/ro/stopwords.txt index 3e6e19ed..80195e69 100644 --- 
a/resources/dictionaries/ro/stopwords.txt +++ b/resources/dictionaries/ro/stopwords.txt @@ -1 +1,2 @@ +și|si|& cel \ No newline at end of file diff --git a/resources/dictionaries/ru/ambiguous_expansions.txt b/resources/dictionaries/ru/ambiguous_expansions.txt index 213e43c3..e38e90ed 100644 --- a/resources/dictionaries/ru/ambiguous_expansions.txt +++ b/resources/dictionaries/ru/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& д d г diff --git a/resources/dictionaries/ru/stopwords.txt b/resources/dictionaries/ru/stopwords.txt new file mode 100644 index 00000000..ae72c191 --- /dev/null +++ b/resources/dictionaries/ru/stopwords.txt @@ -0,0 +1 @@ +и|& \ No newline at end of file diff --git a/resources/dictionaries/sk/ambiguous_expansions.txt b/resources/dictionaries/sk/ambiguous_expansions.txt index 9eae731c..157fc657 100644 --- a/resources/dictionaries/sk/ambiguous_expansions.txt +++ b/resources/dictionaries/sk/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c j s diff --git a/resources/dictionaries/sk/stopwords.txt b/resources/dictionaries/sk/stopwords.txt index 54e62505..aff4505e 100644 --- a/resources/dictionaries/sk/stopwords.txt +++ b/resources/dictionaries/sk/stopwords.txt @@ -1,4 +1,4 @@ -a +a|& bližko|blizko cez do diff --git a/resources/dictionaries/sl/ambiguous_expansions.txt b/resources/dictionaries/sl/ambiguous_expansions.txt index 9eae731c..157fc657 100644 --- a/resources/dictionaries/sl/ambiguous_expansions.txt +++ b/resources/dictionaries/sl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c j s diff --git a/resources/dictionaries/sl/stopwords.txt b/resources/dictionaries/sl/stopwords.txt index 8c6ca434..70fb5771 100644 --- a/resources/dictionaries/sl/stopwords.txt +++ b/resources/dictionaries/sl/stopwords.txt @@ -1,3 +1,4 @@ +in|& na ob pot diff --git a/resources/dictionaries/sr/ambiguous_expansions.txt b/resources/dictionaries/sr/ambiguous_expansions.txt index aee698c9..1fcdb71b 100644 --- a/resources/dictionaries/sr/ambiguous_expansions.txt +++ 
b/resources/dictionaries/sr/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& и i ј diff --git a/resources/dictionaries/sr/stopwords.txt b/resources/dictionaries/sr/stopwords.txt new file mode 100644 index 00000000..b25c373b --- /dev/null +++ b/resources/dictionaries/sr/stopwords.txt @@ -0,0 +1,2 @@ +и|& +i|& \ No newline at end of file diff --git a/resources/dictionaries/sv/ambiguous_expansions.txt b/resources/dictionaries/sv/ambiguous_expansions.txt index fb91fa69..7224b6d3 100644 --- a/resources/dictionaries/sv/ambiguous_expansions.txt +++ b/resources/dictionaries/sv/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& g l k diff --git a/resources/dictionaries/sv/stopwords.txt b/resources/dictionaries/sv/stopwords.txt index 70ae014f..a1658555 100644 --- a/resources/dictionaries/sv/stopwords.txt +++ b/resources/dictionaries/sv/stopwords.txt @@ -16,6 +16,7 @@ intill mellan motliggande närmast|naermast +och|& över|oever på|paa på andra sidan|paa andra sidan diff --git a/resources/dictionaries/tr/ambiguous_expansions.txt b/resources/dictionaries/tr/ambiguous_expansions.txt index 601b3955..fd7bdf78 100644 --- a/resources/dictionaries/tr/ambiguous_expansions.txt +++ b/resources/dictionaries/tr/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& b d g diff --git a/resources/dictionaries/tr/stopwords.txt b/resources/dictionaries/tr/stopwords.txt new file mode 100644 index 00000000..f1f6e736 --- /dev/null +++ b/resources/dictionaries/tr/stopwords.txt @@ -0,0 +1 @@ +ve|& \ No newline at end of file diff --git a/resources/dictionaries/uk/ambiguous_expansions.txt b/resources/dictionaries/uk/ambiguous_expansions.txt index 994b54a6..4ef2c05e 100644 --- a/resources/dictionaries/uk/ambiguous_expansions.txt +++ b/resources/dictionaries/uk/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& д d ш diff --git a/resources/dictionaries/uk/stopwords.txt b/resources/dictionaries/uk/stopwords.txt new file mode 100644 index 00000000..0401c8a8 --- /dev/null +++ b/resources/dictionaries/uk/stopwords.txt @@ -0,0 +1,2 @@ 
+і|& +i|& \ No newline at end of file diff --git a/scripts/geodata/text/normalize.py b/scripts/geodata/text/normalize.py index 87df1227..70a70be0 100644 --- a/scripts/geodata/text/normalize.py +++ b/scripts/geodata/text/normalize.py @@ -2,7 +2,6 @@ import six from geodata.text import _normalize -from geodata.text.tokenize import tokenize_raw from geodata.text.token_types import token_types from geodata.encoding import safe_decode @@ -17,12 +16,7 @@ NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS NORMALIZE_STRING_SIMPLE_LATIN_ASCII = _normalize.NORMALIZE_STRING_SIMPLE_LATIN_ASCII -DEFAULT_STRING_OPTIONS = NORMALIZE_STRING_LATIN_ASCII | \ - NORMALIZE_STRING_DECOMPOSE | \ - NORMALIZE_STRING_TRIM | \ - NORMALIZE_STRING_REPLACE_HYPHENS | \ - NORMALIZE_STRING_STRIP_ACCENTS | \ - NORMALIZE_STRING_LOWERCASE +DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_DEFAULT_STRING_OPTIONS # Token options NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS @@ -34,16 +28,10 @@ NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHE NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS -DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \ - NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \ - NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \ - NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \ - NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE +DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS -TOKEN_OPTIONS_DROP_PERIODS = NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \ - NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS - -DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) +TOKEN_OPTIONS_DROP_PERIODS = _normalize.NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS +DEFAULT_TOKEN_OPTIONS_NUMERIC = 
_normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC def remove_parens(tokens): @@ -62,33 +50,7 @@ def remove_parens(tokens): def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS): s = safe_decode(s) - if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII: - normalized = _normalize.normalize_string_latin(s, string_options) - else: - normalized = _normalize.normalize_string_utf8(s, string_options) - - return normalized - - -def normalize_token(s, t, token_options=DEFAULT_TOKEN_OPTIONS): - return _normalize.normalize_token(s, t, token_options) - - -def normalize_tokens_whitespace(s, raw_tokens, token_options=DEFAULT_TOKEN_OPTIONS): - last_end = 0 - tokens = [] - - for t in raw_tokens: - t_norm = _normalize.normalize_token(s, t, token_options) - t_class = token_types.from_id(t[-1]) - - if last_end < t[0]: - tokens.append((six.u(' '), token_types.WHITESPACE)) - last_end = sum(t[:2]) - - tokens.append((t_norm, t_class)) - - return tokens + return _normalize.normalize_string(s, string_options) def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, @@ -105,20 +67,10 @@ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, Usage: normalized_tokens(u'St.-Barthélemy') ''' - normalized = normalize_string(s, string_options=string_options) - - # Tuples of (offset, len, type) - raw_tokens = tokenize_raw(normalized) - tokens = [] - last_end = 0 - - if not whitespace: - tokens = [(_normalize.normalize_token(normalized, t, token_options), - token_types.from_id(t[-1])) for t in raw_tokens] - else: - tokens = normalize_tokens_whitespace(normalized, raw_tokens, token_options=token_options) + s = safe_decode(s) + normalized_tokens = _normalize.normalized_tokens(s, string_options, token_options, whitespace) if strip_parentheticals: - return remove_parens(tokens) - else: - return tokens + normalized_tokens = remove_parens(normalized_tokens) + + return [(s, token_types.from_id(token_type)) for s, token_type in normalized_tokens] diff --git 
a/scripts/geodata/text/pynormalize.c b/scripts/geodata/text/pynormalize.c index 12f3735b..1ce2df7e 100644 --- a/scripts/geodata/text/pynormalize.c +++ b/scripts/geodata/text/pynormalize.c @@ -1,7 +1,6 @@ #include -#include "src/normalize.h" -#include "src/transliterate.h" +#include #if PY_MAJOR_VERSION >= 3 #define IS_PY3K @@ -19,9 +18,7 @@ struct module_state { static struct module_state _state; #endif - - -static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) +static PyObject *py_normalize_string(PyObject *self, PyObject *args) { PyObject *arg1; uint64_t options; @@ -48,7 +45,7 @@ static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) if (str == NULL) { PyErr_SetString(PyExc_TypeError, "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; + goto exit_normalize_decref_unistr; } char *input = PyBytes_AsString(str); @@ -56,13 +53,13 @@ static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) #endif if (input == NULL) { - goto exit_decref_str; + goto exit_normalize_decref_str; } - char *normalized = normalize_string_utf8(input, options); + char *normalized = libpostal_normalize_string(input, options); if (normalized == NULL) { - goto exit_decref_str; + goto exit_normalize_decref_str; } PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict"); @@ -70,7 +67,7 @@ static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) if (result == NULL) { PyErr_SetString(PyExc_ValueError, "Result could not be utf-8 decoded"); - goto exit_decref_str; + goto exit_normalize_decref_str; } #ifndef IS_PY3K @@ -80,21 +77,26 @@ static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) return result; -exit_decref_str: +exit_normalize_decref_str: #ifndef IS_PY3K Py_XDECREF(str); #endif -exit_decref_unistr: +exit_normalize_decref_unistr: Py_XDECREF(unistr); return 0; } -static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args) +static 
PyObject *py_normalized_tokens(PyObject *self, PyObject *args) { PyObject *arg1; - uint64_t options; - if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) { + uint64_t string_options = LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS; + uint64_t token_options = LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS; + uint32_t arg_whitespace = 0; + + PyObject *result = NULL; + + if (!PyArg_ParseTuple(args, "O|KKI:normalize", &arg1, &string_options, &token_options, &arg_whitespace)) { return 0; } @@ -117,7 +119,7 @@ static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args) if (str == NULL) { PyErr_SetString(PyExc_TypeError, "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; + goto exit_normalized_tokens_decref_str; } char *input = PyBytes_AsString(str); @@ -125,98 +127,46 @@ static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args) #endif if (input == NULL) { - goto exit_decref_str; + goto exit_normalized_tokens_decref_str; } - char *normalized = normalize_string_latin(input, strlen(input), options); + bool whitespace = arg_whitespace; - PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict"); - free(normalized); - if (result == NULL) { - PyErr_SetString(PyExc_ValueError, - "Result could not be utf-8 decoded"); - goto exit_decref_str; + size_t num_tokens; + libpostal_normalized_token_t *normalized_tokens = libpostal_normalized_tokens(input, string_options, token_options, whitespace, &num_tokens); + + if (normalized_tokens == NULL) { + goto exit_normalized_tokens_decref_str; } - #ifndef IS_PY3K - Py_XDECREF(str); - #endif - Py_XDECREF(unistr); - - return result; - -exit_decref_str: -#ifndef IS_PY3K - Py_XDECREF(str); -#endif -exit_decref_unistr: - Py_XDECREF(unistr); - return 0; -} - - - -static PyObject *py_normalize_token(PyObject *self, PyObject *args) -{ - PyObject *s; - - uint32_t offset; - uint32_t len; - uint16_t type; - - uint64_t options; - if (!PyArg_ParseTuple(args, 
"O(IIH)K:normalize", &s, &offset, &len, &type, &options)) { - PyErr_SetString(PyExc_TypeError, - "Error parsing arguments"); - return 0; + result = PyList_New((Py_ssize_t)num_tokens); + if (!result) { + goto exit_free_normalized_tokens; } - token_t token = (token_t){(size_t)offset, (size_t)len, type}; - - PyObject *unistr = PyUnicode_FromObject(s); - if (unistr == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be converted to unicode in scanner"); - return 0; - } - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - char *input = PyUnicode_AsUTF8(unistr); - - #else - // Python 2 encoding - - PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); - if (str == NULL) { - PyErr_SetString(PyExc_ValueError, - "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; + for (size_t i = 0; i < num_tokens; i++) { + libpostal_normalized_token_t normalized_token = normalized_tokens[i]; + char *token_str = normalized_token.str; + PyObject *py_token = PyUnicode_DecodeUTF8((const char *)token_str, strlen(token_str), "strict"); + if (py_token == NULL) { + Py_DECREF(result); + goto exit_free_normalized_tokens; } - char *input = PyBytes_AsString(str); + PyObject *t = PyTuple_New(2); + PyObject *py_token_type = PyInt_FromLong(normalized_token.token.type); - #endif + PyTuple_SetItem(t, 0, py_token); + PyTuple_SetItem(t, 1, py_token_type); - if (input == NULL) { - goto exit_decref_str; + // Note: PyList_SetItem steals a reference, so don't worry about DECREF + PyList_SetItem(result, (Py_ssize_t)i, t); } - char_array *token_buffer = char_array_new_size(token.len); - - add_normalized_token(token_buffer, input, token, options); - char *token_str = char_array_get_string(token_buffer); - PyObject *result = PyUnicode_DecodeUTF8((const char *)token_str, token_buffer->n - 1, "strict"); - - if (result == NULL) { - PyErr_SetString(PyExc_ValueError, - "Error decoding token"); - char_array_destroy(token_buffer); - goto exit_decref_str; 
+ for (size_t i = 0; i < num_tokens; i++) { + free(normalized_tokens[i].str); } - - char_array_destroy(token_buffer); + free(normalized_tokens); #ifndef IS_PY3K Py_XDECREF(str); @@ -224,20 +174,24 @@ static PyObject *py_normalize_token(PyObject *self, PyObject *args) Py_XDECREF(unistr); return result; - -exit_decref_str: +exit_free_normalized_tokens: + for (size_t i = 0; i < num_tokens; i++) { + free(normalized_tokens[i].str); + } + free(normalized_tokens); +exit_normalized_tokens_decref_str: #ifndef IS_PY3K Py_XDECREF(str); #endif -exit_decref_unistr: +exit_normalized_tokens_decref_unistr: Py_XDECREF(unistr); return 0; } + static PyMethodDef normalize_methods[] = { - {"normalize_string_utf8", (PyCFunction)py_normalize_string_utf8, METH_VARARGS, "normalize_string_utf8(input, options)"}, - {"normalize_string_latin", (PyCFunction)py_normalize_string_latin, METH_VARARGS, "normalize_string_latin(input, options)"}, - {"normalize_token", (PyCFunction)py_normalize_token, METH_VARARGS, "normalize_token(input, options)"}, + {"normalize_string", (PyCFunction)py_normalize_string, METH_VARARGS, "normalize_string(input, options)"}, + {"normalized_tokens", (PyCFunction)py_normalized_tokens, METH_VARARGS, "normalize_token(input, string_options, token_options, whitespace)"}, {NULL, NULL}, }; @@ -295,32 +249,40 @@ init_normalize(void) { INITERROR; } - if (!transliteration_module_setup(NULL)) { + if (!libpostal_setup()) { PyErr_SetString(PyExc_RuntimeError, - "Could not load transliterate module"); + "Could not load libpostal"); Py_DECREF(module); INITERROR; } - PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LATIN_ASCII)); - PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRANSLITERATE)); - PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_STRIP_ACCENTS)); - PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", 
PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE)); - PyModule_AddObject(module, "NORMALIZE_STRING_COMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_COMPOSE)); - PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE)); - PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM)); - PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS)); - PyModule_AddObject(module, "NORMALIZE_STRING_SIMPLE_LATIN_ASCII", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_SIMPLE_LATIN_ASCII)); + PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII)); + PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE)); + PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS)); + PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE)); + PyModule_AddObject(module, "NORMALIZE_STRING_COMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_COMPOSE)); + PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LOWERCASE)); + PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRIM)); + PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS)); + PyModule_AddObject(module, "NORMALIZE_STRING_SIMPLE_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS)); - PyModule_AddObject(module, 
"NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_HYPHENS)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_FINAL_PERIOD)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_DIGITS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)); + PyModule_AddObject(module, 
"NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS)); + + + PyModule_AddObject(module, "NORMALIZE_DEFAULT_STRING_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS)); + PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS)); + + PyModule_AddObject(module, "NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS)); + + PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC)); #if PY_MAJOR_VERSION >= 3 diff --git a/scripts/geodata/text/pytokenize.c b/scripts/geodata/text/pytokenize.c index 7986bae3..a69a86ea 100644 --- a/scripts/geodata/text/pytokenize.c +++ b/scripts/geodata/text/pytokenize.c @@ -1,6 +1,6 @@ #include -#include "src/scanner.h" +#include #if PY_MAJOR_VERSION >= 3 #define IS_PY3K @@ -18,14 +18,17 @@ struct module_state { static struct module_state _state; #endif - static PyObject *py_tokenize(PyObject *self, PyObject *args) { PyObject *arg1; - if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) { + uint32_t arg_whitespace = 0; + + if (!PyArg_ParseTuple(args, "OI:tokenize", &arg1, &arg_whitespace)) { return 0; } + bool whitespace = arg_whitespace; + PyObject *unistr = PyUnicode_FromObject(arg1); if (unistr == NULL) { PyErr_SetString(PyExc_TypeError, @@ -57,26 +60,28 @@ static PyObject *py_tokenize(PyObject *self, PyObject *args) goto error_decref_str; } - token_array *tokens = tokenize(input); + size_t num_tokens; + + libpostal_token_t *tokens = libpostal_tokenize(input, whitespace, &num_tokens); if (tokens == NULL) { goto error_decref_str; } - PyObject *result = PyTuple_New(tokens->n); + PyObject *result = PyTuple_New(num_tokens); if (!result) { - token_array_destroy(tokens); + free(tokens); goto error_decref_str; return 0; } PyObject *tuple; - 
token_t token; - for (size_t i = 0; i < tokens->n; i++) { - token = tokens->a[i]; + libpostal_token_t token; + for (size_t i = 0; i < num_tokens; i++) { + token = tokens[i]; tuple = Py_BuildValue("III", token.offset, token.len, token.type); if (PyTuple_SetItem(result, i, tuple) < 0) { - token_array_destroy(tokens); + free(tokens); goto error_decref_str; } } @@ -86,7 +91,7 @@ static PyObject *py_tokenize(PyObject *self, PyObject *args) #endif Py_XDECREF(unistr); - token_array_destroy(tokens); + free(tokens); return result; @@ -100,12 +105,10 @@ error_decref_unistr: } static PyMethodDef tokenize_methods[] = { - {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text)"}, + {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text, whitespace)"}, {NULL, NULL}, }; - - #ifdef IS_PY3K static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) { diff --git a/scripts/geodata/text/tokenize.py b/scripts/geodata/text/tokenize.py index d3d18832..a05022bc 100644 --- a/scripts/geodata/text/tokenize.py +++ b/scripts/geodata/text/tokenize.py @@ -3,12 +3,9 @@ from geodata.text import _tokenize from geodata.text.token_types import token_types -def tokenize_raw(s): - return _tokenize.tokenize(safe_decode(s)) - -def tokenize(s): +def tokenize(s, whitespace=False): u = safe_decode(s) s = safe_encode(s) return [(safe_decode(s[start:start + length]), token_types.from_id(token_type)) - for start, length, token_type in _tokenize.tokenize(u)] + for start, length, token_type in _tokenize.tokenize(u, whitespace)] diff --git a/scripts/setup.py b/scripts/setup.py index a25b6b26..6bbf8891 100644 --- a/scripts/setup.py +++ b/scripts/setup.py @@ -2,9 +2,7 @@ import os from setuptools import setup, Extension, find_packages -this_dir = os.path.dirname(__file__) -PROJECT_DIR = os.path.join(this_dir, os.pardir) -SRC_DIR = os.path.join(PROJECT_DIR, 'src') +RESOURCES_DIR = 'resources' def main(): @@ -14,35 +12,29 @@ def main(): packages=find_packages(), ext_modules=[ 
Extension('geodata.text._tokenize', - sources=[os.path.join(SRC_DIR, f) - for f in ('scanner.c', - 'string_utils.c', - 'tokens.c', - 'utf8proc/utf8proc.c', - ) - ] + ['geodata/text/pytokenize.c'], - include_dirs=[PROJECT_DIR], - extra_compile_args=['-O0', '-std=gnu99', + sources=['geodata/text/pytokenize.c'], + libraries=['postal'], + include_dirs=['/usr/local/include'], + library_dirs=['/usr/local/lib'], + extra_compile_args=['-std=c99', '-Wno-unused-function'], ), Extension('geodata.text._normalize', - sources=[os.path.join(SRC_DIR, f) - for f in ('normalize.c', - 'string_utils.c', - 'utf8proc/utf8proc.c', - 'tokens.c', - 'unicode_scripts.c', - 'transliterate.c', - 'file_utils.c', - 'trie.c', - 'trie_search.c',) - ] + ['geodata/text/pynormalize.c'], - include_dirs=[PROJECT_DIR], - extra_compile_args=['-std=gnu99', '-DHAVE_CONFIG_H', - '-DLIBPOSTAL_DATA_DIR="{}"'.format(os.getenv('LIBPOSTAL_DATA_DIR', os.path.realpath(os.path.join(PROJECT_DIR, 'data')))), + sources=['geodata/text/pynormalize.c'], + libraries=['postal'], + include_dirs=['/usr/local/include'], + library_dirs=['/usr/local/lib'], + extra_compile_args=['-std=c99', '-Wno-unused-function'], ), ], + data_files=[ + (os.path.join(RESOURCES_DIR, os.path.relpath(d, RESOURCES_DIR)), [os.path.join(d, filename) for filename in filenames]) + for d, _, filenames in os.walk(RESOURCES_DIR) + ], + package_data={ + 'geodata': ['**/*.sh'] + }, include_package_data=True, zip_safe=False, url='http://mapzen.com', diff --git a/src/Makefile.am b/src/Makefile.am index 6a13fce6..9b5f4887 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -12,7 +12,7 @@ DEFAULT_INCLUDES = -I.. 
-I/usr/local/include CFLAGS = lib_LTLIBRARIES = libpostal.la -libpostal_la_SOURCES = strndup.c libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c +libpostal_la_SOURCES = strndup.c libpostal.c expand.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c place.c near_dupe.c double_metaphone.c geohash/geohash.c dedupe.c string_similarity.c acronyms.c soft_tfidf.c jaccard.c libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS) libpostal_la_CFLAGS = $(CFLAGS_O2) -D LIBPOSTAL_EXPORTS libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined @@ -26,7 +26,7 @@ noinst_LTLIBRARIES = libscanner.la libscanner_la_SOURCES = klib/drand48.c scanner.c libscanner_la_CFLAGS = $(CFLAGS_O0) -D LIBPOSTAL_EXPORTS $(CFLAGS_SCANNER_EXTRA) -noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test +noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test 
language_classifier_train language_classifier language_classifier_test near_dupe_test libpostal_SOURCES = strndup.c main.c json_encode.c file_utils.c string_utils.c utf8proc/utf8proc.c libpostal_LDADD = libpostal.la @@ -38,6 +38,11 @@ address_parser_SOURCES = strndup.c address_parser_cli.c json_encode.c linenoise/ address_parser_LDADD = libpostal.la $(CBLAS_LIBS) address_parser_CFLAGS = $(CFLAGS_O3) +near_dupe_test_SOURCES = strndup.c near_dupe_test.c string_utils.c utf8proc/utf8proc.c +near_dupe_test_LDADD = libpostal.la +near_dupe_test_CFLAGS = $(CFLAGS_O3) + + build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c build_address_dictionary_CFLAGS = $(CFLAGS_O3) build_numex_table_SOURCES = strndup.c numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c diff --git a/src/acronyms.c b/src/acronyms.c new file mode 100644 index 00000000..425b64f2 --- /dev/null +++ b/src/acronyms.c @@ -0,0 +1,150 @@ +#include "acronyms.h" + +static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_t num_languages, char **languages) { + size_t len = tokens->n; + uint32_array *stopwords_array = uint32_array_new_zeros(len); + + uint32_t *stopwords = stopwords_array->a; + + for (size_t l = 0; l < num_languages; l++) { + char *lang = languages[l]; + phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, tokens, lang); + + if (lang_phrases != NULL) { + size_t num_lang_phrases = lang_phrases->n; + for (size_t p = 0; p < num_lang_phrases; p++) { + phrase_t phrase = lang_phrases->a[p]; + + if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) { + for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) { + stopwords[stop_idx] = 1; + } + } + } + phrase_array_destroy(lang_phrases); + } + } + + return stopwords_array; +} + +phrase_array 
*acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) { + if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) { + return NULL; + } + + size_t len1 = tokens1->n; + size_t len2 = tokens2->n; + if (len1 == 0 || len2 == 0 || len1 == len2) return NULL; + + if (len1 > len2) { + const char *tmp_s = s1; + s1 = s2; + s2 = tmp_s; + + token_array *tmp_t = tokens1; + tokens1 = tokens2; + tokens2 = tmp_t; + + size_t tmp_l = len1; + len1 = len2; + len2 = tmp_l; + } + + phrase_array *alignments = NULL; + + token_t *t1 = tokens1->a; + token_t *t2 = tokens2->a; + + uint32_array *stopwords_array = stopword_tokens(s2, tokens2, num_languages, languages); + if (stopwords_array == NULL) { + return NULL; + } + + uint32_t *stopwords = stopwords_array->a; + + ssize_t acronym_start = -1; + ssize_t acronym_token_pos = -1; + + uint8_t *ptr1 = (uint8_t *)s1; + uint8_t *ptr2 = (uint8_t *)s2; + + int32_t c1, c2; + ssize_t c1_len; + ssize_t c2_len; + + size_t t2_consumed = 0; + + for (size_t i = 0; i < len1; i++) { + token_t ti = t1[i]; + + c1_len = utf8proc_iterate(ptr1 + ti.offset, ti.len, &c1); + if (c1_len <= 0 || c1 == 0) { + break; + } + + // Make sure it's a non-ideographic word. 
Single letter abbreviations will be captured by other methods + if (!is_word_token(ti.type) || is_ideographic(ti.type) || ti.len == c1_len) { + acronym_token_pos = -1; + continue; + } + + size_t ti_pos = 0; + + for (size_t j = t2_consumed; j < len2; j++) { + token_t tj = t2[j]; + c2_len = utf8proc_iterate(ptr2 + tj.offset, tj.len, &c2); + if (c2_len <= 0) { + break; + } + + if (utf8proc_tolower(c1) == utf8proc_tolower(c2)) { + ti_pos += c1_len; + if (acronym_start < 0) { + acronym_start = j; + acronym_token_pos = 0; + } + acronym_token_pos++; + c1_len = utf8proc_iterate(ptr1 + ti.offset + ti_pos, ti.len, &c1); + } else if (stopwords[j] && acronym_token_pos > 0) { + continue; + } else if (is_punctuation(tj.type) && acronym_token_pos > 0) { + continue; + } else if (ti_pos < ti.len) { + acronym_token_pos = -1; + acronym_start = -1; + ti_pos = 0; + continue; + } + + if ((utf8_is_period(c1) || utf8_is_hyphen(c1)) && ti_pos < ti.len) { + ti_pos += c1_len; + if (ti_pos < ti.len) { + c1_len = utf8proc_iterate(ptr1 + ti.offset + ti_pos, ti.len, &c1); + if (c1_len <= 0 || c1 == 0) { + break; + } + } + } + + if (ti_pos == ti.len) { + phrase_t phrase = (phrase_t){acronym_start, j - acronym_start + 1, i}; + // got alignment + if (alignments == NULL) { + alignments = phrase_array_new(); + } + + phrase_array_push(alignments, phrase); + + ti_pos = 0; + acronym_token_pos = -1; + acronym_start = -1; + } + } + + } + + uint32_array_destroy(stopwords_array); + + return alignments; +} diff --git a/src/acronyms.h b/src/acronyms.h new file mode 100644 index 00000000..5c61002e --- /dev/null +++ b/src/acronyms.h @@ -0,0 +1,15 @@ +#ifndef ACRONYMS_H +#define ACRONYMS_H + +#include +#include + +#include "address_dictionary.h" +#include "collections.h" +#include "tokens.h" +#include "token_types.h" + +phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages); + + +#endif \ No newline at end of file 
diff --git a/src/address_dictionary.c b/src/address_dictionary.c index ed1116f6..9a1b328f 100644 --- a/src/address_dictionary.c +++ b/src/address_dictionary.c @@ -1,5 +1,6 @@ #include #include +#include #include "address_dictionary.h" @@ -35,6 +36,38 @@ inline bool address_expansion_in_dictionary(address_expansion_t expansion, uint1 } +bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id) { + address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + address_expansion_t *expansions_array = expansions->a; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions_array[i]; + if (address_expansion_in_dictionary(expansion, dictionary_id)) { + return true; + } + } + return false; +} + + +bool address_phrase_in_dictionaries(phrase_t phrase, size_t n, ...) { + va_list args; + va_start(args, n); + bool in_dictionary = false; + for (size_t i = 0; i < n; i++) { + uint16_t dictionary_id = va_arg(args, uint16_t); + in_dictionary = address_phrase_in_dictionary(phrase, dictionary_id); + if (in_dictionary) break; + } + va_end(args); + return in_dictionary; +} + int32_t address_dictionary_next_canonical_index(void) { if (address_dict == NULL || address_dict->canonical == NULL) { @@ -63,6 +96,32 @@ char *address_dictionary_get_canonical(uint32_t index) { return cstring_array_get_string(address_dict->canonical, index); } +inline bool address_expansions_have_canonical_interpretation(address_expansion_array *expansions) { + if (expansions == NULL) return false; + + address_expansion_t *expansions_array = expansions->a; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions_array[i]; + if (expansion.canonical_index == NULL_CANONICAL_INDEX) { + return true; + } + } + return false; + +} + +inline bool 
address_phrase_has_canonical_interpretation(phrase_t phrase) { + address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + + return address_expansions_have_canonical_interpretation(expansions); +} + + + address_expansion_value_t *address_expansion_value_new(void) { address_expansion_value_t *self = malloc(sizeof(address_expansion_value_t)); @@ -251,6 +310,31 @@ phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, return phrases; } + +phrase_t search_address_dictionaries_substring(char *str, size_t len, char *lang) { + if (str == NULL) return NULL_PHRASE; + if (address_dict == NULL) { + log_error(ADDRESS_DICTIONARY_SETUP_ERROR); + return NULL_PHRASE; + } + + trie_prefix_result_t prefix = get_language_prefix(lang); + + if (prefix.node_id == NULL_NODE_ID) { + log_debug("prefix.node_id == NULL_NODE_ID\n"); + return NULL_PHRASE; + } + + phrase_t phrase = trie_search_prefixes_from_index(address_dict->trie, str, len, prefix.node_id); + if (phrase.len == len) { + return phrase; + } else { + return NULL_PHRASE; + } + +} + + phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang) { if (str == NULL) return NULL_PHRASE; if (address_dict == NULL) { diff --git a/src/address_dictionary.h b/src/address_dictionary.h index cc5e8748..bb000fb2 100644 --- a/src/address_dictionary.h +++ b/src/address_dictionary.h @@ -63,15 +63,20 @@ bool search_address_dictionaries_with_phrases(char *str, char *lang, phrase_arra phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, char *lang); bool search_address_dictionaries_tokens_with_phrases(char *str, token_array *tokens, char *lang, phrase_array **phrases); +phrase_t search_address_dictionaries_substring(char *str, size_t len, char *lang); phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang); phrase_t 
search_address_dictionaries_suffix(char *str, size_t len, char *lang); address_expansion_value_t *address_dictionary_get_expansions(uint32_t i); bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id); +bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id); +bool address_phrase_in_dictionaries(phrase_t phrase, size_t n, ...); char *address_dictionary_get_canonical(uint32_t index); int32_t address_dictionary_next_canonical_index(void); bool address_dictionary_add_canonical(char *canonical); bool address_dictionary_add_expansion(char *key, char *language, address_expansion_t expansion); +bool address_expansions_have_canonical_interpretation(address_expansion_array *expansions); +bool address_phrase_has_canonical_interpretation(phrase_t phrase); void address_dictionary_destroy(address_dictionary_t *self); diff --git a/src/address_parser.h b/src/address_parser.h index 4c5e699f..b059a246 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -105,7 +105,14 @@ typedef enum { #define ADDRESS_PARSER_LABEL_HOUSE "house" #define ADDRESS_PARSER_LABEL_HOUSE_NUMBER "house_number" +#define ADDRESS_PARSER_LABEL_PO_BOX "po_box" +#define ADDRESS_PARSER_LABEL_BUILDING "building" +#define ADDRESS_PARSER_LABEL_ENTRANCE "entrance" +#define ADDRESS_PARSER_LABEL_STAIRCASE "staircase" +#define ADDRESS_PARSER_LABEL_LEVEL "level" +#define ADDRESS_PARSER_LABEL_UNIT "unit" #define ADDRESS_PARSER_LABEL_ROAD "road" +#define ADDRESS_PARSER_LABEL_METRO_STATION "metro_station" #define ADDRESS_PARSER_LABEL_SUBURB "suburb" #define ADDRESS_PARSER_LABEL_CITY_DISTRICT "city_district" #define ADDRESS_PARSER_LABEL_CITY "city" @@ -117,6 +124,8 @@ typedef enum { #define ADDRESS_PARSER_LABEL_COUNTRY "country" #define ADDRESS_PARSER_LABEL_WORLD_REGION "world_region" +#define ADDRESS_PARSER_LABEL_WEBSITE "website" +#define ADDRESS_PARSER_LABEL_TELEPHONE "phone" typedef union address_parser_types { uint32_t value; diff --git a/src/dedupe.c 
b/src/dedupe.c new file mode 100644 index 00000000..30fbe2dd --- /dev/null +++ b/src/dedupe.c @@ -0,0 +1,400 @@ +#include "acronyms.h" +#include "address_parser.h" +#include "dedupe.h" +#include "expand.h" +#include "float_utils.h" +#include "jaccard.h" +#include "place.h" +#include "scanner.h" +#include "soft_tfidf.h" +#include "token_types.h" + +bool expansions_intersect(cstring_array *expansions1, cstring_array *expansions2) { + size_t n1 = cstring_array_num_strings(expansions1); + size_t n2 = cstring_array_num_strings(expansions2); + + bool intersect = false; + + for (size_t i = 0; i < n1; i++) { + char *e1 = cstring_array_get_string(expansions1, i); + for (size_t j = 0; j < n2; j++) { + char *e2 = cstring_array_get_string(expansions2, j); + if (string_equals(e1, e2)) { + intersect = true; + break; + } + } + if (intersect) break; + } + return intersect; +} + + +bool address_component_equals_root_option(char *s1, char *s2, libpostal_normalize_options_t options, bool root) { + uint64_t normalize_string_options = get_normalize_string_options(options); + + size_t n1, n2; + cstring_array *expansions1 = NULL; + cstring_array *expansions2 = NULL; + if (!root) { + expansions1 = expand_address(s1, options, &n1); + } else { + expansions1 = expand_address_root(s1, options, &n1); + } + + if (expansions1 == NULL) return false; + + if (!root) { + expansions2 = expand_address(s2, options, &n2); + } else { + expansions2 = expand_address_root(s2, options, &n2); + } + + if (expansions2 == NULL) { + cstring_array_destroy(expansions1); + return false; + } + + bool intersect = expansions_intersect(expansions1, expansions2); + + cstring_array_destroy(expansions1); + cstring_array_destroy(expansions2); + + return intersect; +} + +static inline bool address_component_equals(char *s1, char *s2, libpostal_normalize_options_t options) { + return address_component_equals_root_option(s1, s2, options, false); +} + +static inline bool address_component_equals_root(char *s1, char *s2, 
libpostal_normalize_options_t options) { + return address_component_equals_root_option(s1, s2, options, true); +} + + +static inline bool address_component_equals_root_fallback(char *s1, char *s2, libpostal_normalize_options_t options, bool root) { + return address_component_equals_root(s1, s2, options) || address_component_equals(s1, s2, options); +} + +libpostal_duplicate_status_t is_duplicate(char *value1, char *value2, libpostal_normalize_options_t normalize_options, libpostal_duplicate_options_t options, bool root_comparison_first, libpostal_duplicate_status_t root_comparison_status) { + if (value1 == NULL || value2 == NULL) { + return LIBPOSTAL_NULL_DUPLICATE_STATUS; + } + + normalize_options.num_languages = options.num_languages; + normalize_options.languages = options.languages; + + if (root_comparison_first) { + if (address_component_equals_root(value1, value2, normalize_options)) { + return root_comparison_status; + } else if (address_component_equals(value1, value2, normalize_options)) { + return LIBPOSTAL_EXACT_DUPLICATE; + } + } else { + if (address_component_equals(value1, value2, normalize_options)) { + return LIBPOSTAL_EXACT_DUPLICATE; + } else if (address_component_equals_root(value1, value2, normalize_options)) { + return root_comparison_status; + } + } + return LIBPOSTAL_NON_DUPLICATE; +} + +libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY; + bool root_comparison_first = false; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} +libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + 
libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; + bool root_comparison_first = false; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, 
root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_ANY; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_ANY; + + place_t *place1 = place_from_components(num_components1, labels1, values1); + place_t *place2 = place_from_components(num_components2, labels2, values2); + + bool city_match = false; + libpostal_duplicate_status_t dupe_status = LIBPOSTAL_NON_DUPLICATE; + + if (place1->city != NULL && place2->city != NULL) { + city_match = address_component_equals(place1->city, place2->city, normalize_options); + if (city_match) { + dupe_status = LIBPOSTAL_EXACT_DUPLICATE; + } + } + + if 
(!city_match && place1->city == NULL && place1->city_district != NULL && place2->city != NULL) { + city_match = address_component_equals(place1->city_district, place2->city, normalize_options); + if (city_match) { + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } + } + + if (!city_match && place1->city == NULL && place1->suburb != NULL && place2->city != NULL) { + city_match = address_component_equals(place1->suburb, place2->city, normalize_options); + if (city_match) { + dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + } + } + + if (!city_match && place2->city == NULL && place2->city_district != NULL && place1->city != NULL) { + city_match = address_component_equals(place1->city, place2->city_district, normalize_options); + if (city_match) { + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } + } + + if (!city_match && place2->city == NULL && place2->suburb != NULL && place1->city != NULL) { + city_match = address_component_equals(place1->suburb, place2->suburb, normalize_options); + if (city_match) { + dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + } + } + + if (!city_match) { + goto exit_destroy_places; + } + + if (city_match && place1->state_district != NULL && place2->state_district != NULL && !address_component_equals_root(place1->state_district, place2->state_district, normalize_options)) { + dupe_status = LIBPOSTAL_NON_DUPLICATE; + goto exit_destroy_places; + } + + if (city_match && place1->state != NULL && place2->state != NULL && !address_component_equals(place1->state, place2->state, normalize_options)) { + dupe_status = LIBPOSTAL_NON_DUPLICATE; + goto exit_destroy_places; + } + + if (city_match && place1->country != NULL && place2->country != NULL && !address_component_equals(place1->country, place2->country, normalize_options)) { + dupe_status = LIBPOSTAL_NON_DUPLICATE; + goto exit_destroy_places; + } + +exit_destroy_places: + place_destroy(place1); + place_destroy(place2); + return dupe_status; + +} + +char 
*joined_string_and_tokens_from_strings(char **strings, size_t num_strings, token_array *tokens) { + if (tokens == NULL || strings == NULL || num_strings == 0) return NULL; + token_array_clear(tokens); + + size_t full_len = 0; + for (size_t i = 0; i < num_strings; i++) { + full_len += strlen(strings[i]); + if (i < num_strings - 1) full_len++; + } + + char_array *a = char_array_new_size(full_len); + for (size_t i = 0; i < num_strings; i++) { + char *str = strings[i]; + size_t len = strlen(str); + size_t offset = a->n; + char_array_append(a, str); + + scanner_t scanner = scanner_from_string(str, len); + uint16_t token_type = scan_token(&scanner); + + token_t token = (token_t){offset, len, token_type}; + token_array_push(tokens, token); + if (i < num_strings - 1 && !is_ideographic(token.type)) { + char_array_append(a, " "); + } + } + + char_array_terminate(a); + return char_array_to_string(a); +} + +bool have_ideographic_word_tokens(token_array *token_array) { + if (token_array == NULL) return false; + + size_t n = token_array->n; + token_t *tokens = token_array->a; + for (size_t i = 0; i < n; i++) { + token_t token = tokens[i]; + if (is_ideographic(token.type) && is_word_token(token.type)) { + return true; + } + } + return false; +} + +libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) { + normalize_options.num_languages = options.num_languages; + normalize_options.languages = options.languages; + + normalize_options.address_components |= LIBPOSTAL_ADDRESS_ANY; + + double max_sim = 0.0; + + // Default is non-duplicate; + libpostal_duplicate_status_t dupe_status = LIBPOSTAL_NON_DUPLICATE; + + token_array *token_array1 = token_array_new_size(num_tokens1); + char *joined1 = 
joined_string_and_tokens_from_strings(tokens1, num_tokens1, token_array1); + + token_array *token_array2 = token_array_new_size(num_tokens2); + char *joined2 = joined_string_and_tokens_from_strings(tokens2, num_tokens2, token_array2); + + size_t num_languages = options.num_languages; + char **languages = options.languages; + + phrase_array *acronym_alignments = NULL; + + phrase_array *phrases1 = NULL; + phrase_array *phrases2 = NULL; + + bool is_ideographic = have_ideographic_word_tokens(token_array1) && have_ideographic_word_tokens(token_array2); + + if (!is_ideographic) { + if (do_acronyms) { + acronym_alignments = acronym_token_alignments(joined1, token_array1, joined2, token_array2, num_languages, languages); + } + + if (num_languages > 0) { + phrases1 = phrase_array_new(); + phrases2 = phrase_array_new(); + + for (size_t i = 0; i < num_languages; i++) { + char *lang = languages[i]; + phrase_array_clear(phrases1); + phrase_array_clear(phrases2); + + search_address_dictionaries_tokens_with_phrases(joined1, token_array1, lang, &phrases1); + search_address_dictionaries_tokens_with_phrases(joined2, token_array2, lang, &phrases2); + + double sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, soft_tfidf_options); + if (sim > max_sim) { + max_sim = sim; + } + } + } else if (do_acronyms) { + max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, soft_tfidf_options); + } else { + max_sim = soft_tfidf_similarity(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, soft_tfidf_options); + } + } else { + max_sim = jaccard_similarity_string_arrays(num_tokens1, tokens1, num_tokens2, tokens2); + if (string_equals(joined1, joined2)) { + dupe_status = LIBPOSTAL_EXACT_DUPLICATE; + } else if (address_component_equals_root(joined1, 
joined2, normalize_options)) { + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } + } + + if (dupe_status == LIBPOSTAL_NON_DUPLICATE) { + if (max_sim > options.likely_dupe_threshold || double_equals(max_sim, options.likely_dupe_threshold)) { + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } else if (max_sim > options.needs_review_threshold || double_equals(max_sim, options.needs_review_threshold)) { + dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + } + } + + if (phrases1 != NULL) { + phrase_array_destroy(phrases1); + } + + if (phrases2 != NULL) { + phrase_array_destroy(phrases2); + } + + if (acronym_alignments != NULL) { + phrase_array_destroy(acronym_alignments); + } + + if (token_array1 != NULL) { + token_array_destroy(token_array1); + } + + if (joined1 != NULL) { + free(joined1); + } + + if (token_array2 != NULL) { + token_array_destroy(token_array2); + } + + if (joined2 != NULL) { + free(joined2); + } + + return (libpostal_fuzzy_duplicate_status_t){dupe_status, max_sim}; +} + +inline libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME; + + bool do_acronyms = true; + + soft_tfidf_options_t soft_tfidf_options = soft_tfidf_default_options(); + + return is_fuzzy_duplicate(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms); +} + + +inline libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + 
normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET; + + // General purpose acronyms didn't make as much sense in the street name context + // things like County Road = CR should be handled by the address dictionaries + bool do_acronyms = false; + + soft_tfidf_options_t soft_tfidf_options = soft_tfidf_default_options(); + + return is_fuzzy_duplicate(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms); +} + diff --git a/src/dedupe.h b/src/dedupe.h new file mode 100644 index 00000000..5c40fb8c --- /dev/null +++ b/src/dedupe.h @@ -0,0 +1,23 @@ +#ifndef DEDUPE_H +#define DEDUPE_H + +#include +#include + +#include "libpostal.h" +#include "string_utils.h" + +libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); + +libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, 
libpostal_fuzzy_duplicate_options_t options); +libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); + + +#endif \ No newline at end of file diff --git a/src/double_metaphone.c b/src/double_metaphone.c new file mode 100644 index 00000000..54f03fad --- /dev/null +++ b/src/double_metaphone.c @@ -0,0 +1,981 @@ +#include +#include +#include + +#include "double_metaphone.h" +#include "string_utils.h" +#include "utf8proc/utf8proc.h" + +static bool is_vowel(char c) { + return (c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U' || c == 'Y'); +} + +static char get_char_at(char *str, size_t len, ssize_t idx) { + if (idx < 0 || idx >= len) return 0; + return str[idx]; +} + +static char *get_string_at(char *str, size_t len, ssize_t idx) { + if (idx < 0 || idx >= len) return NULL; + return str + idx; +} + +static inline bool is_slavo_germanic(char *s) { + return strstr(s, "W") + || strstr(s, "K") + || strstr(s, "CZ") + || strstr(s, "WITZ"); +} + +static inline bool substring_equals(char *str, size_t len, ssize_t index, size_t substr_len, ...) { + char *string_at_index = get_string_at(str, len, index); + if (string_at_index == NULL) return false; + + va_list args; + va_start(args, substr_len); + + bool matched = false; + + while (true) { + char *sub = va_arg(args, char *); + if (sub == NULL) break; + + if (utf8_compare_len(string_at_index, sub, substr_len) == 0) { + matched = true; + break; + } + } + + va_end(args); + + return matched; + +} + +double_metaphone_codes_t *double_metaphone(char *input) { + if (input == NULL) return NULL; + + char *ptr = utf8_upper(input); + + /* Note: NFD normalization will help with simple decomposable accent characters + like "É", "Ü", etc. which effectively become "E\u0301" and "U\u0308". It does + not handle characters like "Ł". 
For these, use Latin-ASCII transliteration + prior to calling this function. + + We can still check for a specific accented character like C with cedilla (Ç), + by comparing with its decomposed form i.e. "C\xcc\xa7" + */ + + char *normalized = (char *)utf8proc_NFD((utf8proc_uint8_t *)ptr); + + if (normalized != NULL) { + free(ptr); + ptr = normalized; + } + + if (ptr == NULL) { + return NULL; + } + + char *str = ptr; + + size_t len = strlen(str); + char_array *primary = char_array_new_size(len + 1); + char_array *secondary = char_array_new_size(len + 1); + + bool slavo_germanic = is_slavo_germanic(str); + + size_t current = 0; + size_t last = len - 1; + + if (substring_equals(str, len, current, 2, "ʻ", NULL)) { + str += 2; + } else if (get_char_at(str, len, current) == '\'') { + str++; + } + + if (substring_equals(str, len, current, 2, "GN", "KN", "PN", "WR", "PS", NULL)) { + current++; + } else if (get_char_at(str, len, current) == 'X') { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + current++; + } + + while (true) { + char c = *(str + current); + if (c == '\x00') break; + + if (current == 0 && is_vowel(c)) { + char_array_append(primary, "A"); + char_array_append(secondary, "A"); + current++; + continue; + } else if (c == 'B') { + /* "-mb", e.g", "dumb", already skipped over... 
*/ + char_array_append(primary, "P"); + char_array_append(secondary, "P"); + + if (get_char_at(str, len, current + 1) == 'B') { + current += 2; + } else { + current++; + } + continue; + // Ç - C with cedilla (denormalized) + } else if (substring_equals(str, len, current, 3, "C\xcc\xa7", NULL)) { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + current += 2; + } else if (c == 'C') { + // various germanic + if ((current > 1) + && !is_vowel(get_char_at(str, len, current - 2)) + && (substring_equals(str, len, current - 1, 3, "ACH", NULL) + && !substring_equals(str, len, current + 2, 1, "O", "A", "U", NULL)) + && ((get_char_at(str, len, current + 2) != 'I') + && ((get_char_at(str, len, current + 2) != 'E') + || substring_equals(str, len, current - 2, 6, "BACHER", "MACHER", NULL)) + ) + ) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + // special case for "caesar" + if ((current == 0) + && substring_equals(str, len, current, 6, "CAESAR", NULL)) + { + char_array_append(primary, "S"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + // Italian e.g. "chianti" + if (substring_equals(str, len, current, 4, "CHIA", NULL)) { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + if (substring_equals(str, len, current, 2, "CH", NULL)) { + // "michael" + if ((current > 0) + && substring_equals(str, len, current, 4, "CHAE", NULL)) + { + char_array_append(primary, "K"); + char_array_append(secondary, "X"); + current += 2; + continue; + } + + // Greek roots e.g. 
"chemistry", "chorus" + if ((current == 0) + && (substring_equals(str, len, current + 1, 5, "HARAC", "HARIS", "HOREO", NULL) + || substring_equals(str, len, current + 1, 4, "HIRO", "HAOS", "HAOT", NULL) + || (substring_equals(str, len, current + 1, 3, "HOR", "HYM", "HIA", "HEM", "HIM", NULL) && !substring_equals(str, len, current + 1, 5, "HEMIN", NULL))) + ) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + // Germanic, Greek, or otherwise "ch" for "kh" sound + if ( + (substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL) + || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL) + || substring_equals(str, len, 0, 3, "SCH", NULL)) + // "ochestra", "orchid", "architect" but not "arch" + || substring_equals(str, len, current - 2, 6, "ORCHES", "ARCHIT", "ORCHID", NULL) + || substring_equals(str, len, current + 2, 1, "T", "S", NULL) + || ( + (((current == 0) || substring_equals(str, len, current - 1, 1, "A", "O", "U", "E", NULL)) + // e.g. not "breach", "broach", "pouch", "beech", etc. + && !substring_equals(str, len, current - 2, 2, "EA", "OU", "EE", "OA", "OO", "AU", NULL) + // e.g. not "lunch", "birch", "gulch" + && !substring_equals(str, len, current - 1, 1, "L", "R", "N", NULL)) + // e.g. 
"wachtler", "wechsler", but not "tichner" + && ((current + 1 == last) || substring_equals(str, len, current + 2, 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", NULL)) + ) + ) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + } else { + if (current > 0) { + if (substring_equals(str, len, 0, 2, "MC", NULL)) { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "K"); + } + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + } + } + current += 2; + continue; + } + + // e.g, "czerny" + if (substring_equals(str, len, current, 2, "CZ", NULL) + && !substring_equals(str, len, current - 2, 4, "WICZ", NULL)) + { + char_array_append(primary, "S"); + char_array_append(secondary, "X"); + current += 2; + continue; + } + + // double 'C' but not if e.g. "McClellan" + if (substring_equals(str, len, current, 2, "CC", NULL) + && !((current == 1) && get_char_at(str, len, 0) == 'M')) + { + // "bellocchio" but not "bacchus" + if (substring_equals(str, len, current + 2, 1, "I", "E", "H", NULL) + && !substring_equals(str, len, current + 2, 3, "HUS", "HUM", "HUN", "HAN", NULL)) + { + // "accident", "accede", "succeed" + if (((current == 1) + && (get_char_at(str, len, current - 1) == 'A')) + || substring_equals(str, len, current - 1, 5, "UCCEE", "UCCES", NULL)) + { + char_array_append(primary, "KS"); + char_array_append(secondary, "KS"); + // "pinocchio" but not "riccio" or "picchu" + } else if (get_char_at(str, len, current + 2) == 'H' + && !substring_equals(str, len, current + 2, 2, "HU", "HA", NULL)) { + char_array_append(primary, "K"); + char_array_append(secondary, "X"); + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + } + current += 3; + continue; + } else { + // Pierce's rule + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + } + + if 
(substring_equals(str, len, current, 2, "CK", "CG", "CQ", NULL)) { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + if (substring_equals(str, len, current, 2, "CI", "CJ", "CE", "CY", NULL)) { + if (substring_equals(str, len, current, 3, "CIO", "CIE", "CIA", "CIU", NULL)) { + char_array_append(primary, "S"); + char_array_append(secondary, "X"); + } else { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } + current += 2; + continue; + } + + // else + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + + if (substring_equals(str, len, current + 1, 2, " C", " Q", " G", NULL)) { + current += 3; + } else if (substring_equals(str, len, current + 1, 1, "C", "K", "Q", NULL) + && !substring_equals(str, len, current + 1, 2, "CE", "CI", NULL)) + { + current += 2; + } else { + current++; + } + + continue; + } else if (c == 'D') { + if (substring_equals(str, len, current, 2, "DG", NULL)) { + if (substring_equals(str, len, current + 2, 1, "I", "E", "Y", NULL)) { + // e.g. 
"edge" + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + current += 3; + continue; + } else { + char_array_append(primary, "TK"); + char_array_append(secondary, "TK"); + current += 2; + continue; + } + } + + if (substring_equals(str, len, current, 2, "DT", "DD", NULL)) { + char_array_append(primary, "T"); + char_array_append(secondary, "T"); + current += 2; + continue; + } + + // else + char_array_append(primary, "T"); + char_array_append(secondary, "T"); + current++; + continue; + } else if (c == 'F') { + if (get_char_at(str, len, current + 1) == 'F') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "F"); + char_array_append(secondary, "F"); + continue; + } else if (c == 'G') { + if (get_char_at(str, len, current + 1) == 'H') { + if ((current > 0) && !is_vowel(get_char_at(str, len, current - 1))) { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + if (current == 0) { + // "ghislane", "ghiradelli" + if (get_char_at(str, len, current + 2) == 'I') { + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + } else { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + } + current += 2; + continue; + } + + // Parker's rule (with some further refinements) - e.g. "hugh" + if ( + ((current > 1) + && substring_equals(str, len, current - 2, 1, "B", "H", "D", NULL)) + // e.g. "bough" + || ((current > 2) + && substring_equals(str, len, current - 3, 1, "B", "H", "D", NULL)) + // e.g. "broughton" + || ((current > 3) + && substring_equals(str, len, current - 4, 1, "B", "H", NULL)) + ) + { + current += 2; + continue; + } else { + // e.g. 
"laugh", "McLaughlin", "cough", "gough", "rough", "tough" + if ((current > 2) + && (get_char_at(str, len, current - 1) == 'U') + && substring_equals(str, len, current - 3, 1, "C", "G", "L", "R", "T", NULL)) + { + char_array_append(primary, "F"); + char_array_append(secondary, "F"); + } else if ((current > 0) + && get_char_at(str, len, current - 1) != 'I') + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + } + current += 2; + continue; + } + + } + + if (get_char_at(str, len, current + 1) == 'N') { + if ((current == 1) && is_vowel(get_char_at(str, len, 0)) + && !slavo_germanic) + { + char_array_append(primary, "KN"); + char_array_append(secondary, "N"); + // not e.g. "cagney" + } else if (!substring_equals(str, len, current + 2, 2, "EY", NULL) + && (get_char_at(str, len, current + 1) != 'Y') + && !slavo_germanic) + { + char_array_append(primary, "N"); + char_array_append(secondary, "KN"); + } else { + char_array_append(primary, "KN"); + char_array_append(secondary, "KN"); + } + current += 2; + continue; + } + + // "tagliaro" + if (substring_equals(str, len, current + 1, 2, "LI", NULL) + && !slavo_germanic) + { + char_array_append(primary, "KL"); + char_array_append(secondary, "L"); + current += 2; + continue; + } + + // -ges-, -gep-, -gel-, -gie- at beginning + if ((current == 0) + && ((get_char_at(str, len, current + 1) == 'Y') + || substring_equals(str, len, current + 1, 2, "ES", "EP", + "EB", "EL", "EY", "IB", "IL", "IN", "IE", + "EI", "ER", NULL))) + { + char_array_append(primary, "K"); + char_array_append(secondary, "J"); + current += 2; + continue; + } + + // -ger-, -gy- + if ( + (substring_equals(str, len, current + 1, 2, "ER", NULL) + || (get_char_at(str, len, current + 1) == 'Y')) + && !substring_equals(str, len, 0, 6, "DANGER", "RANGER", "MANGER", NULL) + && !substring_equals(str, len, current - 1, 1, "E", "I", NULL) + && !substring_equals(str, len, current - 1, 3, "RGY", "OGY", NULL) + ) + { + char_array_append(primary, "K"); + 
char_array_append(secondary, "J"); + current += 2; + continue; + } + + // italian e.g. "viaggi" + if (substring_equals(str, len, current + 1, 1, "E", "I", "Y", NULL) + || substring_equals(str, len, current - 1, 4, "AGGI", "OGGI", NULL)) + { + // obvious germanic + if ( + (substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL) + || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL) + || substring_equals(str, len, 0, 3, "SCH", NULL)) + || substring_equals(str, len, current + 1, 2, "ET", NULL)) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + + } else { + if (substring_equals(str, len, current + 1, 4, "IER ", NULL) + || ((current == len - 3) && substring_equals(str, len, current + 1, 3, "IER", NULL))) + { + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + } else { + char_array_append(primary, "J"); + char_array_append(secondary, "K"); + } + } + current += 2; + continue; + } + + if (get_char_at(str, len, current + 1) == 'G') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + continue; + } else if (c == 'H') { + // only keep if first & before vowel or between 2 vowels + if (((current == 0) || is_vowel(get_char_at(str, len, current - 1))) + && is_vowel(get_char_at(str, len, current + 1))) + { + char_array_append(primary, "H"); + char_array_append(secondary, "H"); + current += 2; + // also takes care of "HH" + } else { + current++; + } + continue; + } else if (c == 'J') { + // obvious Spanish, "Jose", "San Jacinto" + if (substring_equals(str, len, current, 4, "JOSE", NULL) + || substring_equals(str, len, current, 5, "JOSÉ", NULL) + || substring_equals(str, len, 0, 4, "SAN ", NULL)) + { + if (((current == 0) + && (get_char_at(str, len, current + 4) == ' ')) + || substring_equals(str, len, 0, 4, "SAN ", NULL)) + { + char_array_append(primary, "H"); + char_array_append(secondary, "H"); + } else { + char_array_append(primary, "J"); + 
char_array_append(secondary, "H"); + } + + current++; + continue; + } + + if ((current == 0) + && !substring_equals(str, len, current, 4, "JOSE", NULL) + && !substring_equals(str, len, current, 5, "JOSÉ", NULL)) + { + // Yankelovich/Jankelowicz + char_array_append(primary, "J"); + char_array_append(secondary, "A"); + current++; + continue; + } else { + // Spanish pronoun of e.g. "bajador" + if (is_vowel(get_char_at(str, len, current - 1)) + && !slavo_germanic + && ((get_char_at(str, len, current + 1) == 'A') + || (get_char_at(str, len, current + 1) == 'O'))) + { + char_array_append(primary, "J"); + char_array_append(secondary, "H"); + } else { + if (current == last || ((current == last - 1 || get_char_at(str, len, current + 2) == ' ') && isalpha(get_char_at(str, len, current - 1)) && substring_equals(str, len, current + 1, 1, "A", "O", NULL))) { + char_array_append(primary, "J"); + } else { + if (!substring_equals(str, len, current + 1, 1, "L", "T", + "K", "S", "N", "M", "B", "Z", NULL) + && !substring_equals(str, len, current - 1, 1, "S", "K", "L", NULL)) + { + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + } + } + } + + // it could happen! + if (get_char_at(str, len, current + 1) == 'J') { + current += 2; + } else { + current++; + } + continue; + } + } else if (c == 'K') { + if (get_char_at(str, len, current + 1) == 'K') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + continue; + } else if (c == 'L') { + if (get_char_at(str, len, current + 1) == 'L') { + // Spanish e.g. 
"Cabrillo", "Gallegos" + if (((current == (len - 3)) + && substring_equals(str, len, current - 1, 4, "ILLO", "ILLA", "ALLE", NULL)) + || ((substring_equals(str, len, last - 1, 2, "AS", "OS", NULL) + || substring_equals(str, len, last, 1, "A", "O", NULL)) + && substring_equals(str, len, current - 1, 4, "ALLE", NULL) + ) + ) + { + char_array_append(primary, "L"); + current += 2; + continue; + } + + current += 2; + } else { + current++; + } + char_array_append(primary, "L"); + char_array_append(secondary, "L"); + continue; + } else if (c == 'M') { + if ((substring_equals(str, len, current - 1, 3, "UMB", NULL) + && (((current + 1) == last) + || substring_equals(str, len, current + 2, 2, "ER", NULL))) + || (get_char_at(str, len, current + 1) == 'M')) + { + current += 2; + } else { + current++; + } + char_array_append(primary, "M"); + char_array_append(secondary, "M"); + continue; + // Ñ (NFD normalized) + } else if (substring_equals(str, len, current, 3, "N\xcc\x83", NULL)) { + current += 3; + char_array_append(primary, "N"); + char_array_append(secondary, "N"); + continue; + } else if (c == 'N') { + if (get_char_at(str, len, current + 1) == 'N') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "N"); + char_array_append(secondary, "N"); + continue; + } else if (c == 'P') { + if (substring_equals(str, len, current + 1, 1, "H", "F", NULL)) { + char_array_append(primary, "F"); + char_array_append(secondary, "F"); + current += 2; + continue; + } + + // also account for "Campbell", "raspberry" + if (substring_equals(str, len, current + 1, 1, "P", "B", NULL)) { + current += 2; + } else { + current++; + } + + char_array_append(primary, "P"); + char_array_append(secondary, "P"); + continue; + } else if (c == 'Q') { + if (get_char_at(str, len, current + 1) == 'Q') { + current += 2; + } else { + current += 1; + } + + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + continue; + } else if (c == 'R') { + // french e.g. 
"rogier", but exclude "hochmeier" + if ((current == last) + && !slavo_germanic + && substring_equals(str, len, current - 2, 2, "IE", NULL) + && !substring_equals(str, len, current - 4, 2, "ME", "MA", NULL)) + { + char_array_append(secondary, "R"); + } else { + char_array_append(primary, "R"); + char_array_append(secondary, "R"); + } + + if (get_char_at(str, len, current + 1) == 'R') { + current += 2; + } else { + current++; + } + continue; + } else if (c == 'S') { + // special cases "island", "isle", "carlisle", "carlysle" + if (substring_equals(str, len, current - 1, 3, "ISL", "YSL", NULL)) { + current++; + continue; + } + + // special case "sugar-" + if ((current == 0) + && substring_equals(str, len, current, 5, "SUGAR", NULL)) + { + char_array_append(primary, "X"); + char_array_append(secondary, "S"); + current++; + continue; + } + + if (substring_equals(str, len, current, 2, "SH", NULL)) { + // Germanic + if (substring_equals(str, len, current + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ", NULL)) { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + } + current += 2; + continue; + } + + // Italian & Armenian + if (substring_equals(str, len, current, 3, "SIO", "SIA", NULL) + || substring_equals(str, len, current, 4, "SIAN", NULL)) + { + if (!slavo_germanic) { + char_array_append(primary, "S"); + char_array_append(secondary, "X"); + } else { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } + current += 3; + continue; + } + + /* German & Anglicisations, e.g. 
"Smith" match "Schmidt", "Snider" match "Schneider" + also, -sz- in Slavic language although in Hungarian it is pronounced 's' */ + if (((current == 0) + && substring_equals(str, len, current + 1, 1, "M", "N", "L", "W", NULL)) + || substring_equals(str, len, current + 1, 1, "Z", NULL)) + { + char_array_append(primary, "S"); + char_array_append(secondary, "X"); + if (substring_equals(str, len, current + 1, 1, "Z", NULL)) { + current += 2; + } else { + current++; + } + continue; + } + + + if (substring_equals(str, len, current, 2, "SC", NULL)) { + // Schlesinger's rule + if (get_char_at(str, len, current + 2) == 'H') { + // Dutch origin e.g. "school", "schooner" + if (substring_equals(str, len, current + 3, 2, "OO", "ER", "EN", + "UY", "ED", "EM", NULL)) + { + // "Schermerhorn", "Schenker" + if (substring_equals(str, len, current + 3, 2, "ER", "EN", NULL)) { + char_array_append(primary, "X"); + char_array_append(secondary, "SK"); + } else { + char_array_append(primary, "SK"); + char_array_append(secondary, "SK"); + } + current += 3; + continue; + } else { + if ((current == 0) && !is_vowel(get_char_at(str, len, 3)) + && (get_char_at(str, len, 3) != 'W')) + { + char_array_append(primary, "X"); + char_array_append(secondary, "S"); + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + } + current += 3; + continue; + } + + if (substring_equals(str, len, current + 2, 1, "I", "E", "Y", NULL)) { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + current += 3; + continue; + } + + char_array_append(primary, "SK"); + char_array_append(secondary, "SK"); + current += 3; + continue; + } + } + + // French e.g. 
"resnais", "artois" + if ((current == last) + && substring_equals(str, len, current - 2, 2, "AI", "OI", NULL)) + { + char_array_append(secondary, "S"); + } else { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } + + if (substring_equals(str, len, current + 1, 1, "S", "Z", NULL)) { + + current += 2; + } else { + current++; + } + continue; + } else if (c == 'T') { + + if (substring_equals(str, len, current, 4, "TION", NULL)) { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + current += 3; + continue; + } + + if (substring_equals(str, len, current, 3, "TIA", "TCH", NULL)) { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + current += 3; + continue; + } + + if (substring_equals(str, len, current, 2, "TH", NULL) + || substring_equals(str, len, current, 3, "TTH", NULL)) + { + // special case "Thomas", "Thames", or Germanic + if (substring_equals(str, len, current + 2, 2, "OM", "AM", NULL) + || substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL) + || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL) + || substring_equals(str, len, 0, 3, "SCH", NULL)) + { + char_array_append(primary, "T"); + char_array_append(secondary, "T"); + } else { + // yes, zero + char_array_append(primary, "0"); + char_array_append(secondary, "T"); + } + + current += 2; + continue; + } + + if (substring_equals(str, len, current + 1, 1, "T", "D", NULL)) { + current += 2; + } else { + current++; + } + + char_array_append(primary, "T"); + char_array_append(secondary, "T"); + continue; + } else if (c == 'V') { + if (get_char_at(str, len, current + 1) == 'V') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "F"); + char_array_append(secondary, "F"); + continue; + } else if (c == 'W') { + // can also be in the middle of word + if (substring_equals(str, len, current, 2, "WR", NULL)) { + char_array_append(primary, "R"); + char_array_append(secondary, "R"); + current += 2; + continue; + 
} + + if ((current == 0) + && (is_vowel(get_char_at(str, len, current + 1)) + || substring_equals(str, len, current, 2, "WH", NULL))) + { + // Wasserman should match Vasserman + if (is_vowel(get_char_at(str, len, current + 1))) { + char_array_append(primary, "A"); + char_array_append(secondary, "F"); + } else { + // need Uomo to match Womo + char_array_append(primary, "A"); + char_array_append(secondary, "A"); + } + } + + // Arnow should match Arnoff + if (((current == last) && is_vowel(get_char_at(str, len, current - 1))) + || substring_equals(str, len, current - 1, 5, "EWSKI", "EWSKY", + "OWSKI", "OWSKY", NULL) + || substring_equals(str, len, 0, 3, "SCH", NULL)) + { + char_array_append(secondary, "F"); + current++; + continue; + } + + // Polish e.g. "Filipowicz" + if (substring_equals(str, len, current, 4, "WICZ", "WITZ", NULL)) { + char_array_append(primary, "TS"); + char_array_append(secondary, "FX"); + current += 4; + continue; + } + + // else skip it + current++; + continue; + } else if (c == 'X') { + // French e.g. "breaux" + if (!((current == last) + && (substring_equals(str, len, current - 3, 3, "IAU", "EAU", NULL) + || substring_equals(str, len, current - 2, 2, "AU", "OU", NULL)))) + { + char_array_append(primary, "KS"); + char_array_append(secondary, "KS"); + } + + if (substring_equals(str, len, current + 1, 1, "C", "X", NULL)) { + current += 2; + } else { + current++; + } + continue; + } else if (c == 'Z') { + // Chinese Pinyin e.g. 
"Zhao" + if (get_char_at(str, len, current + 1) == 'H') { + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + current += 2; + continue; + } else if (substring_equals(str, len, current + 1, 2, "ZO", "ZI", "ZA", NULL) + || (slavo_germanic + && ((current > 0) + && get_char_at(str, len, current - 1) != 'T'))) + { + char_array_append(primary, "S"); + char_array_append(secondary, "TS"); + } else { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } + + if (get_char_at(str, len, current + 1) == 'Z') { + current += 2; + } else { + current++; + } + continue; + } else { + current++; + } + } + + double_metaphone_codes_t *codes = calloc(1, sizeof(double_metaphone_codes_t)); + if (codes == NULL) { + char_array_destroy(primary); + char_array_destroy(secondary); + return NULL; + } + + codes->primary = char_array_to_string(primary); + codes->secondary = char_array_to_string(secondary); + + free(ptr); + + return codes; +} + +void double_metaphone_codes_destroy(double_metaphone_codes_t *codes) { + if (codes != NULL) { + if (codes->primary != NULL) { + free(codes->primary); + } + + if (codes->secondary != NULL) { + free(codes->secondary); + } + + free(codes); + } +} \ No newline at end of file diff --git a/src/double_metaphone.h b/src/double_metaphone.h new file mode 100644 index 00000000..64dac8a7 --- /dev/null +++ b/src/double_metaphone.h @@ -0,0 +1,17 @@ +#ifndef DOUBLE_METAPHONE__H +#define DOUBLE_METAPHONE__H + +#include +#include + +typedef struct double_metaphone_codes { + char *primary; + char *secondary; +} double_metaphone_codes_t; + +double_metaphone_codes_t *double_metaphone(char *input); + +void double_metaphone_codes_destroy(double_metaphone_codes_t *codes); + +#endif + diff --git a/src/expand.c b/src/expand.c new file mode 100644 index 00000000..b0d62e3c --- /dev/null +++ b/src/expand.c @@ -0,0 +1,1640 @@ +#include + +#include "expand.h" + +#include "log/log.h" + +#include "address_dictionary.h" +#include "collections.h" 
+#include "constants.h" +#include "language_classifier.h" +#include "numex.h" +#include "normalize.h" +#include "scanner.h" +#include "string_utils.h" +#include "token_types.h" +#include "transliterate.h" + + +#define DEFAULT_KEY_LEN 32 + +#define EXCESSIVE_PERMUTATIONS 100 + +inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) { + uint64_t normalize_token_options = 0; + + normalize_token_options |= options.delete_final_periods ? NORMALIZE_TOKEN_DELETE_FINAL_PERIOD : 0; + normalize_token_options |= options.delete_acronym_periods ? NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS : 0; + normalize_token_options |= options.drop_english_possessives ? NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES : 0; + normalize_token_options |= options.delete_apostrophes ? NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE : 0; + + return normalize_token_options; +} + +inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) { + uint64_t normalize_string_options = 0; + normalize_string_options |= options.transliterate ? NORMALIZE_STRING_TRANSLITERATE : 0; + normalize_string_options |= options.latin_ascii ? NORMALIZE_STRING_LATIN_ASCII : 0; + normalize_string_options |= options.decompose ? NORMALIZE_STRING_DECOMPOSE : 0; + normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0; + normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0; + normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0; + normalize_string_options |= options.expand_numex ? 
NORMALIZE_STRING_REPLACE_NUMEX : 0; + + return normalize_string_options; +} + + +static inline size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) { + size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); + + int32_t unichr = 0; + const uint8_t *ptr = (const uint8_t *)str; + + if (len_ordinal_suffix > 0) { + ssize_t start = 0; + size_t token_offset = token.offset; + size_t token_len = token.len; + + if (len_ordinal_suffix < token.len) { + start = token.offset + token.len - len_ordinal_suffix; + token_offset = token.offset; + token_len = token.len - len_ordinal_suffix; + } else { + start = prev_token.offset + prev_token.len; + token_offset = prev_token.offset; + token_len = prev_token.len; + } + ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); + if (prev_char_len <= 0) return 0; + if (!utf8_is_digit(utf8proc_category(unichr)) && !is_likely_roman_numeral_len(str + token_offset, token_len)) { + return 0; + } + } else { + return 0; + } + + return len_ordinal_suffix; +} + +void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { + + uint64_t normalize_token_options = get_normalize_token_options(options); + + if (token.type != WHITESPACE ) { + + bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len); + + if (!contains_hyphen || token.type == HYPHEN) { + log_debug("str = %s, token = {%zu, %zu, %u}\n", str, token.offset, token.len, token.type); + normalize_token(strings, str, token, normalize_token_options); + } else if (is_word_token(token.type)) { + + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + token.offset, token.len); + if (prefix_hyphen_len > 0) { + token.offset += prefix_hyphen_len; + } + + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + token.offset, token.len); + if (suffix_hyphen_len > 0) { + token.len -= suffix_hyphen_len; + } + + normalize_token(strings, 
str, token, normalize_token_options); + + if (options.replace_word_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS; + } + + if (options.delete_word_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; + } + + } else if (is_numeric_token(token.type)) { + + normalize_token(strings, str, token, normalize_token_options); + + if (options.replace_word_hyphens || options.replace_numeric_hyphens) { + if (options.replace_word_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; + } + + if (options.replace_numeric_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS; + } + + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS; + } + + if (options.delete_numeric_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; + } + } + + if (is_numeric_token(token.type) && options.split_alpha_from_numeric) { + bool split_alpha_from_numeric = true; + + for (size_t i = 0; i < options.num_languages; i++) { + char *lang = options.languages[i]; + if (valid_ordinal_suffix_len(str, token, NULL_TOKEN, lang) > 1) { + split_alpha_from_numeric = false; + break; + } + } + + if (split_alpha_from_numeric) { + normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + } + } + } else { + cstring_array_add_string(strings, " "); + } +} + +void 
add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) { + cstring_array_add_string(strings, str); + + if (options.roman_numerals) { + char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE); + if (numex_replaced != NULL) { + cstring_array_add_string(strings, numex_replaced); + free(numex_replaced); + } + + } + +} + + + +address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + if (value != NULL && value->components & options.address_components) { + return value->expansions; + } + + return NULL; +} + +inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) { + if (expansion.canonical_index != NULL_CANONICAL_INDEX) { + char *canonical = address_dictionary_get_canonical(expansion.canonical_index); + uint64_t normalize_string_options = get_normalize_string_options(options); + char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); + canonical = canonical_normalized != NULL ? canonical_normalized : canonical; + + char_array_cat(key, canonical); + if (canonical_normalized != NULL) { + free(canonical_normalized); + } + } else { + char_array_cat_len(key, str + token.offset + phrase.start, phrase.len); + } +} + + +bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) { + cstring_array *strings = tree->strings; + + size_t skip_period = with_period ? 
1 : 0; + + bool have_suffix = suffix.len > 0 && suffix.len < token.len; + bool have_prefix = prefix.len > 0 && prefix.len + with_period < token.len; + + if (!have_suffix && !have_prefix) { + return false; + } + + address_expansion_array *prefix_expansions = NULL; + address_expansion_array *suffix_expansions = NULL; + + address_expansion_t prefix_expansion; + address_expansion_t suffix_expansion; + + char *expansion; + + size_t num_strings = 0; + char *root_word = NULL; + size_t root_len; + token_t root_token; + cstring_array *root_strings = NULL; + int add_space = 0; + int spaces = 0; + + size_t prefix_start, prefix_end, root_end, suffix_start; + + if (have_prefix) { + prefix_expansions = valid_affix_expansions(prefix, options); + if (prefix_expansions == NULL) have_prefix = false; + } + + if (have_suffix) { + suffix_expansions = valid_affix_expansions(suffix, options); + if (suffix_expansions == NULL) have_suffix = false; + } + + if (!have_suffix && !have_prefix) { + return false; + } + + char_array *key = char_array_new_size(token.len); + + if (have_prefix && have_suffix) { + for (size_t i = 0; i < prefix_expansions->n; i++) { + prefix_expansion = prefix_expansions->a[i]; + char_array_clear(key); + + cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); + prefix_start = key->n - 1; + + add_space = (int)prefix_expansion.separable || with_period; + if (prefix.len + skip_period + suffix.len < token.len && !prefix_expansion.separable) { + add_space = suffix_expansion.separable || with_period; + } + + for (spaces = skip_period; spaces <= add_space; spaces++) { + key->n = prefix_start; + if (spaces) { + char_array_cat(key, " "); + } + + prefix_end = key->n; + + if (prefix.len + skip_period + suffix.len < token.len) { + root_len = token.len - suffix.len - prefix.len - skip_period; + size_t root_start = token.offset + prefix.len + skip_period; + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len); + root_start += 
prefix_hyphen_len; + root_len -= prefix_hyphen_len; + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len); + root_len -= suffix_hyphen_len; + root_token = (token_t){root_start, root_len, token.type}; + root_strings = cstring_array_new_size(root_len); + add_normalized_strings_token(root_strings, str, root_token, options); + num_strings = cstring_array_num_strings(root_strings); + + for (size_t j = 0; j < num_strings; j++) { + key->n = prefix_end; + root_word = cstring_array_get_string(root_strings, j); + char_array_cat(key, root_word); + root_end = key->n - 1; + + for (size_t k = 0; k < suffix_expansions->n; k++) { + key->n = root_end; + suffix_expansion = suffix_expansions->a[k]; + + int add_suffix_space = suffix_expansion.separable; + + suffix_start = key->n; + for (int suffix_spaces = skip_period; suffix_spaces <= add_suffix_space; suffix_spaces++) { + key->n = suffix_start; + if (suffix_spaces) { + char_array_cat(key, " "); + } + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + expansion = char_array_get_string(key); + cstring_array_add_string(strings, expansion); + + } + + + } + } + + cstring_array_destroy(root_strings); + root_strings = NULL; + + } else { + for (size_t j = 0; j < suffix_expansions->n; j++) { + key->n = prefix_end - skip_period; + suffix_expansion = suffix_expansions->a[j]; + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + } + } + + } + } else if (have_suffix) { + log_debug("suffix.start=%" PRId32 "\n", suffix.start); + root_len = suffix.start; + root_token = (token_t){token.offset, root_len, token.type}; + log_debug("root_len=%zu\n", root_len); + log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type); + + root_strings = cstring_array_new_size(root_len + 1); + add_normalized_strings_token(root_strings, str, root_token, 
options); + num_strings = cstring_array_num_strings(root_strings); + + log_debug("num_strings = %zu\n", num_strings); + + for (size_t j = 0; j < num_strings; j++) { + char_array_clear(key); + root_word = cstring_array_get_string(root_strings, j); + log_debug("root_word=%s\n", root_word); + char_array_cat(key, root_word); + root_end = key->n - 1; + + for (size_t k = 0; k < suffix_expansions->n; k++) { + key->n = root_end; + suffix_expansion = suffix_expansions->a[k]; + + add_space = (suffix_expansion.separable || with_period) && suffix.len < token.len; + suffix_start = key->n; + + for (int spaces = skip_period; spaces <= add_space; spaces++) { + key->n = suffix_start; + if (spaces) { + char_array_cat(key, " "); + } + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + } + } + } else if (have_prefix) { + if (prefix.len + skip_period <= token.len) { + root_len = token.len - prefix.len - skip_period; + size_t root_start = token.offset + prefix.len + skip_period; + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len); + root_start += prefix_hyphen_len; + root_len -= prefix_hyphen_len; + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len); + root_len -= suffix_hyphen_len; + root_token = (token_t){root_start, root_len, token.type}; + root_strings = cstring_array_new_size(root_len); + add_normalized_strings_token(root_strings, str, root_token, options); + num_strings = cstring_array_num_strings(root_strings); + + } else { + root_strings = cstring_array_new_size(token.len); + add_normalized_strings_token(root_strings, str, token, options); + num_strings = cstring_array_num_strings(root_strings); + + for (size_t k = 0; k < num_strings; k++) { + root_word = cstring_array_get_string(root_strings, k); + cstring_array_add_string(tree->strings, root_word); + } + + char_array_destroy(key); + 
cstring_array_destroy(root_strings); + return false; + + } + + for (size_t j = 0; j < prefix_expansions->n; j++) { + char_array_clear(key); + prefix_expansion = prefix_expansions->a[j]; + + cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); + prefix_end = key->n - 1; + + add_space = (prefix_expansion.separable || with_period) && prefix.len + skip_period < token.len; + for (int spaces = skip_period; spaces <= add_space; spaces++) { + key->n = prefix_end; + if (spaces) { + char_array_cat(key, " "); + } + size_t prefix_space_len = key->n - spaces; + for (size_t k = 0; k < num_strings; k++) { + key->n = prefix_space_len; + root_word = cstring_array_get_string(root_strings, k); + char_array_cat(key, root_word); + + expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + + } + } + } + + char_array_destroy(key); + + if (root_strings != NULL) { + cstring_array_destroy(root_strings); + } + + return true; + +} + +inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang); + + phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang); + + if ((suffix.len == 0 && prefix.len == 0)) return false; + + bool with_period = false; + + return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); +} + +inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + ssize_t first_period_index = string_next_period_len(str + token.offset, token.len); + if (first_period_index > 0) { + ssize_t next_period_index = string_next_period_len(str + token.offset + first_period_index + 1, token.len - first_period_index - 1); + // Token contains only one period or one + a final period + if (next_period_index < 0 || next_period_index == token.len - 
1) { + phrase_t prefix = search_address_dictionaries_substring(str + token.offset, first_period_index, lang); + + phrase_t suffix = search_address_dictionaries_substring(str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang); + if (suffix.len > 0) { + suffix.start = first_period_index + 1; + } + + if (suffix.len == 0 && prefix.len == 0) return false; + + bool with_period = true; + + return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); + } else { + return false; + } + } else { + return false; + } +} + +bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) { + bool have_period_affixes = false; + if (string_contains_period_len(str + token.offset, token.len)) { + for (size_t l = 0; l < options.num_languages; l++) { + char *lang = options.languages[l]; + if (expand_affixes_period(tree, str, lang, token, options)) { + have_period_affixes = true; + break; + } + } + } + + if (!have_period_affixes) { + string_tree_add_string_len(tree, str + token.offset, token.len); + } + + return have_period_affixes; +} + + +static inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_ACADEMIC_DEGREE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_BUILDING_TYPE: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_UNIT; + case DICTIONARY_COMPANY_TYPE: + return LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_ELISION: + return LIBPOSTAL_ADDRESS_ANY; + case DICTIONARY_ENTRANCE: + return LIBPOSTAL_ADDRESS_ENTRANCE; + case DICTIONARY_HOUSE_NUMBER: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER; + case DICTIONARY_LEVEL_NUMBERED: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_STANDALONE: + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY); + case 
DICTIONARY_LEVEL_MEZZANINE: + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL| LIBPOSTAL_ADDRESS_ANY); + case DICTIONARY_LEVEL_BASEMENT: + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY); + case DICTIONARY_LEVEL_SUB_BASEMENT: + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY); + case DICTIONARY_NUMBER: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_NO_NUMBER: + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY); + case DICTIONARY_PERSONAL_TITLE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_PLACE_NAME: + return LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_POST_OFFICE: + return LIBPOSTAL_ADDRESS_PO_BOX; + case DICTIONARY_POSTAL_CODE: + return LIBPOSTAL_ADDRESS_POSTAL_CODE; + case DICTIONARY_QUALIFIER: + return LIBPOSTAL_ADDRESS_TOPONYM; + case DICTIONARY_STAIRCASE: + return LIBPOSTAL_ADDRESS_STAIRCASE; + case DICTIONARY_STOPWORD: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; + case DICTIONARY_STREET_TYPE: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_UNIT_NUMBERED: + return LIBPOSTAL_ADDRESS_UNIT; + case DICTIONARY_UNIT_STANDALONE: + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY); + case DICTIONARY_UNIT_DIRECTION: + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY); + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + + +static inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM | 
LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE; + case DICTIONARY_STOPWORD: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; + case DICTIONARY_STREET_TYPE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_SYNONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +static inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) { + switch (dictionary_id) { + // Pre/post directionals can be removed if there are non-phrase tokens + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_COMPANY_TYPE: + return LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_PLACE_NAME: + return LIBPOSTAL_ADDRESS_NAME; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +static inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_LEVEL_STANDALONE: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_MEZZANINE: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_BASEMENT: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_SUB_BASEMENT: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_UNIT_STANDALONE: + return LIBPOSTAL_ADDRESS_UNIT; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + + +static inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_ACADEMIC_DEGREE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_PERSONAL_TITLE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_NUMBER: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case 
DICTIONARY_PLACE_NAME: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_QUALIFIER: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_SYNONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_TOPONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +typedef enum { + GAZETTEER_MATCH_IGNORABLE, + GAZETTEER_MATCH_EDGE_IGNORABLE, + GAZETTEER_MATCH_POSSIBLE_ROOT, + GAZETTEER_MATCH_SPECIFIER, + GAZETTEER_MATCH_VALID_COMPONENTS +} gazetteer_match_type_t; + + +static inline bool address_expansion_matches_type_for_components(address_expansion_t expansion, uint32_t address_components, gazetteer_match_type_t match_type) { + for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { + uint16_t dictionary_id = expansion.dictionary_ids[j]; + uint32_t components = 0; + switch (match_type) { + case GAZETTEER_MATCH_IGNORABLE: + components = gazetteer_ignorable_components(dictionary_id); + break; + case GAZETTEER_MATCH_EDGE_IGNORABLE: + components = gazetteer_edge_ignorable_components(dictionary_id); + break; + case GAZETTEER_MATCH_POSSIBLE_ROOT: + components = gazetteer_possible_root_components(dictionary_id); + break; + case GAZETTEER_MATCH_SPECIFIER: + components = gazetteer_specifier_components(dictionary_id); + break; + case GAZETTEER_MATCH_VALID_COMPONENTS: + components = gazetteer_valid_components(dictionary_id); + break; + default: + break; + } + if (components & address_components) { + return true; + } + } + return false; +} + +bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_IGNORABLE); +} + +bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, 
address_components, GAZETTEER_MATCH_EDGE_IGNORABLE); +} + +bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT); +} + +bool address_expansion_is_specifier_for_components(address_expansion_t expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER); +} + +bool address_expansion_is_valid_for_components(address_expansion_t expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); +} + + +bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions->a[i]; + + if (address_expansion_matches_type_for_components(expansion, address_components, match_type)) { + return true; + } + } + return false; +} + +inline bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_IGNORABLE); +} + +inline bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE); +} + + +inline bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) { + return 
address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT); +} + +inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER); +} + +inline bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); +} + + +bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { + address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + address_expansion_t *expansions_array = expansions->a; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions_array[i]; + if (!address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) { + return true; + } + } + return false; +} + +// Delete non-canonical phrases only + +string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { + char_array *key = NULL; + + log_debug("input=%s\n", str); + token_array *token_array = tokenize_keep_whitespace(str); + + if (token_array == NULL) { + return NULL; + } + + size_t len = strlen(str); + + token_t *tokens = token_array->a; + size_t num_tokens = token_array->n; + + log_debug("tokenized, num tokens=%zu\n", num_tokens); + + bool last_was_punctuation = false; + + phrase_language_array *phrases = NULL; + phrase_array *lang_phrases = NULL; + + for (size_t i = 0; i < options.num_languages; i++) { + char *lang = options.languages[i]; + log_debug("lang=%s\n", lang); + + lang_phrases = search_address_dictionaries_tokens(str, token_array, lang); + + if (lang_phrases == 
NULL) { + log_debug("lang_phrases NULL\n"); + continue; + } + + log_debug("lang_phrases->n = %zu\n", lang_phrases->n); + + phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); + + for (size_t j = 0; j < lang_phrases->n; j++) { + phrase_t p = lang_phrases->a[j]; + log_debug("lang=%s, (%d, %d)\n", lang, p.start, p.len); + phrase_language_array_push(phrases, (phrase_language_t){lang, p}); + } + + phrase_array_destroy(lang_phrases); + } + + + lang_phrases = search_address_dictionaries_tokens(str, token_array, ALL_LANGUAGES); + if (lang_phrases != NULL) { + phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); + + for (size_t j = 0; j < lang_phrases->n; j++) { + phrase_t p = lang_phrases->a[j]; + phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p}); + } + phrase_array_destroy(lang_phrases); + + } + + string_tree_t *tree = string_tree_new_size(len); + + bool last_added_was_whitespace = false; + + uint64_t normalize_string_options = get_normalize_string_options(options); + + if (phrases != NULL) { + log_debug("phrases not NULL, n=%zu\n", phrases->n); + ks_introsort(phrase_language_array, phrases->n, phrases->a); + + phrase_language_t phrase_lang; + + size_t start = 0; + size_t end = 0; + + phrase_t phrase = NULL_PHRASE; + phrase_t prev_phrase = NULL_PHRASE; + + key = key != NULL ? 
key : char_array_new_size(DEFAULT_KEY_LEN); + + log_debug("phrase_option = %d\n", phrase_option); + + bool delete_phrases = phrase_option == DELETE_PHRASES; + bool expand_phrases = phrase_option == EXPAND_PHRASES; + + size_t num_phrases = phrases->n; + + bool have_non_phrase_tokens = false; + bool have_canonical_phrases = false; + bool have_ambiguous = false; + bool have_possible_root = false; + bool have_strictly_ignorable = false; + bool have_strictly_ignorable_abbreviation = false; + + size_t prev_phrase_end = 0; + + if (delete_phrases) { + for (size_t i = 0; i < num_phrases; i++) { + phrase_lang = phrases->a[i]; + phrase = phrase_lang.phrase; + + log_debug("phrase.start = %zu, prev_phrase_end = %zu\n", phrase.start, prev_phrase_end); + + token_t inter_token; + if (phrase.start > prev_phrase_end) { + for (size_t j = prev_phrase_end; j < phrase.start; j++) { + inter_token = tokens[j]; + if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) { + log_debug("have_non_phrase_tokens\n"); + have_non_phrase_tokens = true; + break; + } + } + } + + if (i == num_phrases - 1 && phrase.start + phrase.len < num_tokens) { + for (size_t j = phrase.start + phrase.len; j < num_tokens; j++) { + inter_token = tokens[j]; + if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) { + have_non_phrase_tokens = true; + break; + } + } + } + + bool phrase_is_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool phrase_is_strictly_ignorable = address_phrase_is_ignorable_for_components(phrase, options.address_components) && !phrase_is_ambiguous; + bool phrase_is_canonical = address_phrase_has_canonical_interpretation(phrase); + + have_non_phrase_tokens = have_non_phrase_tokens || (!phrase_is_strictly_ignorable && !phrase_is_ambiguous); + have_strictly_ignorable = have_strictly_ignorable || phrase_is_strictly_ignorable; + have_strictly_ignorable_abbreviation = have_strictly_ignorable_abbreviation || 
(phrase_is_strictly_ignorable && !phrase_is_canonical); + if (have_strictly_ignorable_abbreviation) { + log_debug("have_strictly_ignorable=%zu, phrase_is_canonical=%zu\n", have_strictly_ignorable, phrase_is_canonical); + } + + have_possible_root = have_possible_root | address_phrase_is_possible_root_for_components(phrase, options.address_components); + + have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous); + have_ambiguous = have_ambiguous || phrase_is_ambiguous; + + if (have_non_phrase_tokens) { + break; + } + + prev_phrase_end = phrase.start + phrase.len; + } + + + log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens); + log_debug("have_canonical_phrases = %d\n", have_canonical_phrases); + log_debug("have_ambiguous = %d\n", have_ambiguous); + log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable); + log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation); + } + + bool skipped_last_edge_phrase = false; + + for (size_t i = 0; i < phrases->n; i++) { + phrase_lang = phrases->a[i]; + + phrase = phrase_lang.phrase; + + log_debug("phrase.start=%d, phrase.len=%d, lang=%s, prev_phrase.start=%d, prev_phrase.len=%d\n", phrase.start, phrase.len, phrase_lang.language, prev_phrase.start, prev_phrase.len); + + if ((phrase.start > prev_phrase.start && phrase.start < prev_phrase.start + prev_phrase.len) || (phrase.start == prev_phrase.start && i > 0 && phrase.len < prev_phrase.len)) { + log_debug("continuing\n"); + continue; + } + + char_array_clear(key); + + char_array_cat(key, phrase_lang.language); + char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); + + size_t namespace_len = key->n; + + end = phrase.start; + + log_debug("start=%zu, end=%zu\n", start, end); + for (size_t j = start; j < end; j++) { + log_debug("Adding token %zu\n", j); + token_t token = tokens[j]; + if (is_punctuation(token.type)) { + last_was_punctuation = true; + continue; + } + + if (token.type != 
WHITESPACE) { + if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) ) { + log_debug("Adding space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); + + bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + string_tree_finalize_token(tree); + last_added_was_whitespace = false; + } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 ) { + log_debug("Adding pre-phrase whitespace\n"); + last_added_was_whitespace = true; + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } else { + continue; + } + + last_was_punctuation = false; + } + + size_t added_expansions = 0; + token_t token; + + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(phrase, options.address_components); + + if (expansion_valid_components) { + key->n = namespace_len; + for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { + token = tokens[j]; + if (token.type != WHITESPACE) { + char_array_cat_len(key, str + token.offset, token.len); + last_added_was_whitespace = false; + } else if (!last_added_was_whitespace) { + char_array_cat(key, " "); + last_added_was_whitespace = true; + } + } + + char *key_str = char_array_get_string(key); + log_debug("key_str=%s\n", key_str); + address_expansion_array *expansions = value->expansions; + + if (expansions != NULL) { + bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool added_pre_phrase_space = false; + bool current_phrase_have_ignorable = delete_phrases && 
address_phrase_is_ignorable_for_components(phrase, options.address_components); + bool current_phrase_have_edge_ignorable = false; + + bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components); + bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase); + bool current_phrase_have_possible_root = delete_phrases && address_phrase_is_possible_root_for_components(phrase, options.address_components); + + log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier); + + bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase); + + /* + Edge phrase handling. This is primarily for handling pre-directionals/post-directionals + in English and other languages. + */ + bool skip_edge_phrase = false; + bool other_phrase_is_ignorable = false; + + if (delete_phrases) { + phrase_language_t other_phrase_lang; + phrase_t other_phrase; + + log_debug("i = %zu, phrase.start = %u\n", i, phrase.start); + if (i == 0 && phrase.start == 0 && phrase.start + phrase.len < num_tokens) { + current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components); + // Delete "E" in "E 125th St" + if (current_phrase_have_edge_ignorable) { + log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len); + skip_edge_phrase = true; + } + + if (!skip_edge_phrase || !have_non_phrase_tokens) { + for (size_t other_i = i + 1; other_i < phrases->n; other_i++) { + other_phrase_lang = phrases->a[other_i]; + other_phrase = other_phrase_lang.phrase; + log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len); + log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language); + if (other_phrase.start >= phrase.start + phrase.len && 
string_equals(other_phrase_lang.language, phrase_lang.language)) { + if (other_phrase.start + other_phrase.len == num_tokens) { + skip_edge_phrase = false; + if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) { + // don't delete the "E" in "E St" + log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n"); + + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); + } else { + log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n"); + // delete "Avenue" in "Avenue E" + other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase)); + skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + + } + } else { + // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. 
"E St SE", the first token is probably a legit token instead of a pre-directional + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !((address_phrase_has_canonical_interpretation(other_phrase) || address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components)) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + log_debug("phrase is possible root. skip_edge_phrase = %d\n", skip_edge_phrase); + } + break; + } + } + } + } else if (phrases->n > 1 && i == phrases->n - 1 && phrase.start + phrase.len == num_tokens && phrase.start > 0) { + current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components); + if (current_phrase_have_edge_ignorable) { + log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len); + skip_edge_phrase = true; + } + + log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens); + if (!skip_edge_phrase || !have_non_phrase_tokens) { + for (ssize_t other_j = i - 1; other_j >= 0; other_j--) { + other_phrase_lang = phrases->a[other_j]; + other_phrase = other_phrase_lang.phrase; + log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len); + log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language); + if (other_phrase.start + other_phrase.len <= phrase.start && string_equals(other_phrase_lang.language, phrase_lang.language)) { + if (other_phrase.start == 0) { + //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components); + skip_edge_phrase = false; + if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && 
current_phrase_have_canonical)) { + // don't delete the "E" in "Avenue E" + log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n"); + + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))) && string_tree_num_tokens(tree) > 0; + } else { + log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n"); + // delete "St" in "E St" + other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase)); + skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + + //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); + } + } + break; + } + } + } + } + } + + if (phrase.start == prev_phrase.start && phrase.len == prev_phrase.len && skipped_last_edge_phrase) { + skip_edge_phrase = true; + } + + for (size_t j = 0; j < expansions->n; j++) { + if (skip_edge_phrase) { + skipped_last_edge_phrase = true; + log_debug("skip edge phrase\n"); + continue; + } else { + skipped_last_edge_phrase = false; + } + + address_expansion_t expansion = expansions->a[j]; + + bool current_phrase_ignorable = false; + bool current_phrase_expandable = expand_phrases && expansion.canonical_index != NULL_CANONICAL_INDEX; + + bool is_ambiguous = address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION); + + if (delete_phrases) 
{ + bool is_ignorable = address_expansion_is_ignorable_for_components(expansion, options.address_components); + bool is_canonical = expansion.canonical_index == NULL_CANONICAL_INDEX; + + log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d, current_phrase_have_possible_root=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable, current_phrase_have_possible_root); + + current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; + + if (!is_canonical) { + char *canon = address_dictionary_get_canonical(expansion.canonical_index); + log_debug("canonical = %s\n", canon); + } + + // Edge phrase calculations from above + if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) { + log_debug("current_phrase_have_edge_ignorable\n"); + log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); + current_phrase_ignorable = skip_edge_phrase; + // Don't delete "PH" in "PH 1" for unit expansions + } else if (is_ignorable && current_phrase_have_specifier) { + log_debug("current_phrase_have_specifier\n"); + current_phrase_ignorable = false; + // Delete "Avenue" in "5th Avenue" + } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) { + log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n"); + current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0; + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S" + } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) { + log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n"); + 
current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0; + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases || have_possible_root)) { + log_debug("have_non_phrase_tokens = %d, have_canonical_phrases = %d\n", have_non_phrase_tokens, have_canonical_phrases); + current_phrase_ignorable = (is_ignorable && !(have_possible_root && !current_phrase_have_possible_root)) || (current_phrase_have_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); + + log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens\n"); + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + } else { + log_debug("none of the above\n"); + } + + if (!current_phrase_ignorable && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !added_pre_phrase_space) { + log_debug("Adding space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + last_added_was_whitespace = true; + added_pre_phrase_space = true; + } + + } + + if (current_phrase_ignorable) { + continue; + } + + if (delete_phrases) { + current_phrase_expandable = !current_phrase_ignorable; + } + + log_debug("current_phrase_expandable = %d\n", current_phrase_expandable); + + log_debug("expansion.canonical_index = %d\n", expansion.canonical_index); + + if (expansion.canonical_index != NULL_CANONICAL_INDEX && current_phrase_expandable) { + log_debug("expansion.canonical_index != NULL_CANONICAL_INDEX, delete_phrases = %d, phrase_option = %d\n", delete_phrases, phrase_option); + char *canonical = address_dictionary_get_canonical(expansion.canonical_index); + char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); + + canonical = canonical_normalized != NULL ? 
canonical_normalized : canonical; + + if (phrase.start + phrase.len < num_tokens - 1) { + token_t next_token = tokens[phrase.start + phrase.len]; + if (!is_numeric_token(next_token.type)) { + log_debug("non-canonical phrase, adding canonical string: %s\n", canonical); + string_tree_add_string(tree, canonical); + last_added_was_whitespace = false; + } else { + log_debug("adding canonical with cstring_array methods: %s\n", canonical); + uint32_t start_index = cstring_array_start_token(tree->strings); + cstring_array_append_string(tree->strings, canonical); + cstring_array_append_string(tree->strings, " "); + last_added_was_whitespace = true; + cstring_array_terminate(tree->strings); + } + } else { + log_debug("adding canonical: %s\n", canonical); + string_tree_add_string(tree, canonical); + last_added_was_whitespace = false; + } + + if (canonical_normalized != NULL) { + free(canonical_normalized); + } + } else if (expansion.canonical_index == NULL_CANONICAL_INDEX || !current_phrase_expandable) { + log_debug("canonical phrase, adding canonical string\n"); + + uint32_t start_index = cstring_array_start_token(tree->strings); + for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) { + token = tokens[k]; + if (token.type != WHITESPACE) { + cstring_array_append_string_len(tree->strings, str + token.offset, token.len); + last_added_was_whitespace = false; + } else { + log_debug("space\n"); + cstring_array_append_string(tree->strings, " "); + last_added_was_whitespace = true; + } + } + cstring_array_terminate(tree->strings); + } else { + continue; + } + + added_expansions++; + } + + } + } + + log_debug("expansion_valid_components == %d\n", expansion_valid_components); + + if (added_expansions == 0 && (!delete_phrases || !expansion_valid_components)) { + if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { + log_debug("Adding space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + last_added_was_whitespace = true; + } 
+ + uint32_t start_index = cstring_array_start_token(tree->strings); + + for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { + token = tokens[j]; + + if (token.type != WHITESPACE) { + log_debug("Adding canonical token, %.*s\n", (int)token.len, str + token.offset); + cstring_array_append_string_len(tree->strings, str + token.offset, token.len); + last_added_was_whitespace = false; + } else if (!last_added_was_whitespace) { + log_debug("Adding space\n"); + cstring_array_append_string(tree->strings, " "); + last_added_was_whitespace = true; + } + + } + + cstring_array_terminate(tree->strings); + + } + + if (!delete_phrases || !expansion_valid_components || added_expansions > 0) { + log_debug("i=%zu\n", i); + bool end_of_phrase = false; + if (i < phrases->n - 1) { + phrase_t next_phrase = phrases->a[i + 1].phrase; + end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len); + } else { + end_of_phrase = true; + } + + log_debug("end_of_phrase=%d\n", end_of_phrase); + if (end_of_phrase) { + log_debug("finalize at i=%zu\n", i); + string_tree_finalize_token(tree); + } + } + + start = phrase.start + phrase.len; + prev_phrase = phrase; + + } + + char_array_destroy(key); + + end = (int)num_tokens; + + if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1 && !last_added_was_whitespace) { + token_t next_token = tokens[phrase.start + phrase.len]; + if (next_token.type != WHITESPACE && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !is_ideographic(next_token.type)) { + log_debug("space after phrase\n"); + string_tree_add_string(tree, " "); + last_added_was_whitespace = true; + string_tree_finalize_token(tree); + } + } + + + for (size_t j = start; j < end; j++) { + log_debug("On token %zu\n", j); + token_t token = tokens[j]; + if (is_punctuation(token.type)) { + log_debug("last_was_punctuation\n"); + last_was_punctuation = true; + continue; + } + + if (token.type != WHITESPACE) { + if (j > 0 && 
last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { + log_debug("Adding another space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); + + bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + last_added_was_whitespace = false; + } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { + log_debug("Adding space IV\n"); + string_tree_add_string(tree, " "); + last_added_was_whitespace = true; + } else { + log_debug("Skipping token %zu\n", j); + continue; + } + + last_was_punctuation = false; + string_tree_finalize_token(tree); + + } + + } else { + log_debug("phrases NULL\n"); + for (size_t j = 0; j < num_tokens; j++) { + log_debug("On token %zu\n", j); + token_t token = tokens[j]; + if (is_punctuation(token.type)) { + log_debug("punctuation, skipping\n"); + last_was_punctuation = true; + continue; + } + + if (token.type != WHITESPACE) { + if (last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { + log_debug("Adding space V\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + + bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + last_added_was_whitespace = false; + } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { + log_debug("Adding space VI\n"); + string_tree_add_string(tree, " "); + last_added_was_whitespace = true; + } else { + continue; + } + + last_was_punctuation = false; + string_tree_finalize_token(tree); + } + } + + if (phrases != NULL) { + phrase_language_array_destroy(phrases); + } + + token_array_destroy(token_array); + + return tree; +} + +inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { + size_t 
len_ordinal_suffix = valid_ordinal_suffix_len(str, token, prev_token, lang); + + if (len_ordinal_suffix > 0) { + cstring_array *strings = tree->strings; + // Add the original form first. When this function returns true, + // add_normalized_strings_token won't be called a second time. + add_normalized_strings_token(strings, str, token, options); + token_t normalized_token = token; + normalized_token.len = token.len - len_ordinal_suffix; + add_normalized_strings_token(strings, str, normalized_token, options); + return true; + } + + return false; +} + +inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { + cstring_array *strings = tree->strings; + + token_t prev_token = (token_t){0, 0, 0}; + + for (size_t i = 0; i < tokens->n; i++) { + token_t token = tokens->a[i]; + bool have_phrase = false; + bool have_ordinal = false; + + if (is_special_token(token.type)) { + string_tree_add_string_len(tree, str + token.offset, token.len); + string_tree_finalize_token(tree); + continue; + } + + for (size_t j = 0; j < options.num_languages; j++) { + char *lang = options.languages[j]; + if (expand_affixes(tree, str, lang, token, options)) { + have_phrase = true; + break; + } + + if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) { + have_ordinal = true; + break; + } + } + + if (!have_phrase && !have_ordinal) { + add_normalized_strings_token(strings, str, token, options); + } + + string_tree_finalize_token(tree); + prev_token = token; + } + +} + + +void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { + size_t len = strlen(str); + token_array *tokens = tokenize_keep_whitespace(str); + string_tree_t *token_tree = string_tree_new_size(len); + + add_normalized_strings_tokenized(token_tree, str, tokens, options); + + string_tree_iterator_t 
*tokenized_iter = string_tree_iterator_new(token_tree); + + string_tree_iterator_t *iter; + + char_array *temp_string = char_array_new_size(len); + + char *token; + + char *lang; + + kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); + + bool excessive_perms_outer = tokenized_iter->remaining >= EXCESSIVE_PERMUTATIONS; + + if (!excessive_perms_outer) { + kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); + } + + log_debug("tokenized_iter->remaining=%d\n", tokenized_iter->remaining); + + for (; !string_tree_iterator_done(tokenized_iter); string_tree_iterator_next(tokenized_iter)) { + char_array_clear(temp_string); + + string_tree_iterator_foreach_token(tokenized_iter, token, { + if (token == NULL) { + continue; + } + char_array_append(temp_string, token); + }) + char_array_terminate(temp_string); + + char *tokenized_str = char_array_get_string(temp_string); + + string_tree_t *alternatives; + + int ret; + log_debug("Adding alternatives for single normalization\n"); + alternatives = add_string_alternatives_phrase_option(tokenized_str, options, phrase_option); + + log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives)); + + if (alternatives == NULL) { + log_debug("alternatives = NULL\n"); + continue; + } + + iter = string_tree_iterator_new(alternatives); + log_debug("iter->num_tokens=%d\n", iter->num_tokens); + log_debug("iter->remaining=%d\n", iter->remaining); + + bool excessive_perms_inner = iter->remaining >= EXCESSIVE_PERMUTATIONS; + + if (!excessive_perms_inner && !excessive_perms_outer) { + for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { + char_array_clear(temp_string); + string_tree_iterator_foreach_token(iter, token, { + log_debug("token=%s\n", token); + char_array_append(temp_string, token); + }) + char_array_terminate(temp_string); + + token = char_array_get_string(temp_string); + + size_t token_len = strlen(token); + + if (token_len 
== 0) continue; + + size_t left_spaces = string_left_spaces_len(token, token_len); + size_t right_spaces = string_right_spaces_len(token, token_len); + + if (left_spaces + right_spaces == token_len) { + continue; + } + + char *dupe_token = strndup(token + left_spaces, token_len - left_spaces - right_spaces); + + log_debug("full string=%s\n", token); + khiter_t k = kh_get(str_set, unique_strings, dupe_token); + + if (k == kh_end(unique_strings)) { + log_debug("doing postprocessing\n"); + add_postprocessed_string(strings, dupe_token, options); + k = kh_put(str_set, unique_strings, dupe_token, &ret); + } else { + free(dupe_token); + } + + log_debug("iter->remaining = %d\n", iter->remaining); + + } + } else { + cstring_array_add_string(strings, tokenized_str); + } + + string_tree_iterator_destroy(iter); + string_tree_destroy(alternatives); + + if (excessive_perms_outer) { + break; + } + } + + string_tree_iterator_destroy(tokenized_iter); + string_tree_destroy(token_tree); + + token_array_destroy(tokens); + + char_array_destroy(temp_string); +} + + + +cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { + options.address_components |= LIBPOSTAL_ADDRESS_ANY; + + uint64_t normalize_string_options = get_normalize_string_options(options); + + size_t len = strlen(input); + + language_classifier_response_t *lang_response = NULL; + + if (options.num_languages == 0) { + lang_response = classify_languages(input); + if (lang_response != NULL) { + options.num_languages = lang_response->num_languages; + options.languages = lang_response->languages; + } + } + + string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages); + + cstring_array *strings = cstring_array_new_size(len * 2); + char_array *temp_string = char_array_new_size(len); + + khash_t(str_set) *unique_strings = kh_init(str_set); + + char *token; + + 
log_debug("string_tree_num_tokens(tree) = %d\n", string_tree_num_tokens(tree)); + + if (string_tree_num_strings(tree) == 1) { + char *normalized = string_tree_get_alternative(tree, 0, 0); + expand_alternative_phrase_option(strings, unique_strings, normalized, options, phrase_option); + + } else { + log_debug("Adding alternatives for multiple normalizations\n"); + string_tree_iterator_t *iter = string_tree_iterator_new(tree); + + for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { + char *segment; + char_array_clear(temp_string); + bool is_first = true; + + string_tree_iterator_foreach_token(iter, segment, { + if (!is_first) { + char_array_append(temp_string, " "); + } + char_array_append(temp_string, segment); + is_first = false; + }) + char_array_terminate(temp_string); + token = char_array_get_string(temp_string); + log_debug("current permutation = %s\n", token); + expand_alternative_phrase_option(strings, unique_strings, token, options, phrase_option); + } + + string_tree_iterator_destroy(iter); + } + + char *key_str = NULL; + for (size_t i = kh_begin(unique_strings); i != kh_end(unique_strings); ++i) { + if (!kh_exist(unique_strings, i)) continue; + key_str = (char *)kh_key(unique_strings, i); + free(key_str); + } + + kh_destroy(str_set, unique_strings); + + if (lang_response != NULL) { + language_classifier_response_destroy(lang_response); + } + + char_array_destroy(temp_string); + string_tree_destroy(tree); + + *n = cstring_array_num_strings(strings); + + return strings; + +} + +cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_phrase_option(input, options, n, EXPAND_PHRASES); +} + +cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_phrase_option(input, options, n, DELETE_PHRASES); +} + + + +void expansion_array_destroy(char **expansions, size_t n) { + for (size_t i = 0; i < n; i++) { + 
free(expansions[i]); + } + free(expansions); +} + diff --git a/src/expand.h b/src/expand.h new file mode 100644 index 00000000..70980daa --- /dev/null +++ b/src/expand.h @@ -0,0 +1,64 @@ +#ifndef EXPAND_H +#define EXPAND_H + +#include +#include + +#include "libpostal.h" + +#include "address_dictionary.h" +#include "collections.h" +#include "klib/khash.h" +#include "klib/ksort.h" +#include "trie_search.h" + +typedef struct phrase_language { + char *language; + phrase_t phrase; +} phrase_language_t; + +VECTOR_INIT(phrase_language_array, phrase_language_t) + +#define ks_lt_phrase_language(a, b) ((a).phrase.start < (b).phrase.start || ((a).phrase.start == (b).phrase.start && (a).phrase.len > (b).phrase.len)) + +KSORT_INIT(phrase_language_array, phrase_language_t, ks_lt_phrase_language) + +uint64_t get_normalize_token_options(libpostal_normalize_options_t options); +uint64_t get_normalize_string_options(libpostal_normalize_options_t options); + +void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options); +void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options); + +address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options); + +void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options); +bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period); + +bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); +bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); +bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options); 
+ +bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options); + +void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options); + + +bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components); + + +typedef enum { + EXPAND_PHRASES, + KEEP_PHRASES, + DELETE_PHRASES +} expansion_phrase_option_t; + +cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n); +cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); +cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); +void expansion_array_destroy(char **expansions, size_t n); + +#endif diff --git a/src/gazetteer_data.c b/src/gazetteer_data.c index 0c23759a..444c225e 100644 --- a/src/gazetteer_data.c +++ b/src/gazetteer_data.c @@ -25,7 +25,7 @@ gazetteer_t gazetteer_config[] = { {DICTIONARY_NAMED_ORGANIZATION, LIBPOSTAL_ADDRESS_NAME}, {DICTIONARY_NAMED_PERSON, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_NO_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER}, - {DICTIONARY_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE}, + {DICTIONARY_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | 
LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE}, {DICTIONARY_PERSONAL_SUFFIX, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_PERSONAL_TITLE, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_PLACE_NAME, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, diff --git a/src/jaccard.c b/src/jaccard.c new file mode 100644 index 00000000..87e27b8b --- /dev/null +++ b/src/jaccard.c @@ -0,0 +1,69 @@ +#include "jaccard.h" + +double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) { + if (s1 == NULL || s2 == NULL) return 0.0; + + size_t set_intersection = 0; + size_t set_union = 0; + + khiter_t k; + const char *key; + + kh_foreach_key(s1, key, { + k = kh_get(str_set, s2, key); + if (k != kh_end(s2)) { + set_intersection++; + } else { + set_union++; + } + }); + + // set_union contains all the keys that were in s1 but not s2 + // so just add all the keys in s2 to complete the union + set_union += kh_size(s2); + + return (double)set_intersection / set_union; +} + + +double jaccard_similarity_string_arrays(size_t num_strings1, char **strings1, size_t num_strings2, char **strings2) { + if (strings1 == NULL || strings2 == NULL || num_strings1 == 0 || num_strings2 == 0) return 0.0; + + khash_t(str_set) *string_set1 = kh_init(str_set); + if (string_set1 == NULL) return 0.0; + + kh_resize(str_set, string_set1, num_strings1); + int ret = 0; + + khiter_t k; + + for (size_t i = 0; i < num_strings1; i++) { + char *str1 = strings1[i]; + k = kh_put(str_set, string_set1, str1, &ret); + if (ret < 0) { + kh_destroy(str_set, string_set1); + return 0.0; + } + } + + khash_t(str_set) *string_set2 = kh_init(str_set); + if (string_set2 == NULL) { + kh_destroy(str_set, string_set1); + return 0.0; + } + kh_resize(str_set, string_set2, num_strings2); + for (size_t i = 0; i < num_strings2; i++) { + char *str2 = strings2[i]; + k = kh_put(str_set, string_set2, str2, &ret); + if (ret < 0) { + kh_destroy(str_set, string_set1); + kh_destroy(str_set, 
string_set2); + return 0.0; + } + } + + double sim = jaccard_similarity(string_set1, string_set2); + kh_destroy(str_set, string_set1); + kh_destroy(str_set, string_set2); + return sim; +} diff --git a/src/jaccard.h b/src/jaccard.h new file mode 100644 index 00000000..9f93266d --- /dev/null +++ b/src/jaccard.h @@ -0,0 +1,12 @@ +#ifndef JACCARD_H +#define JACCARD_H + +#include +#include + +#include "collections.h" + +double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2); +double jaccard_similarity_string_arrays(size_t num_strings1, char **strings1, size_t num_strings2, char **strings2); + +#endif \ No newline at end of file diff --git a/src/libpostal.c b/src/libpostal.c index 152cf77b..288f42c9 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -8,30 +8,16 @@ #include "address_dictionary.h" #include "address_parser.h" -#include "collections.h" -#include "constants.h" +#include "dedupe.h" +#include "expand.h" + #include "language_classifier.h" -#include "numex.h" +#include "near_dupe.h" #include "normalize.h" +#include "place.h" #include "scanner.h" #include "string_utils.h" #include "token_types.h" -#include "transliterate.h" - -typedef struct phrase_language { - char *language; - phrase_t phrase; -} phrase_language_t; - -VECTOR_INIT(phrase_language_array, phrase_language_t) - -#define ks_lt_phrase_language(a, b) ((a).phrase.start < (b).phrase.start || ((a).phrase.start == (b).phrase.start && (a).phrase.len > (b).phrase.len)) - -KSORT_INIT(phrase_language_array, phrase_language_t, ks_lt_phrase_language) - -#define DEFAULT_KEY_LEN 32 - -#define EXCESSIVE_PERMUTATIONS 100 static libpostal_normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = { .languages = NULL, @@ -61,973 +47,162 @@ libpostal_normalize_options_t libpostal_get_default_options(void) { return LIBPOSTAL_DEFAULT_OPTIONS; } -static inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) { - uint64_t normalize_token_options = 0; - - normalize_token_options |= 
options.delete_final_periods ? NORMALIZE_TOKEN_DELETE_FINAL_PERIOD : 0; - normalize_token_options |= options.delete_acronym_periods ? NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS : 0; - normalize_token_options |= options.drop_english_possessives ? NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES : 0; - normalize_token_options |= options.delete_apostrophes ? NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE : 0; - - return normalize_token_options; -} - -static inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) { - uint64_t normalize_string_options = 0; - normalize_string_options |= options.transliterate ? NORMALIZE_STRING_TRANSLITERATE : 0; - normalize_string_options |= options.latin_ascii ? NORMALIZE_STRING_LATIN_ASCII : 0; - normalize_string_options |= options.decompose ? NORMALIZE_STRING_DECOMPOSE : 0; - normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0; - normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0; - normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0; - normalize_string_options |= options.expand_numex ? 
NORMALIZE_STRING_REPLACE_NUMEX : 0; - - return normalize_string_options; -} - -static void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { - - uint64_t normalize_token_options = get_normalize_token_options(options); - - if (token.type != WHITESPACE ) { - - bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len); - - if (!contains_hyphen || token.type == HYPHEN) { - log_debug("str = %s, token = {%zu, %zu, %u}\n", str, token.offset, token.len, token.type); - normalize_token(strings, str, token, normalize_token_options); - } else if (is_word_token(token.type)) { - normalize_token(strings, str, token, normalize_token_options); - - if (options.replace_word_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS; - } - - if (options.delete_word_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; - } - - } else if (is_numeric_token(token.type)) { - normalize_token(strings, str, token, normalize_token_options); - - if (options.replace_numeric_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS; - } - - if (options.delete_numeric_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; - } - } - - if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) { - normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; - normalize_token(strings, str, 
token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; - } - - } else { - cstring_array_add_string(strings, " "); - } -} - -static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options) { - char_array *key = NULL; - - log_debug("input=%s\n", str); - token_array *tokens = tokenize_keep_whitespace(str); - - if (tokens == NULL) { - return NULL; - } - - size_t len = strlen(str); - - log_debug("tokenized, num tokens=%zu\n", tokens->n); - - bool last_was_punctuation = false; - - phrase_language_array *phrases = NULL; - phrase_array *lang_phrases = NULL; - - - for (size_t i = 0; i < options.num_languages; i++) { - char *lang = options.languages[i]; - log_debug("lang=%s\n", lang); - - lang_phrases = search_address_dictionaries_tokens(str, tokens, lang); - - if (lang_phrases == NULL) { - log_debug("lang_phrases NULL\n"); - continue; - } - - log_debug("lang_phrases->n = %zu\n", lang_phrases->n); - - phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); - - for (size_t j = 0; j < lang_phrases->n; j++) { - phrase_t p = lang_phrases->a[j]; - log_debug("lang=%s, (%d, %d)\n", lang, p.start, p.len); - phrase_language_array_push(phrases, (phrase_language_t){lang, p}); - } - - phrase_array_destroy(lang_phrases); - } - - - lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES); - if (lang_phrases != NULL) { - phrases = phrases != NULL ? 
phrases : phrase_language_array_new_size(lang_phrases->n); - - for (size_t j = 0; j < lang_phrases->n; j++) { - phrase_t p = lang_phrases->a[j]; - phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p}); - } - phrase_array_destroy(lang_phrases); - - } - - string_tree_t *tree = string_tree_new_size(len); - - bool last_added_was_whitespace = false; - - uint64_t normalize_string_options = get_normalize_string_options(options); - - if (phrases != NULL) { - log_debug("phrases not NULL, n=%zu\n", phrases->n); - ks_introsort(phrase_language_array, phrases->n, phrases->a); - - phrase_language_t phrase_lang; - - size_t start = 0; - size_t end = 0; - - phrase_t phrase = NULL_PHRASE; - phrase_t prev_phrase = NULL_PHRASE; - - key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN); - - for (size_t i = 0; i < phrases->n; i++) { - phrase_lang = phrases->a[i]; - - phrase = phrase_lang.phrase; - - log_debug("phrase.start=%d, phrase.len=%d, lang=%s, prev_phrase.start=%d, prev_phrase.len=%d\n", phrase.start, phrase.len, phrase_lang.language, prev_phrase.start, prev_phrase.len); - - if ((phrase.start > prev_phrase.start && phrase.start < prev_phrase.start + prev_phrase.len) || (phrase.start == prev_phrase.start && i > 0 && phrase.len < prev_phrase.len)) { - log_debug("continuing\n"); - continue; - } - - char_array_clear(key); - - char_array_cat(key, phrase_lang.language); - char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); - - size_t namespace_len = key->n; - - end = phrase.start; - - log_debug("start=%zu, end=%zu\n", start, end); - for (size_t j = start; j < end; j++) { - log_debug("Adding token %zu\n", j); - token_t token = tokens->a[j]; - if (is_punctuation(token.type)) { - last_was_punctuation = true; - continue; - } - - if (token.type != WHITESPACE) { - if (phrase.start > 0 && last_was_punctuation && !last_added_was_whitespace) { - string_tree_add_string(tree, " "); - string_tree_finalize_token(tree); - } - log_debug("Adding previous token, %.*s\n", 
(int)token.len, str + token.offset); - - string_tree_add_string_len(tree, str + token.offset, token.len); - last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { - log_debug("Adding pre-phrase whitespace\n"); - last_added_was_whitespace = true; - string_tree_add_string(tree, " "); - } else { - continue; - } - - last_was_punctuation = false; - string_tree_finalize_token(tree); - } - - if (phrase.start > 0 && start < end) { - token_t prev_token = tokens->a[phrase.start - 1]; - log_debug("last_added_was_whitespace=%d\n", last_added_was_whitespace); - if (!last_added_was_whitespace && phrase.start - 1 > 0 && (!is_ideographic(prev_token.type) || last_was_punctuation)) { - log_debug("Adding space III\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - string_tree_finalize_token(tree); - } - } - - uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - - token_t token; - - size_t added_expansions = 0; - if ((value->components & options.address_components) > 0) { - key->n = namespace_len; - for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { - token = tokens->a[j]; - if (token.type != WHITESPACE) { - char_array_cat_len(key, str + token.offset, token.len); - last_added_was_whitespace = false; - } else { - char_array_cat(key, " "); - last_added_was_whitespace = true; - } - } - - char *key_str = char_array_get_string(key); - log_debug("key_str=%s\n", key_str); - address_expansion_array *expansions = value->expansions; - - if (expansions != NULL) { - for (size_t j = 0; j < expansions->n; j++) { - address_expansion_t expansion = expansions->a[j]; - - if ((expansion.address_components & options.address_components) == 0 && !address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) { - continue; - } - - if (expansion.canonical_index != NULL_CANONICAL_INDEX) { - char *canonical = 
address_dictionary_get_canonical(expansion.canonical_index); - char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); - - canonical = canonical_normalized != NULL ? canonical_normalized : canonical; - - - if (phrase.start + phrase.len < tokens->n - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len]; - if (!is_numeric_token(next_token.type)) { - log_debug("non-canonical phrase, adding canonical string\n"); - string_tree_add_string(tree, canonical); - last_added_was_whitespace = false; - } else { - log_debug("adding canonical with cstring_array methods\n"); - uint32_t start_index = cstring_array_start_token(tree->strings); - cstring_array_append_string(tree->strings, canonical); - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - cstring_array_terminate(tree->strings); - } - } else { - string_tree_add_string(tree, canonical); - last_added_was_whitespace = false; - - } - - if (canonical_normalized != NULL) { - free(canonical_normalized); - } - } else { - log_debug("canonical phrase, adding canonical string\n"); - - uint32_t start_index = cstring_array_start_token(tree->strings); - for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) { - token = tokens->a[k]; - if (token.type != WHITESPACE) { - cstring_array_append_string_len(tree->strings, str + token.offset, token.len); - last_added_was_whitespace = false; - } else { - log_debug("space\n"); - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - } - } - cstring_array_terminate(tree->strings); - } - - added_expansions++; - } - - - } - } - - if (added_expansions == 0) { - uint32_t start_index = cstring_array_start_token(tree->strings); - for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { - token = tokens->a[j]; - - if (token.type != WHITESPACE) { - log_debug("Adding canonical token, %.*s\n", (int)token.len, str + token.offset); - 
cstring_array_append_string_len(tree->strings, str + token.offset, token.len); - last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { - log_debug("Adding space\n"); - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - } - - } - - if (phrase.start + phrase.len < tokens->n - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len + 1]; - if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - } - } - - cstring_array_terminate(tree->strings); - - } - - log_debug("i=%zu\n", i); - bool end_of_phrase = false; - if (i < phrases->n - 1) { - phrase_t next_phrase = phrases->a[i + 1].phrase; - end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len); - } else { - end_of_phrase = true; - } - - log_debug("end_of_phrase=%d\n", end_of_phrase); - if (end_of_phrase) { - log_debug("finalize at i=%zu\n", i); - string_tree_finalize_token(tree); - } - - start = phrase.start + phrase.len; - prev_phrase = phrase; - - } - - char_array_destroy(key); - - end = (int)tokens->n; - - if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len]; - if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { - log_debug("space after phrase\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - string_tree_finalize_token(tree); - } - } - - - for (size_t j = start; j < end; j++) { - log_debug("On token %zu\n", j); - token_t token = tokens->a[j]; - if (is_punctuation(token.type)) { - log_debug("last_was_punctuation\n"); - last_was_punctuation = true; - continue; - } - - if (token.type != WHITESPACE) { - if (j > 0 && last_was_punctuation && !last_added_was_whitespace) { - log_debug("Adding another space\n"); - 
string_tree_add_string(tree, " "); - string_tree_finalize_token(tree); - } - log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); - - string_tree_add_string_len(tree, str + token.offset, token.len); - last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { - log_debug("Adding space IV\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - } else { - log_debug("Skipping token %zu\n", j); - continue; - } - - last_was_punctuation = false; - string_tree_finalize_token(tree); - - } - - - } else { - - for (size_t j = 0; j < tokens->n; j++) { - log_debug("On token %zu\n", j); - token_t token = tokens->a[j]; - if (is_punctuation(token.type)) { - log_debug("punctuation, skipping\n"); - last_was_punctuation = true; - continue; - } - - if (token.type != WHITESPACE) { - if (last_was_punctuation && !last_added_was_whitespace) { - log_debug("Adding space V\n"); - string_tree_add_string(tree, " "); - string_tree_finalize_token(tree); - } - - string_tree_add_string_len(tree, str + token.offset, token.len); - last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { - log_debug("Adding space VI\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - } else { - continue; - } - - last_was_punctuation = false; - string_tree_finalize_token(tree); - } - } - - if (phrases != NULL) { - phrase_language_array_destroy(phrases); - } - - token_array_destroy(tokens); - - return tree; -} - -static void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) { - cstring_array_add_string(strings, str); - - if (options.roman_numerals) { - char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE); - if (numex_replaced != NULL) { - cstring_array_add_string(strings, numex_replaced); - free(numex_replaced); - } - - } - -} - - - -static address_expansion_array *get_affix_expansions(phrase_t phrase, 
libpostal_normalize_options_t options) { - uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - if (value != NULL && value->components & options.address_components) { - return value->expansions; - } - - return NULL; -} - -static inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) { - if (expansion.canonical_index != NULL_CANONICAL_INDEX) { - char *canonical = address_dictionary_get_canonical(expansion.canonical_index); - uint64_t normalize_string_options = get_normalize_string_options(options); - char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); - canonical = canonical_normalized != NULL ? canonical_normalized : canonical; - - char_array_cat(key, canonical); - if (canonical_normalized != NULL) { - free(canonical_normalized); - } - } else { - char_array_cat_len(key, str + token.offset + phrase.start, phrase.len); - } -} - -static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options) { - cstring_array *strings = tree->strings; - - bool have_suffix = suffix.len > 0 && suffix.len < token.len; - bool have_prefix = prefix.len > 0 && prefix.len < token.len; - - if (!have_suffix && !have_prefix) { - return false; - } - - address_expansion_array *prefix_expansions = NULL; - address_expansion_array *suffix_expansions = NULL; - - address_expansion_t prefix_expansion; - address_expansion_t suffix_expansion; - - char *expansion; - - size_t num_strings = 0; - char *root_word = NULL; - size_t root_len; - token_t root_token; - cstring_array *root_strings = NULL; - int add_space = 0; - int spaces = 0; - - size_t prefix_start, prefix_end, root_end, suffix_start; - - if (have_prefix) { - prefix_expansions = get_affix_expansions(prefix, 
options); - if (prefix_expansions == NULL) have_prefix = false; - } - - if (have_suffix) { - suffix_expansions = get_affix_expansions(suffix, options); - if (suffix_expansions == NULL) have_suffix = false; - } - - if (!have_suffix && !have_prefix) { - return false; - } - - char_array *key = char_array_new_size(token.len); - - if (have_prefix && have_suffix) { - for (size_t i = 0; i < prefix_expansions->n; i++) { - prefix_expansion = prefix_expansions->a[i]; - char_array_clear(key); - - cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); - prefix_start = key->n - 1; - - add_space = (int)prefix_expansion.separable; - if (prefix.len + suffix.len < token.len && !prefix_expansion.separable) { - add_space = suffix_expansion.separable; - } - - for (spaces = 0; spaces <= add_space; spaces++) { - key->n = prefix_start; - if (spaces) { - char_array_cat(key, " "); - } - - prefix_end = key->n; - - if (prefix.len + suffix.len < token.len) { - root_len = token.len - suffix.len - prefix.len; - root_token = (token_t){token.offset + prefix.len, root_len, token.type}; - root_strings = cstring_array_new_size(root_len); - add_normalized_strings_token(root_strings, str, root_token, options); - num_strings = cstring_array_num_strings(root_strings); - - for (size_t j = 0; j < num_strings; j++) { - key->n = prefix_end; - root_word = cstring_array_get_string(root_strings, j); - char_array_cat(key, root_word); - root_end = key->n - 1; - - for (size_t k = 0; k < suffix_expansions->n; k++) { - key->n = root_end; - suffix_expansion = suffix_expansions->a[k]; - - int add_suffix_space = suffix_expansion.separable; - - suffix_start = key->n; - for (int suffix_spaces = 0; suffix_spaces <= add_suffix_space; suffix_spaces++) { - key->n = suffix_start; - if (suffix_spaces) { - char_array_cat(key, " "); - } - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(strings, expansion); - - } - 
- - } - } - - cstring_array_destroy(root_strings); - root_strings = NULL; - - } else { - for (size_t j = 0; j < suffix_expansions->n; j++) { - key->n = prefix_end; - suffix_expansion = suffix_expansions->a[j]; - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - } - } - - } - } else if (have_suffix) { - log_debug("suffix.start=%" PRId32 "\n", suffix.start); - root_len = suffix.start; - root_token = (token_t){token.offset, root_len, token.type}; - log_debug("root_len=%zu\n", root_len); - log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type); - - root_strings = cstring_array_new_size(root_len + 1); - add_normalized_strings_token(root_strings, str, root_token, options); - num_strings = cstring_array_num_strings(root_strings); - - log_debug("num_strings = %zu\n", num_strings); - - for (size_t j = 0; j < num_strings; j++) { - char_array_clear(key); - root_word = cstring_array_get_string(root_strings, j); - log_debug("root_word=%s\n", root_word); - char_array_cat(key, root_word); - root_end = key->n - 1; - - for (size_t k = 0; k < suffix_expansions->n; k++) { - key->n = root_end; - suffix_expansion = suffix_expansions->a[k]; - - add_space = suffix_expansion.separable && suffix.len < token.len; - suffix_start = key->n; - - for (int spaces = 0; spaces <= add_space; spaces++) { - key->n = suffix_start; - if (spaces) { - char_array_cat(key, " "); - } - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - } - } - } else if (have_prefix) { - if (prefix.len <= token.len) { - root_len = token.len - prefix.len; - root_token = (token_t){token.offset + prefix.len, root_len, token.type}; - root_strings = cstring_array_new_size(root_len); - add_normalized_strings_token(root_strings, str, 
root_token, options); - num_strings = cstring_array_num_strings(root_strings); - - } else { - root_strings = cstring_array_new_size(token.len); - add_normalized_strings_token(root_strings, str, token, options); - num_strings = cstring_array_num_strings(root_strings); - - for (size_t k = 0; k < num_strings; k++) { - root_word = cstring_array_get_string(root_strings, k); - cstring_array_add_string(tree->strings, root_word); - } - - char_array_destroy(key); - cstring_array_destroy(root_strings); - return false; - - } - - for (size_t j = 0; j < prefix_expansions->n; j++) { - char_array_clear(key); - prefix_expansion = prefix_expansions->a[j]; - - cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); - prefix_end = key->n - 1; - - add_space = prefix_expansion.separable && prefix.len < token.len; - for (int spaces = 0; spaces <= add_space; spaces++) { - key->n = prefix_end; - if (spaces) { - char_array_cat(key, " "); - } - for (size_t k = 0; k < num_strings; k++) { - root_word = cstring_array_get_string(root_strings, k); - char_array_cat(key, root_word); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - - } - } - } - - char_array_destroy(key); - - if (root_strings != NULL) { - cstring_array_destroy(root_strings); - } - - return true; - -} - -static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { - phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang); - - phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang); - - if ((suffix.len == 0 && prefix.len == 0)) return false; - - return add_affix_expansions(tree, str, lang, token, prefix, suffix, options); -} - -static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { - size_t token_digit_len 
= possible_ordinal_digit_len(str + token.offset, token.len); - size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); - - bool ret = false; - - if (len_ordinal_suffix == 0 || token_digit_len == 0 || token_digit_len + len_ordinal_suffix < token.len) { - return false; - } else if (len_ordinal_suffix == token.len && i > 0 && prev_token.len > 0) { - size_t prev_token_digit_len = possible_ordinal_digit_len(str + prev_token.offset, prev_token.len); - ret = prev_token_digit_len == prev_token.len; - } else { - ret = true; - } - - cstring_array *strings = tree->strings; - // Add the original form first. When this function returns true, - // add_normalized_strings_token won't be called a second time. - add_normalized_strings_token(strings, str, token, options); - - char_array *key = char_array_new_size(token.len - len_ordinal_suffix + 1); - char_array_cat_len(key, str + token.offset, token.len - len_ordinal_suffix); - char *expansion = char_array_get_string(key); - cstring_array_add_string(strings, expansion); - char_array_destroy(key); - return ret; -} - -static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { - cstring_array *strings = tree->strings; - - token_t prev_token = (token_t){0, 0, 0}; - - for (size_t i = 0; i < tokens->n; i++) { - token_t token = tokens->a[i]; - bool have_phrase = false; - bool have_ordinal = false; - - if (is_special_token(token.type)) { - string_tree_add_string_len(tree, str + token.offset, token.len); - string_tree_finalize_token(tree); - continue; - } - - for (size_t j = 0; j < options.num_languages; j++) { - char *lang = options.languages[j]; - if (expand_affixes(tree, str, lang, token, options)) { - have_phrase = true; - break; - } - - if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) { - have_ordinal = true; - break; - } - } - - if (!have_phrase && !have_ordinal) { - 
add_normalized_strings_token(strings, str, token, options); - } - - string_tree_finalize_token(tree); - prev_token = token; - } - -} - - -static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options) { - size_t len = strlen(str); - token_array *tokens = tokenize_keep_whitespace(str); - string_tree_t *token_tree = string_tree_new_size(len); - - add_normalized_strings_tokenized(token_tree, str, tokens, options); - - string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree); - - string_tree_iterator_t *iter; - - char_array *temp_string = char_array_new_size(len); - - char *token; - - char *lang; - - kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); - - bool excessive_perms_outer = tokenized_iter->remaining >= EXCESSIVE_PERMUTATIONS; - - if (!excessive_perms_outer) { - kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); - } - - log_debug("tokenized_iter->remaining=%d\n", tokenized_iter->remaining); - - for (; !string_tree_iterator_done(tokenized_iter); string_tree_iterator_next(tokenized_iter)) { - char_array_clear(temp_string); - - string_tree_iterator_foreach_token(tokenized_iter, token, { - if (token == NULL) { - continue; - } - char_array_append(temp_string, token); - }) - char_array_terminate(temp_string); - - char *tokenized_str = char_array_get_string(temp_string); - - string_tree_t *alternatives; - - int ret; - log_debug("Adding alternatives for single normalization\n"); - alternatives = add_string_alternatives(tokenized_str, options); - - log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives)); - - if (alternatives == NULL) { - log_debug("alternatives = NULL\n"); - continue; - } - - iter = string_tree_iterator_new(alternatives); - log_debug("iter->num_tokens=%d\n", iter->num_tokens); - log_debug("iter->remaining=%d\n", iter->remaining); - - bool excessive_perms_inner 
= iter->remaining >= EXCESSIVE_PERMUTATIONS; - - if (!excessive_perms_inner && !excessive_perms_outer) { - for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { - char_array_clear(temp_string); - string_tree_iterator_foreach_token(iter, token, { - log_debug("token=%s\n", token); - char_array_append(temp_string, token); - }) - char_array_terminate(temp_string); - - token = char_array_get_string(temp_string); - log_debug("full string=%s\n", token); - khiter_t k = kh_get(str_set, unique_strings, token); - - if (k == kh_end(unique_strings)) { - log_debug("doing postprocessing\n"); - add_postprocessed_string(strings, token, options); - k = kh_put(str_set, unique_strings, strdup(token), &ret); - } - - log_debug("iter->remaining = %d\n", iter->remaining); - - } - } else { - cstring_array_add_string(strings, tokenized_str); - } - - string_tree_iterator_destroy(iter); - string_tree_destroy(alternatives); - - if (excessive_perms_outer) { - break; - } - } - - string_tree_iterator_destroy(tokenized_iter); - string_tree_destroy(token_tree); - - token_array_destroy(tokens); - - char_array_destroy(temp_string); -} - char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { - options.address_components |= LIBPOSTAL_ADDRESS_ANY; - - uint64_t normalize_string_options = get_normalize_string_options(options); - - size_t len = strlen(input); - - language_classifier_response_t *lang_response = NULL; - - if (options.num_languages == 0) { - lang_response = classify_languages(input); - if (lang_response != NULL) { - options.num_languages = lang_response->num_languages; - options.languages = lang_response->languages; - } - } - - string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages); - - cstring_array *strings = cstring_array_new_size(len * 2); - char_array *temp_string = char_array_new_size(len); - - khash_t(str_set) *unique_strings = kh_init(str_set); - - char 
*token; - - log_debug("string_tree_num_tokens(tree) = %d\n", string_tree_num_tokens(tree)); - - if (string_tree_num_strings(tree) == 1) { - char *normalized = string_tree_get_alternative(tree, 0, 0); - expand_alternative(strings, unique_strings, normalized, options); - - } else { - log_debug("Adding alternatives for multiple normalizations\n"); - string_tree_iterator_t *iter = string_tree_iterator_new(tree); - - for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { - char *segment; - char_array_clear(temp_string); - bool is_first = true; - - string_tree_iterator_foreach_token(iter, segment, { - if (!is_first) { - char_array_append(temp_string, " "); - } - char_array_append(temp_string, segment); - is_first = false; - }) - char_array_terminate(temp_string); - token = char_array_get_string(temp_string); - log_debug("current permutation = %s\n", token); - expand_alternative(strings, unique_strings, token, options); - } - - string_tree_iterator_destroy(iter); - } - - char *key_str = NULL; - for (size_t i = kh_begin(unique_strings); i != kh_end(unique_strings); ++i) { - if (!kh_exist(unique_strings, i)) continue; - key_str = (char *)kh_key(unique_strings, i); - free(key_str); - } - - kh_destroy(str_set, unique_strings); - - if (lang_response != NULL) { - language_classifier_response_destroy(lang_response); - } - - char_array_destroy(temp_string); - string_tree_destroy(tree); - - *n = cstring_array_num_strings(strings); - + cstring_array *strings = expand_address(input, options, n); + if (strings == NULL) return NULL; return cstring_array_to_strings(strings); +} +char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { + cstring_array *strings = expand_address_root(input, options, n); + if (strings == NULL) return NULL; + return cstring_array_to_strings(strings); } void libpostal_expansion_array_destroy(char **expansions, size_t n) { - for (size_t i = 0; i < n; i++) { - free(expansions[i]); - } - 
free(expansions); + expansion_array_destroy(expansions, n); } +#define DEFAULT_NEAR_DUPE_GEOHASH_PRECISION 6 + +static libpostal_near_dupe_hash_options_t LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS = { + .with_name = true, + .with_address = true, + .with_unit = false, + .with_city_or_equivalent = true, + .with_small_containing_boundaries = true, + .with_postal_code = true, + .with_latlon = false, + .latitude = 0.0, + .longitude = 0.0, + .geohash_precision = DEFAULT_NEAR_DUPE_GEOHASH_PRECISION, + .name_and_address_keys = true, + .name_only_keys = false, + .address_only_keys = false +}; + +libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void) { + return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS; +} + +char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) { + cstring_array *strings = near_dupe_hashes(num_components, labels, values, options); + if (strings == NULL) { + *num_hashes = 0; + return NULL; + } + *num_hashes = cstring_array_num_strings(strings); + return cstring_array_to_strings(strings); +} + + +char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes) { + cstring_array *strings = near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages); + if (strings == NULL) { + *num_hashes = 0; + return NULL; + } + *num_hashes = cstring_array_num_strings(strings); + return cstring_array_to_strings(strings); +} + + +char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) { + language_classifier_response_t *lang_response = place_languages(num_components, labels, values); + if (lang_response == NULL) { + *num_languages = 0; + return NULL; + } + + char **languages = lang_response->languages; + lang_response->languages = 
NULL; + *num_languages = lang_response->num_languages; + lang_response->num_languages = 0; + + language_classifier_response_destroy(lang_response); + return languages; +} + +static libpostal_duplicate_options_t LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS = { + .num_languages = 0, + .languages = NULL +}; + +libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void) { + return LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS; +} + +libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages) { + libpostal_duplicate_options_t options = LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS; + options.num_languages = num_languages; + options.languages = languages; + return options; +} + +libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_name_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_street_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_house_number_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_po_box_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_unit_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_floor_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + 
return is_postal_code_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { + return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options); +} + +#define DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD 0.7 +#define DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD 0.9 + +static libpostal_fuzzy_duplicate_options_t DEFAULT_FUZZY_DUPLICATE_OPTIONS = { + .num_languages = 0, + .languages = NULL, + .needs_review_threshold = DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD, + .likely_dupe_threshold = DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD +}; + + +libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void) { + return DEFAULT_FUZZY_DUPLICATE_OPTIONS; +} + +libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages) { + libpostal_fuzzy_duplicate_options_t options = DEFAULT_FUZZY_DUPLICATE_OPTIONS; + options.num_languages = num_languages; + options.languages = languages; + return options; +} + + +libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { + return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); +} + +libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { + return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); +} + + void 
libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) { if (self == NULL) return; @@ -1066,7 +241,6 @@ libpostal_address_parser_response_t *libpostal_parse_address(char *address, libp if (parsed == NULL) { log_error("Parser returned NULL\n"); - libpostal_address_parser_response_destroy(parsed); return NULL; } @@ -1141,6 +315,76 @@ bool libpostal_setup_language_classifier_datadir(char *datadir) { return true; } + +libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n) { + token_array *tokens = NULL; + if (!whitespace) { + tokens = tokenize(input); + } else { + tokens = tokenize_keep_whitespace(input); + } + + if (tokens == NULL) { + return NULL; + } + + libpostal_token_t *a = tokens->a; + *n = tokens->n; + free(tokens); + return a; +} + +char *libpostal_normalize_string(char *str, uint64_t options) { + if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) { + return normalize_string_latin(str, strlen(str), options); + } else { + return normalize_string_utf8(str, options); + } +} + +libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) { + if (input == NULL) { + return NULL; + } + char *normalized = libpostal_normalize_string(input, string_options); + if (normalized == NULL) { + return NULL; + } + + token_array *tokens = NULL; + if (!whitespace) { + tokens = tokenize(normalized); + } else { + tokens = tokenize_keep_whitespace(normalized); + } + + if (tokens == NULL || tokens->a == NULL) { + free(normalized); + return NULL; + } + + size_t num_tokens = tokens->n; + token_t *token_array = tokens->a; + char_array *normalized_token = char_array_new_size(strlen(normalized)); + + libpostal_normalized_token_t *result = malloc(sizeof(libpostal_normalized_token_t) * num_tokens); + + for (size_t i = 0; i < num_tokens; i++) { + token_t token = token_array[i]; + char_array_clear(normalized_token); + 
add_normalized_token(normalized_token, normalized, token, token_options); + char *token_str = strdup(char_array_get_string(normalized_token)); + result[i] = (libpostal_normalized_token_t){token_str, token}; + } + + free(normalized); + token_array_destroy(tokens); + char_array_destroy(normalized_token); + + *n = num_tokens; + return result; +} + bool libpostal_setup_language_classifier(void) { return libpostal_setup_language_classifier_datadir(NULL); } diff --git a/src/libpostal.h b/src/libpostal.h index ce428e62..5f253566 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -24,6 +24,67 @@ extern "C" { #define LIBPOSTAL_MAX_LANGUAGE_LEN 4 +// Doing these as #defines so we can duplicate the values exactly in Python + + +typedef enum { + LIBPOSTAL_TOKEN_TYPE_END = 0, // Null byte + // Word types + LIBPOSTAL_TOKEN_TYPE_WORD = 1, // Any letter-only word (includes all unicode letters) + LIBPOSTAL_TOKEN_TYPE_ABBREVIATION = 2, // Loose abbreviations (roughly anything containing a "." as we don't care about sentences in addresses) + LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_CHAR = 3, // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character + LIBPOSTAL_TOKEN_TYPE_HANGUL_SYLLABLE = 4, // Hangul syllable sequences which contain more than one codepoint + LIBPOSTAL_TOKEN_TYPE_ACRONYM = 5, // Specifically things like U.N. 
where we may delete internal periods + + LIBPOSTAL_TOKEN_TYPE_PHRASE = 10, // Not part of the first stage tokenizer, but may be used after phrase parsing + + // Special tokens + LIBPOSTAL_TOKEN_TYPE_EMAIL = 20, // Make sure emails are tokenized altogether + LIBPOSTAL_TOKEN_TYPE_URL = 21, // Make sure urls are tokenized altogether + LIBPOSTAL_TOKEN_TYPE_US_PHONE = 22, // US phone number (with or without country code) + LIBPOSTAL_TOKEN_TYPE_INTL_PHONE = 23, // A non-US phone number (must have country code) + + // Numbers and numeric types + LIBPOSTAL_TOKEN_TYPE_NUMERIC = 50, // Any sequence containing a digit + LIBPOSTAL_TOKEN_TYPE_ORDINAL = 51, // 1st, 2nd, 1er, 1 etc. + LIBPOSTAL_TOKEN_TYPE_ROMAN_NUMERAL = 52, // II, III, VI, etc. + LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_NUMBER = 53, // All numeric ideographic characters, includes e.g. Han numbers and chars like "²" + + // Punctuation types, may separate a phrase + LIBPOSTAL_TOKEN_TYPE_PERIOD = 100, + LIBPOSTAL_TOKEN_TYPE_EXCLAMATION = 101, + LIBPOSTAL_TOKEN_TYPE_QUESTION_MARK = 102, + LIBPOSTAL_TOKEN_TYPE_COMMA = 103, + LIBPOSTAL_TOKEN_TYPE_COLON = 104, + LIBPOSTAL_TOKEN_TYPE_SEMICOLON = 105, + LIBPOSTAL_TOKEN_TYPE_PLUS = 106, + LIBPOSTAL_TOKEN_TYPE_AMPERSAND = 107, + LIBPOSTAL_TOKEN_TYPE_AT_SIGN = 108, + LIBPOSTAL_TOKEN_TYPE_POUND = 109, + LIBPOSTAL_TOKEN_TYPE_ELLIPSIS = 110, + LIBPOSTAL_TOKEN_TYPE_DASH = 111, + LIBPOSTAL_TOKEN_TYPE_BREAKING_DASH = 112, + LIBPOSTAL_TOKEN_TYPE_HYPHEN = 113, + LIBPOSTAL_TOKEN_TYPE_PUNCT_OPEN = 114, + LIBPOSTAL_TOKEN_TYPE_PUNCT_CLOSE = 115, + LIBPOSTAL_TOKEN_TYPE_DOUBLE_QUOTE = 119, + LIBPOSTAL_TOKEN_TYPE_SINGLE_QUOTE = 120, + LIBPOSTAL_TOKEN_TYPE_OPEN_QUOTE = 121, + LIBPOSTAL_TOKEN_TYPE_CLOSE_QUOTE = 122, + LIBPOSTAL_TOKEN_TYPE_SLASH = 124, + LIBPOSTAL_TOKEN_TYPE_BACKSLASH = 125, + LIBPOSTAL_TOKEN_TYPE_GREATER_THAN = 126, + LIBPOSTAL_TOKEN_TYPE_LESS_THAN = 127, + + // Non-letters and whitespace + LIBPOSTAL_TOKEN_TYPE_OTHER = 200, + LIBPOSTAL_TOKEN_TYPE_WHITESPACE = 300, + 
LIBPOSTAL_TOKEN_TYPE_NEWLINE = 301, + + LIBPOSTAL_TOKEN_TYPE_INVALID_CHAR = 500 +} libpostal_token_type_t; + + /* Address dictionaries */ @@ -77,6 +138,7 @@ typedef struct libpostal_normalize_options { LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void); LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n); +LIBPOSTAL_EXPORT char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n); @@ -90,6 +152,8 @@ typedef struct libpostal_address_parser_response { char **labels; } libpostal_address_parser_response_t; +typedef libpostal_address_parser_response_t libpostal_parsed_address_components_t; + typedef struct libpostal_address_parser_options { char *language; char *country; @@ -103,6 +167,87 @@ LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(ch LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features); + +/* +Deduping +*/ + + +// Near-dupe hashing methods + +typedef struct libpostal_near_dupe_hash_options { + bool with_name; + bool with_address; + bool with_unit; + bool with_city_or_equivalent; + bool with_small_containing_boundaries; + bool with_postal_code; + bool with_latlon; + double latitude; + double longitude; + uint32_t geohash_precision; + bool name_and_address_keys; + bool name_only_keys; + bool address_only_keys; +} libpostal_near_dupe_hash_options_t; + + +LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void); +LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes); +LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t 
num_languages, char **languages, size_t *num_hashes); + +// Dupe language classification + +LIBPOSTAL_EXPORT char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages); + +// Pairwise dupe methods + +typedef enum { + LIBPOSTAL_NULL_DUPLICATE_STATUS = -1, + LIBPOSTAL_NON_DUPLICATE = 0, + LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW = 3, + LIBPOSTAL_LIKELY_DUPLICATE = 6, + LIBPOSTAL_EXACT_DUPLICATE = 9, +} libpostal_duplicate_status_t; + +typedef struct libpostal_duplicate_options { + size_t num_languages; + char **languages; +} libpostal_duplicate_options_t; + + +LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void); +LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages); + +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char 
**values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); + +// Pairwise fuzzy dupe methods, return status & similarity + +typedef struct libpostal_fuzzy_duplicate_options { + size_t num_languages; + char **languages; + double needs_review_threshold; + double likely_dupe_threshold; +} libpostal_fuzzy_duplicate_options_t; + +typedef struct libpostal_fuzzy_duplicate_status { + libpostal_duplicate_status_t status; + double similarity; +} libpostal_fuzzy_duplicate_status_t; + +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages); + +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); + // Setup/teardown methods LIBPOSTAL_EXPORT bool libpostal_setup(void); @@ -117,6 +262,58 @@ LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier(void); LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier_datadir(char *datadir); LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(void); +/* Tokenization and token normalization APIs */ + +typedef struct libpostal_token { + size_t offset; + size_t len; + uint16_t type; +} libpostal_token_t; + +LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n); + +// Normalize string options +#define LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII 1 << 0 +#define LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE 1 << 
1 +#define LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS 1 << 2 +#define LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE 1 << 3 +#define LIBPOSTAL_NORMALIZE_STRING_LOWERCASE 1 << 4 +#define LIBPOSTAL_NORMALIZE_STRING_TRIM 1 << 5 +#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 +#define LIBPOSTAL_NORMALIZE_STRING_COMPOSE 1 << 7 +#define LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8 +#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX 1 << 9 + +// Normalize token options +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD 1 << 2 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3 +#define LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5 +#define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS 1 << 8 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS 1 << 9 + +#define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE) + +#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS (LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) + +#define LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS (LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) + +#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | 
LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) + +LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options); + + +typedef struct libpostal_normalized_token { + char *str; + libpostal_token_t token; +} libpostal_normalized_token_t; + +libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n); + #ifdef __cplusplus } #endif diff --git a/src/near_dupe.c b/src/near_dupe.c new file mode 100644 index 00000000..f2c08280 --- /dev/null +++ b/src/near_dupe.c @@ -0,0 +1,970 @@ +#include + +#include "log/log.h" + +#include "near_dupe.h" +#include "double_metaphone.h" +#include "expand.h" +#include "features.h" +#include "float_utils.h" +#include "place.h" +#include "scanner.h" +#include "string_utils.h" +#include "tokens.h" +#include "unicode_scripts.h" +#include "unicode_script_types.h" + +#include "geohash/geohash.h" + +#define MAX_GEOHASH_PRECISION 12 + +#define NAME_KEY_PREFIX "n" +#define ADDRESS_KEY_PREFIX "a" +#define UNIT_KEY_PREFIX "u" +#define PO_BOX_KEY_PREFIX "p" +#define HOUSE_NUMBER_KEY_PREFIX "h" +#define STREET_KEY_PREFIX "s" + +#define GEOHASH_KEY_PREFIX "gh" +#define POSTCODE_KEY_PREFIX "pc" +#define CITY_KEY_PREFIX "ct" +#define CONTAINING_BOUNDARY_PREFIX "cb" + +#define NAME_ADDRESS_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_ADDRESS_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_ADDRESS_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_ADDRESS_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_ADDRESS_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_ADDRESS_CITY_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX CITY_KEY_PREFIX +#define 
NAME_ADDRESS_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_ADDRESS_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_HOUSE_NUMBER_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_HOUSE_NUMBER_CITY_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_HOUSE_NUMBER_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_HOUSE_NUMBER_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_STREET_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_STREET_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_STREET_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_STREET_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_STREET_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_STREET_CITY_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_STREET_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_STREET_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define 
NAME_PO_BOX_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_PO_BOX_CITY_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_PO_BOX_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_PO_BOX_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_CITY_KEY_PREFIX NAME_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define ADDRESS_UNIT_GEOHASH_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define ADDRESS_UNIT_CITY_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define ADDRESS_UNIT_CONTAINING_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define ADDRESS_UNIT_POSTCODE_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define ADDRESS_GEOHASH_KEY_PREFIX ADDRESS_KEY_PREFIX GEOHASH_KEY_PREFIX +#define ADDRESS_CITY_KEY_PREFIX ADDRESS_KEY_PREFIX CITY_KEY_PREFIX +#define ADDRESS_CONTAINING_KEY_PREFIX ADDRESS_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define ADDRESS_POSTCODE_KEY_PREFIX ADDRESS_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX 
UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define HOUSE_NUMBER_GEOHASH_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX GEOHASH_KEY_PREFIX +#define HOUSE_NUMBER_CITY_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CITY_KEY_PREFIX +#define HOUSE_NUMBER_CONTAINING_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define HOUSE_NUMBER_POSTCODE_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define STREET_GEOHASH_KEY_PREFIX STREET_KEY_PREFIX GEOHASH_KEY_PREFIX +#define STREET_CITY_KEY_PREFIX STREET_KEY_PREFIX CITY_KEY_PREFIX +#define STREET_CONTAINING_KEY_PREFIX STREET_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define STREET_POSTCODE_KEY_PREFIX STREET_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define STREET_UNIT_GEOHASH_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define STREET_UNIT_CITY_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define STREET_UNIT_CONTAINING_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define STREET_UNIT_POSTCODE_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define PO_BOX_GEOHASH_KEY_PREFIX PO_BOX_KEY_PREFIX GEOHASH_KEY_PREFIX +#define PO_BOX_CITY_KEY_PREFIX PO_BOX_KEY_PREFIX CITY_KEY_PREFIX +#define PO_BOX_CONTAINING_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define PO_BOX_POSTCODE_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX + +cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, size_t *n) { + size_t num_expansions = 0; + cstring_array *expansions = expand_address(input, options, &num_expansions); + + size_t num_root_expansions = 0; + cstring_array *root_expansions = expand_address_root(input, options, &num_root_expansions); + + if (num_root_expansions == 0) { + cstring_array_destroy(root_expansions); + *n = num_expansions; + return expansions; + } else if (num_expansions == 0) { + 
cstring_array_destroy(expansions); + *n = num_root_expansions; + return root_expansions; + } else { + khash_t(str_set) *unique_strings = kh_init(str_set); + char *expansion; + khiter_t k; + int ret; + + cstring_array *all_expansions = cstring_array_new(); + + for (size_t i = 0; i < num_expansions; i++) { + expansion = cstring_array_get_string(expansions, i); + k = kh_get(str_set, unique_strings, expansion); + + if (k == kh_end(unique_strings)) { + cstring_array_add_string(all_expansions, expansion); + k = kh_put(str_set, unique_strings, expansion, &ret); + if (ret < 0) { + break; + } + } + } + + for (size_t i = 0; i < num_root_expansions; i++) { + expansion = cstring_array_get_string(root_expansions, i); + k = kh_get(str_set, unique_strings, expansion); + + if (k == kh_end(unique_strings)) { + cstring_array_add_string(all_expansions, expansion); + k = kh_put(str_set, unique_strings, expansion, &ret); + if (ret < 0) { + break; + } + } + } + + *n = cstring_array_num_strings(all_expansions); + + kh_destroy(str_set, unique_strings); + cstring_array_destroy(root_expansions); + cstring_array_destroy(expansions); + + return all_expansions; + } +} + +static inline cstring_array *expanded_component_root_with_fallback(char *input, libpostal_normalize_options_t options, size_t *n) { + cstring_array *root_expansions = expand_address_root(input, options, n); + if (*n > 0) { + return root_expansions; + } else { + cstring_array_destroy(root_expansions); + *n = 0; + return expand_address(input, options, n); + } +} + + +static cstring_array *geohash_and_neighbors(double latitude, double longitude, size_t geohash_precision) { + if (geohash_precision == 0) return NULL; + + if (geohash_precision > MAX_GEOHASH_PRECISION) geohash_precision = MAX_GEOHASH_PRECISION; + size_t geohash_len = geohash_precision + 1; + + char geohash[geohash_len]; + if (geohash_encode(latitude, longitude, geohash, geohash_len) != GEOHASH_OK) { + return NULL; + } + + size_t neighbors_size = geohash_len * 8; + 
char neighbors[neighbors_size]; + + int num_strings = 0; + + if (geohash_neighbors(geohash, neighbors, neighbors_size, &num_strings) == GEOHASH_OK && num_strings == 8) { + cstring_array *strings = cstring_array_new_size(9 * geohash_len); + cstring_array_add_string(strings, geohash); + + for (int i = 0; i < num_strings; i++) { + char *neighbor = neighbors + geohash_len * i; + cstring_array_add_string(strings, neighbor); + } + return strings; + } + + return NULL; +} + +#define MAX_NAME_TOKENS 50 + + +cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY; + size_t num_expansions = 0; + cstring_array *name_expansions = expanded_component_root_with_fallback(name, normalize_options, &num_expansions); + if (num_expansions == 0) { + cstring_array_destroy(name_expansions); + return NULL; + } + + size_t len = strlen(name); + + char_array *token_string_array = char_array_new_size(len); + cstring_array *strings = cstring_array_new_size(len); + token_array *token_array = token_array_new(); + + char_array *combined_words_no_whitespace = char_array_new(); + + bool keep_whitespace = false; + + khash_t(str_set) *unique_strings = kh_init(str_set); + khiter_t k; + int ret = 0; + + for (size_t i = 0; i < num_expansions; i++) { + char *expansion = cstring_array_get_string(name_expansions, i); + log_debug("expansion = %s\n", expansion); + tokenize_add_tokens(token_array, expansion, strlen(expansion), keep_whitespace); + size_t num_tokens = token_array->n; + token_t *tokens = token_array->a; + token_t prev_token; + char *token_str; + for (size_t j = 0; j < num_tokens; j++) { + token_t token = tokens[j]; + bool ideogram = is_ideographic(token.type); + + string_script_t token_script = get_string_script(expansion + token.offset, token.len); + bool is_latin = token_script.len == token.len && token_script.script == SCRIPT_LATIN; + + 
char_array_clear(token_string_array); + // For ideograms, since the "words" are characters, we use shingles of two characters + if (ideogram && j > 0 && is_ideographic(prev_token.type)) { + log_debug("cat ideogram\n"); + char_array_cat_len(token_string_array, expansion + prev_token.offset, prev_token.len); + } + + // For Latin script, add double metaphone of the words + if (is_latin && !is_numeric_token(token.type) && !ideogram && !is_punctuation(token.type)) { + char_array_clear(token_string_array); + char_array_cat_len(token_string_array, expansion + token.offset, token.len); + token_str = char_array_get_string(token_string_array); + + log_debug("token_str = %s\n", token_str); + + double_metaphone_codes_t *dm_codes = double_metaphone(token_str); + if (dm_codes == NULL) { + prev_token = token; + continue; + } + char *dm_primary = dm_codes->primary; + char *dm_secondary = dm_codes->secondary; + + if (!string_equals(dm_primary, "")) { + + k = kh_get(str_set, unique_strings, dm_primary); + + if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) { + log_debug("adding dm_primary = %s\n", dm_primary); + cstring_array_add_string(strings, dm_primary); + k = kh_put(str_set, unique_strings, strdup(dm_primary), &ret); + if (ret < 0) { + break; + } + } + + if (!string_equals(dm_secondary, dm_primary)) { + + k = kh_get(str_set, unique_strings, dm_secondary); + + if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) { + log_debug("adding dm_secondary = %s\n", dm_secondary); + cstring_array_add_string(strings, dm_secondary); + k = kh_put(str_set, unique_strings, strdup(dm_secondary), &ret); + if (ret < 0) { + break; + } + } + } + } + double_metaphone_codes_destroy(dm_codes); + // For non-Latin words (Arabic, Cyrllic, etc.) 
just add the word + // For ideograms, we do two-character shingles, so only add the first character if the string has one token + } else if (!ideogram || j > 0 || num_tokens == 1) { + char_array_cat_len(token_string_array, expansion + token.offset, token.len); + token_str = char_array_get_string(token_string_array); + log_debug("token_str = %s\n", token_str); + k = kh_get(str_set, unique_strings, token_str); + + if (k == kh_end(unique_strings)) { + cstring_array_add_string(strings, token_str); + k = kh_put(str_set, unique_strings, strdup(token_str), &ret); + if (ret < 0) { + break; + } + } + } + + prev_token = token; + } + + token_array_clear(token_array); + } + + char_array_destroy(token_string_array); + token_array_destroy(token_array); + char_array_destroy(combined_words_no_whitespace); + + cstring_array_destroy(name_expansions); + + const char *key; + + kh_foreach_key(unique_strings, key, { + free((char *)key); + }); + kh_destroy(str_set, unique_strings); + + return strings; +} + + +static inline void add_string_arrays_to_tree(string_tree_t *tree, size_t n, va_list args) { + for (size_t i = 0; i < n; i++) { + cstring_array *a = va_arg(args, cstring_array *); + size_t num_strings = cstring_array_num_strings(a); + if (num_strings == 0) continue; + for (size_t j = 0; j < num_strings; j++) { + char *str = cstring_array_get_string(a, j); + string_tree_add_string(tree, str); + } + string_tree_finalize_token(tree); + } + va_end(args); +} + +static inline void add_hashes_from_tree(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree) { + string_tree_iterator_t *iter = string_tree_iterator_new(tree); + if (iter->num_tokens > 0) { + log_debug("iter->num_tokens = %zu\n", iter->num_tokens); + + for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { + + cstring_array_start_token(near_dupe_hashes); + cstring_array_append_string(near_dupe_hashes, prefix); + + char *str; + string_tree_iterator_foreach_token(iter, str, { + 
cstring_array_append_string(near_dupe_hashes, "|"); + cstring_array_append_string(near_dupe_hashes, str); + //log_debug("str=%s\n", str); + }); + + cstring_array_terminate(near_dupe_hashes); + } + } + + string_tree_iterator_destroy(iter); +} + + +static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree, size_t n, ...) { + string_tree_clear(tree); + + log_debug("prefix=%s\n", prefix); + + va_list args; + va_start(args, n); + add_string_arrays_to_tree(tree, n, args); + va_end(args); + + log_debug("string_tree_num_strings(tree)=%zu\n", string_tree_num_strings(tree)); + + add_hashes_from_tree(near_dupe_hashes, prefix, tree); +} + + +cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) { + if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_postal_code) return NULL; + + place_t *place = place_from_components(num_components, labels, values); + log_debug("created place\n"); + if (place == NULL) return NULL; + + bool have_valid_geo = options.with_latlon; + + if (!have_valid_geo && options.with_postal_code && place->postal_code != NULL) { + have_valid_geo = true; + } + + if (!have_valid_geo && options.with_city_or_equivalent && (place->city != NULL || place->city_district != NULL || place->suburb != NULL || place->island != NULL)) { + have_valid_geo = true; + } + + if (!have_valid_geo && options.with_small_containing_boundaries && (place->state_district != NULL)) { + have_valid_geo = true; + } + + + if (!have_valid_geo) { + log_debug("no valid geo\n"); + place_destroy(place); + return NULL; + } + + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + + language_classifier_response_t *lang_response = NULL; + + if (num_languages == 0) { + lang_response = place_languages(num_components, labels, values); + + if (lang_response != NULL) { + 
log_debug("got %zu place languages\n", lang_response->num_languages); + normalize_options.num_languages = lang_response->num_languages; + normalize_options.languages = lang_response->languages; + } + } else { + normalize_options.num_languages = num_languages; + normalize_options.languages = languages; + } + + string_tree_t *tree = string_tree_new(); + + cstring_array *name_expansions = NULL; + size_t num_name_expansions = 0; + if (place->name != NULL && options.with_name) { + log_debug("Doing name expansions for %s\n", place->name); + name_expansions = name_word_hashes(place->name, normalize_options); + if (name_expansions != NULL) { + num_name_expansions = cstring_array_num_strings(name_expansions); + log_debug("Got %zu name expansions\n", num_name_expansions); + } + } + + + cstring_array *street_expansions = NULL; + size_t num_street_expansions = 0; + if (place->street != NULL) { + log_debug("Doing street expansions for %s\n", place->street); + normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; + street_expansions = expanded_component_combined(place->street, normalize_options, &num_street_expansions); + log_debug("Got %zu street expansions\n", num_street_expansions); + } + + cstring_array *house_number_expansions = NULL; + size_t num_house_number_expansions = 0; + if (place->house_number != NULL) { + log_debug("Doing house number expansions for %s\n", place->house_number); + normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; + house_number_expansions = expand_address_root(place->house_number, normalize_options, &num_house_number_expansions); + log_debug("Got %zu house number expansions\n", num_house_number_expansions); + } + + cstring_array *unit_expansions = NULL; + size_t num_unit_expansions = 0; + if (place->unit != NULL && options.with_unit) { + log_debug("Doing unit expansions for %s\n", place->unit); + normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | 
LIBPOSTAL_ADDRESS_ANY; + unit_expansions = expand_address_root(place->unit, normalize_options, &num_unit_expansions); + log_debug("Got %zu unit expansions\n", num_unit_expansions); + } + + cstring_array *building_expansions = NULL; + size_t num_building_expansions = 0; + if (place->building != NULL && options.with_unit) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; + building_expansions = expand_address_root(place->building, normalize_options, &num_building_expansions); + } + + cstring_array *level_expansions = NULL; + size_t num_level_expansions = 0; + if (place->level != NULL && options.with_unit) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; + level_expansions = expand_address_root(place->level, normalize_options, &num_level_expansions); + } + + cstring_array *po_box_expansions = NULL; + size_t num_po_box_expansions = 0; + if (place->po_box != NULL) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; + po_box_expansions = expand_address_root(place->po_box, normalize_options, &num_po_box_expansions); + } + + cstring_array *place_expansions = NULL; + cstring_array *containing_expansions = NULL; + + if (options.with_city_or_equivalent) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_ANY; + + if (place->city != NULL) { + size_t num_city_expansions = 0; + cstring_array *city_expansions = expand_address_root(place->city, normalize_options, &num_city_expansions); + if (place_expansions == NULL) { + place_expansions = city_expansions; + } else if (city_expansions != NULL && num_city_expansions > 0) { + cstring_array_extend(place_expansions, city_expansions); + cstring_array_destroy(city_expansions); + } + + } + + if (place->city_district != NULL) { + size_t num_city_district_expansions = 0; + cstring_array *city_district_expansions = expand_address_root(place->city_district, normalize_options, 
&num_city_district_expansions); + if (place_expansions == NULL) { + place_expansions = city_district_expansions; + } else if (city_district_expansions != NULL && num_city_district_expansions > 0) { + cstring_array_extend(place_expansions, city_district_expansions); + cstring_array_destroy(city_district_expansions); + } + } + + if (place->suburb != NULL) { + size_t num_suburb_expansions = 0; + cstring_array *suburb_expansions = expand_address_root(place->suburb, normalize_options, &num_suburb_expansions); + if (place_expansions == NULL) { + place_expansions = suburb_expansions; + } else if (suburb_expansions != NULL && num_suburb_expansions > 0) { + cstring_array_extend(place_expansions, suburb_expansions); + cstring_array_destroy(suburb_expansions); + } + } + + + if (place->island != NULL) { + size_t num_island_expansions = 0; + cstring_array *island_expansions = expand_address_root(place->island, normalize_options, &num_island_expansions); + if (place_expansions == NULL) { + place_expansions = island_expansions; + } else if (island_expansions != NULL && num_island_expansions > 0) { + cstring_array_extend(place_expansions, island_expansions); + cstring_array_destroy(island_expansions); + } + } + + if (place->state_district != NULL && options.with_small_containing_boundaries) { + size_t num_state_district_expansions = 0; + cstring_array *state_district_expansions = expand_address_root(place->state_district, normalize_options, &num_state_district_expansions); + if (containing_expansions == NULL) { + containing_expansions = state_district_expansions; + } else if (state_district_expansions != NULL && num_state_district_expansions > 0) { + cstring_array_extend(containing_expansions, state_district_expansions); + cstring_array_destroy(state_district_expansions); + } + } + } + + cstring_array *postal_code_expansions = NULL; + size_t num_postal_code_expansions = 0; + if (options.with_postal_code && place->postal_code != NULL) { + normalize_options.address_components = 
LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_ANY; + postal_code_expansions = expand_address_root(place->postal_code, normalize_options, &num_postal_code_expansions); + } + + cstring_array *geohash_expansions = NULL; + if (options.with_latlon && !(double_equals(options.latitude, 0.0) && double_equals(options.longitude, 0.0))) { + geohash_expansions = geohash_and_neighbors(options.latitude, options.longitude, options.geohash_precision); + } + + size_t num_geohash_expansions = geohash_expansions != NULL ? cstring_array_num_strings(geohash_expansions) : 0; + if (num_geohash_expansions == 0 && num_postal_code_expansions == 0 && place_expansions == NULL && containing_expansions == NULL) { + return NULL; + } + + num_name_expansions = name_expansions != NULL ? cstring_array_num_strings(name_expansions) : 0; + num_street_expansions = street_expansions != NULL ? cstring_array_num_strings(street_expansions) : 0; + num_house_number_expansions = house_number_expansions != NULL ? cstring_array_num_strings(house_number_expansions) : 0; + num_po_box_expansions = po_box_expansions != NULL ? cstring_array_num_strings(po_box_expansions) : 0; + num_unit_expansions = unit_expansions != NULL ? cstring_array_num_strings(unit_expansions) : 0; + num_building_expansions = building_expansions != NULL ? cstring_array_num_strings(building_expansions) : 0; + num_level_expansions = level_expansions != NULL ? 
cstring_array_num_strings(level_expansions) : 0; + + bool have_unit = num_unit_expansions > 0 || num_building_expansions > 0 || num_level_expansions > 0; + cstring_array *unit_or_equivalent_expansions = NULL; + if (num_unit_expansions > 0) { + unit_or_equivalent_expansions = unit_expansions; + } else if (num_building_expansions > 0) { + unit_or_equivalent_expansions = building_expansions; + } else if (num_level_expansions > 0) { + unit_or_equivalent_expansions = level_expansions; + } + + cstring_array *near_dupe_hashes = cstring_array_new(); + + if (num_name_expansions > 0) { + if (num_street_expansions > 0 && num_house_number_expansions > 0 && options.name_and_address_keys) { + // Have street, house number, and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_GEOHASH_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_CITY_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_CONTAINING_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_POSTCODE_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have street and house number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, street_expansions, 
house_number_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_CITY_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, postal_code_expansions); + } + } + // Japan, other places with no street names + } else if (num_house_number_expansions > 0 && options.name_and_address_keys) { + // House number and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // House number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, 
NAME_HOUSE_NUMBER_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_CITY_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, postal_code_expansions); + } + } + // Addresses in India, UK, Ireland, many university addresses, etc. may have house name + street with no house numbers + } else if (num_street_expansions > 0 && options.name_and_address_keys) { + // Have street, house number, and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_CITY_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have street and house number, no unit + } else { + if (geohash_expansions != NULL) { + 
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, street_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_CITY_KEY_PREFIX, tree, 3, name_expansions, street_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, street_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, street_expansions, postal_code_expansions); + } + } + // PO Box only addresses, mailing addresses + } else if (num_po_box_expansions > 0 && options.name_and_address_keys) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, geohash_expansions); + } + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_CITY_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, postal_code_expansions); + } + // Only name + } else if (options.name_only_keys) { + // Have name and unit, some university addresses + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + 
add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_CITY_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have name and geo only + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_GEOHASH_KEY_PREFIX, tree, 2, name_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_CITY_KEY_PREFIX, tree, 2, name_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_CONTAINING_KEY_PREFIX, tree, 2, name_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_POSTCODE_KEY_PREFIX, tree, 2, name_expansions, postal_code_expansions); + } + } + } + } + + if (options.address_only_keys) { + if (num_street_expansions > 0 && num_house_number_expansions > 0) { + // Have street, house number, and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_GEOHASH_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_CITY_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, 
ADDRESS_UNIT_CONTAINING_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_POSTCODE_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have street and house number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_GEOHASH_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_CITY_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_CONTAINING_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_POSTCODE_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, postal_code_expansions); + } + } + // Japan, other places with no street names + } else if (num_house_number_expansions > 0) { + // House number and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, 
containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // House number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_GEOHASH_KEY_PREFIX, tree, 2, house_number_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_CITY_KEY_PREFIX, tree, 2, house_number_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_CONTAINING_KEY_PREFIX, tree, 2, house_number_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_POSTCODE_KEY_PREFIX, tree, 2, house_number_expansions, postal_code_expansions); + } + } + // Addresses in India, UK, Ireland, many university addresses, etc. 
may have house name + street with no house numbers + } else if (num_street_expansions > 0) { + // Have street, house number, and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_GEOHASH_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_CITY_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_CONTAINING_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_POSTCODE_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have street and house number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_GEOHASH_KEY_PREFIX, tree, 2, street_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_CITY_KEY_PREFIX, tree, 2, street_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_CONTAINING_KEY_PREFIX, tree, 2, street_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_POSTCODE_KEY_PREFIX, tree, 2, street_expansions, postal_code_expansions); + } + } + // PO Box only addresses, mailing addresses + } else if (num_po_box_expansions > 0) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, PO_BOX_GEOHASH_KEY_PREFIX, tree, 2, po_box_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + 
add_string_hash_permutations(near_dupe_hashes, PO_BOX_CITY_KEY_PREFIX, tree, 2, po_box_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, PO_BOX_CONTAINING_KEY_PREFIX, tree, 2, po_box_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, PO_BOX_POSTCODE_KEY_PREFIX, tree, 2, po_box_expansions, postal_code_expansions); + } + } + + } + + if (place != NULL) { + place_destroy(place); + } + + if (tree != NULL) { + string_tree_destroy(tree); + } + + if (name_expansions != NULL) { + cstring_array_destroy(name_expansions); + } + + if (street_expansions != NULL) { + cstring_array_destroy(street_expansions); + } + + if (house_number_expansions != NULL) { + cstring_array_destroy(house_number_expansions); + } + + if (unit_expansions != NULL) { + cstring_array_destroy(unit_expansions); + } + + if (building_expansions != NULL) { + cstring_array_destroy(building_expansions); + } + + if (level_expansions != NULL) { + cstring_array_destroy(level_expansions); + } + + if (po_box_expansions != NULL) { + cstring_array_destroy(po_box_expansions); + } + + if (place_expansions != NULL) { + cstring_array_destroy(place_expansions); + } + + + if (containing_expansions != NULL) { + cstring_array_destroy(containing_expansions); + } + + if (postal_code_expansions != NULL) { + cstring_array_destroy(postal_code_expansions); + } + + if (geohash_expansions != NULL) { + cstring_array_destroy(geohash_expansions); + } + + if (lang_response != NULL) { + language_classifier_response_destroy(lang_response); + } + + return near_dupe_hashes; +} + +inline cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options) { + return near_dupe_hashes_languages(num_components, labels, values, options, 0, NULL); +} diff --git a/src/near_dupe.h b/src/near_dupe.h new file mode 100644 index 
00000000..9e3d33f8 --- /dev/null +++ b/src/near_dupe.h @@ -0,0 +1,14 @@ + +#ifndef NEAR_DUPE_H +#define NEAR_DUPE_H + +#include +#include + +#include "libpostal.h" +#include "string_utils.h" + +cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options); +cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages); + +#endif \ No newline at end of file diff --git a/src/near_dupe_test.c b/src/near_dupe_test.c new file mode 100644 index 00000000..a5fa5aa7 --- /dev/null +++ b/src/near_dupe_test.c @@ -0,0 +1,124 @@ +#include +#include + +#include "libpostal.h" +#include "string_utils.h" + +int main(int argc, char **argv) { + if (argc < 3) { + printf("Usage: ./test_near_dupe label value [...]\n"); + exit(EXIT_FAILURE); + } + + if (!libpostal_setup() || !libpostal_setup_language_classifier()) { + exit(EXIT_FAILURE); + } + + libpostal_near_dupe_hash_options_t options = libpostal_get_near_dupe_hash_default_options(); + + cstring_array *labels_array = cstring_array_new(); + cstring_array *values_array = cstring_array_new(); + cstring_array *languages_array = NULL; + + bool label = true; + bool next_is_latitude = false; + bool next_is_longitude = false; + bool next_is_geohash_precision = false; + bool have_latitude = false; + bool have_longitude = false; + bool next_is_language = false; + double longitude = 0.0; + double latitude = 0.0; + + + for (size_t i = 1; i < argc; i++) { + char *arg = argv[i]; + + if (string_equals(arg, "--with-unit")) { + options.with_unit = true; + } else if (string_equals(arg, "--latitude")) { + next_is_latitude = true; + } else if (string_equals(arg, "--longitude")) { + next_is_longitude = true; + } else if (string_equals(arg, "--geohash-precision")) { + next_is_geohash_precision = true; + } else if (string_equals(arg, "--name-only-keys")) { + 
options.name_only_keys = true; + } else if (string_equals(arg, "--address-only-keys")) { + options.address_only_keys = true; + } else if (string_equals(arg, "--language")) { + next_is_language = true; + } else if (next_is_latitude) { + sscanf(arg, "%lf", &latitude); + next_is_latitude = false; + have_latitude = true; + } else if (next_is_longitude) { + sscanf(arg, "%lf", &longitude); + next_is_longitude = false; + have_longitude = true; + } else if (next_is_geohash_precision) { + size_t geohash_precision = 0; + sscanf(arg, "%zu", &geohash_precision); + options.geohash_precision = geohash_precision; + next_is_geohash_precision = false; + } else if (next_is_language) { + if (languages_array == NULL) { + languages_array = cstring_array_new(); + } + cstring_array_add_string(languages_array, arg); + } else if (label) { + cstring_array_add_string(labels_array, arg); + label = false; + } else { + cstring_array_add_string(values_array, arg); + label = true; + } + } + + if (have_latitude && have_longitude) { + options.with_latlon = true; + options.latitude = latitude; + options.longitude = longitude; + } + + size_t num_languages = 0; + char **languages = NULL; + if (languages_array != NULL) { + num_languages = cstring_array_num_strings(languages_array); + languages = cstring_array_to_strings(languages_array); + } + + + size_t num_components = cstring_array_num_strings(labels_array); + if (num_components != cstring_array_num_strings(values_array)) { + cstring_array_destroy(labels_array); + cstring_array_destroy(values_array); + printf("Must have same number of labels and values\n"); + exit(EXIT_FAILURE); + } + + char **labels = cstring_array_to_strings(labels_array); + char **values = cstring_array_to_strings(values_array); + + size_t num_near_dupe_hashes = 0; + char **near_dupe_hashes = libpostal_near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages, &num_near_dupe_hashes); + if (near_dupe_hashes != NULL) { + for (size_t i = 0; i < 
num_near_dupe_hashes; i++) { + char *near_dupe_hash = near_dupe_hashes[i]; + printf("%s\n", near_dupe_hash); + } + + libpostal_expansion_array_destroy(near_dupe_hashes, num_near_dupe_hashes); + } + + libpostal_expansion_array_destroy(labels, num_components); + libpostal_expansion_array_destroy(values, num_components); + + if (languages != NULL) { + libpostal_expansion_array_destroy(languages, num_languages); + } + + libpostal_teardown(); + libpostal_teardown_language_classifier(); + +} diff --git a/src/normalize.c b/src/normalize.c index 802c6d9a..3e218e9d 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -401,9 +401,12 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t char *append_if_not_numeric = NULL; int32_t ch; + int32_t next_ch; ssize_t char_len; + ssize_t next_char_len; bool last_was_letter = false; + bool last_was_number = false; bool append_char = true; while (idx < len) { @@ -417,13 +420,21 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t bool is_letter = utf8_is_letter(cat); bool is_number = utf8_is_number(cat); + next_char_len = utf8proc_iterate(ptr + char_len, len, &next_ch); + int next_cat = utf8proc_category(next_ch); + bool next_is_number = utf8_is_number(next_cat); + bool next_is_letter = utf8_is_letter(next_cat); + + bool is_full_stop = ch == FULL_STOP_CODEPOINT; - if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) { + bool is_hyphen_between_letter_and_number = is_hyphen && ((next_is_number && last_was_letter) || (next_is_letter && last_was_number)); + + if (is_hyphen && options & NORMALIZE_TOKEN_REPLACE_HYPHENS && (!(last_was_number && next_is_number) || options & NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS)) { char_array_append(array, " "); append_char = false; } else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) { - append_char = false; + append_char = !is_hyphen_between_letter_and_number; } if ((is_hyphen || is_full_stop) && token.type 
== NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) { @@ -444,7 +455,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t append_char = false; } - if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) { + if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && ((last_was_letter && is_number) || (last_was_number && is_letter)) && !alpha_numeric_split) { char_array_append(array, " "); alpha_numeric_split = true; } @@ -482,7 +493,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t append_char = true; last_was_letter = is_letter; - + last_was_number = is_number; } } diff --git a/src/normalize.h b/src/normalize.h index d485f67f..9d58f78b 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -30,34 +30,38 @@ As well as normalizations for individual string tokens: #include "constants.h" #include "klib/khash.h" +#include "libpostal.h" #include "string_utils.h" #include "utf8proc/utf8proc.h" #include "unicode_scripts.h" #include "numex.h" +#include "scanner.h" #include "transliterate.h" #include "trie.h" #include "tokens.h" #include "vector.h" -#define NORMALIZE_STRING_LATIN_ASCII 1 << 0 -#define NORMALIZE_STRING_TRANSLITERATE 1 << 1 -#define NORMALIZE_STRING_STRIP_ACCENTS 1 << 2 -#define NORMALIZE_STRING_DECOMPOSE 1 << 3 -#define NORMALIZE_STRING_LOWERCASE 1 << 4 -#define NORMALIZE_STRING_TRIM 1 << 5 -#define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 -#define NORMALIZE_STRING_COMPOSE 1 << 7 -#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8 -#define NORMALIZE_STRING_REPLACE_NUMEX 1 << 9 +#define NORMALIZE_STRING_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII +#define NORMALIZE_STRING_TRANSLITERATE LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE +#define NORMALIZE_STRING_STRIP_ACCENTS LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS +#define NORMALIZE_STRING_DECOMPOSE 
LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE +#define NORMALIZE_STRING_LOWERCASE LIBPOSTAL_NORMALIZE_STRING_LOWERCASE +#define NORMALIZE_STRING_TRIM LIBPOSTAL_NORMALIZE_STRING_TRIM +#define NORMALIZE_STRING_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS +#define NORMALIZE_STRING_COMPOSE LIBPOSTAL_NORMALIZE_STRING_COMPOSE +#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII +#define NORMALIZE_STRING_REPLACE_NUMEX LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX -#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 -#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1 -#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD 1 << 2 -#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3 -#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4 -#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5 -#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6 -#define NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7 +#define NORMALIZE_TOKEN_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS +#define NORMALIZE_TOKEN_DELETE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS +#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD +#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS +#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES +#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE +#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC +#define NORMALIZE_TOKEN_REPLACE_DIGITS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS +#define NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS +#define NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS // Replace digits with capital D e.g. 
10013 => DDDDD, intended for use with lowercased strings #define DIGIT_CHAR "D" diff --git a/src/numex.c b/src/numex.c index b8a0f0e7..7f4ef630 100644 --- a/src/numex.c +++ b/src/numex.c @@ -439,7 +439,7 @@ bool numex_table_read(FILE *f) { log_debug("read num_languages = %" PRIu64 "\n", num_languages); - int i = 0; + size_t i = 0; numex_language_t *language; @@ -541,7 +541,7 @@ bool numex_table_write(FILE *f) { numex_rule_t rule; - int i = 0; + size_t i = 0; for (i = 0; i < num_rules; i++) { rule = numex_table->rules->a[i]; @@ -848,14 +848,21 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value); } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && (!whole_tokens_only || complete_token)) { log_debug("Had previous token with no context, finishing previous rule before returning\n"); - result.len = prev_result_len; number_finished = true; + complete_token = false; advance_index = false; state = start_state; prev_rule_was_number = true; rule = prev_rule = NUMEX_NULL_RULE; prev_result_len = 0; + } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && whole_tokens_only && !complete_token) { + log_debug("whole_tokens_only = %d, complete_token = %d\n", whole_tokens_only, complete_token); + rule = NUMEX_NULL_RULE; + last_was_separator = false; + prev_rule_was_number = false; + state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN; + continue; } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && !prev_rule_was_number) { log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, no context\n"); prev_rule = rule; @@ -885,7 +892,6 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { continue; } - prev_rule_was_number = prev_rule_was_number || prev_rule.rule_type != NUMEX_NULL; if (rule.rule_type != NUMEX_STOPWORD) { @@ -903,6 +909,7 @@ numex_result_array *convert_numeric_expressions(char 
*str, char *lang) { if (rule.right_context_type == NUMEX_RIGHT_CONTEXT_NONE && !whole_tokens_only) { number_finished = true; } + log_debug("rule is ordinal\n"); } @@ -941,6 +948,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { log_debug("Adding phrase, value=%" PRId64 "\n", result.value); result = NULL_NUMEX_RESULT; number_finished = false; + rule = prev_rule = NUMEX_NULL_RULE; } prev_state = state; @@ -1060,6 +1068,7 @@ size_t possible_ordinal_digit_len(char *str, size_t len) { int32_t ch; size_t digit_len = 0; + bool seen_first_digit = false; while (idx < len) { ssize_t char_len = utf8proc_iterate(ptr, len, &ch); @@ -1071,10 +1080,14 @@ size_t possible_ordinal_digit_len(char *str, size_t len) { // 0-9 only for this is_digit = ch >= 48 && ch <= 57; - if ((idx == 0 && !is_digit) || (idx > 0 && is_digit && !last_was_digit)) { + if ((seen_first_digit && is_digit && !last_was_digit)) { return 0; } + if (is_digit && !seen_first_digit) { + seen_first_digit = true; + } + if (is_digit) { digit_len += char_len; } @@ -1124,23 +1137,115 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { return 0; } + + +static inline bool is_roman_numeral_char(char c) { + return (c == 'i' || + c == 'v' || + c == 'x' || + c == 'l' || + c == 'c' || + c == 'd' || + c == 'm' || + c == 'I' || + c == 'V' || + c == 'X' || + c == 'L' || + c == 'C' || + c == 'D' || + c == 'M'); +} + +static inline bool is_likely_single_roman_numeral_char(char c) { + return (c == 'i' || + c == 'v' || + c == 'x' || + c == 'I' || + c == 'V' || + c == 'X'); +} + + +bool is_valid_roman_numeral(char *str, size_t len) { + char *copy = strndup(str, len); + if (copy == NULL) return false; + + numex_result_array *results = convert_numeric_expressions(copy, LATIN_LANGUAGE_CODE); + if (results == NULL) { + free(copy); + return false; + } + + bool ret = results->n == 1 && results->a[0].len == len; + numex_result_array_destroy(results); + free(copy); + return ret; +} + +bool 
is_likely_roman_numeral_len(char *str, size_t len) { + bool seen_roman = false; + for (size_t i = 0; i < len; i++) { + char c = *(str + i); + if (c == 0) break; + if ((len <= 2 && is_likely_single_roman_numeral_char(c)) || (len > 2 && is_roman_numeral_char(c))) { + seen_roman = true; + } else { + return false; + } + } + + return seen_roman && is_valid_roman_numeral(str, len); +} + +inline bool is_likely_roman_numeral(char *str) { + return is_likely_roman_numeral_len(str, strlen(str)); +} + char *replace_numeric_expressions(char *str, char *lang) { numex_result_array *results = convert_numeric_expressions(str, lang); if (results == NULL) return NULL; + bool is_latin = string_equals(lang, LATIN_LANGUAGE_CODE); + size_t len = strlen(str); char_array *replacement = char_array_new_size(len); size_t start = 0; size_t end = 0; - for (int i = 0; i < results->n; i++) { - numex_result_t result = results->a[i]; + bool have_valid_numex = false; + numex_result_t result = NULL_NUMEX_RESULT; + + for (size_t i = 0; i < results->n; i++) { + result = results->a[i]; if (result.len == 0) { continue; } + if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) { + continue; + } + have_valid_numex = true; + break; + } + + if (!have_valid_numex) { + numex_result_array_destroy(results); + return NULL; + } + + for (size_t i = 0; i < results->n; i++) { + result = results->a[i]; + + if (result.len == 0) { + continue; + } + + if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) { + continue; + } + end = result.start; log_debug("lang=%s, start = %zu, len = %zu, value=%" PRId64 "\n", lang, result.start, result.len, result.value); @@ -1170,3 +1275,4 @@ char *replace_numeric_expressions(char *str, char *lang) { return char_array_to_string(replacement); } + diff --git a/src/numex.h b/src/numex.h index 5b289f13..1a0d89b7 100644 --- a/src/numex.h +++ b/src/numex.h @@ -152,6 +152,9 @@ numex_result_array 
*convert_numeric_expressions(char *str, char *lang); size_t ordinal_suffix_len(char *s, size_t len, char *lang); size_t possible_ordinal_digit_len(char *str, size_t len); +bool is_likely_roman_numeral(char *str); +bool is_likely_roman_numeral_len(char *str, size_t len); + bool numex_table_write(FILE *file); bool numex_table_save(char *filename); diff --git a/src/place.c b/src/place.c new file mode 100644 index 00000000..549f1f48 --- /dev/null +++ b/src/place.c @@ -0,0 +1,181 @@ +#include "place.h" +#include "address_parser.h" + +static inline bool is_address_text_component(char *label) { + return (string_equals(label, ADDRESS_PARSER_LABEL_HOUSE) || + string_equals(label, ADDRESS_PARSER_LABEL_ROAD) || + string_equals(label, ADDRESS_PARSER_LABEL_METRO_STATION) || + string_equals(label, ADDRESS_PARSER_LABEL_SUBURB) || + string_equals(label, ADDRESS_PARSER_LABEL_CITY_DISTRICT) || + string_equals(label, ADDRESS_PARSER_LABEL_CITY) || + string_equals(label, ADDRESS_PARSER_LABEL_STATE_DISTRICT) || + string_equals(label, ADDRESS_PARSER_LABEL_ISLAND) || + string_equals(label, ADDRESS_PARSER_LABEL_STATE) || + string_equals(label, ADDRESS_PARSER_LABEL_COUNTRY_REGION) || + string_equals(label, ADDRESS_PARSER_LABEL_COUNTRY) || + string_equals(label, ADDRESS_PARSER_LABEL_WORLD_REGION) + ); +} + +language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) { + if (num_components == 0 || values == NULL || labels == NULL) return NULL; + + language_classifier_response_t *lang_response = NULL; + + char *label; + char *value; + + size_t total_size = 0; + for (size_t i = 0; i < num_components; i++) { + value = values[i]; + label = labels[i]; + if (is_address_text_component(label)) { + total_size += strlen(value); + // extra char for spaces + if (i < num_components - 1) { + total_size++; + } + } + } + + char_array *combined = char_array_new_size(total_size); + if (combined == NULL) { + return NULL; + } + + for (size_t i = 0; i < num_components; 
i++) { + value = values[i]; + label = labels[i]; + if (is_address_text_component(label)) { + char_array_cat(combined, value); + if (i < num_components - 1) { + char_array_cat(combined, " "); + } + } + } + + char *combined_input = char_array_get_string(combined); + + lang_response = classify_languages(combined_input); + + char_array_destroy(combined); + return lang_response; +} + + + +place_t *place_new(void) { + place_t *place = calloc(1, sizeof(place_t)); + return place; +} + +void place_destroy(place_t *place) { + if (place == NULL) return; + free(place); +} + + +place_t *place_from_components(size_t num_components, char **labels, char **values) { + if (num_components == 0 || labels == NULL || values == NULL) { + return NULL; + } + + place_t *place = place_new(); + if (place == NULL) return NULL; + + for (size_t i = 0; i < num_components; i++) { + char *value = values[i]; + char *label = labels[i]; + if (string_equals(label, ADDRESS_PARSER_LABEL_ROAD)) { + if (place->street == NULL) { + place->street = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_HOUSE)) { + if (place->name == NULL) { + place->name = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_HOUSE_NUMBER)) { + if (place->house_number == NULL) { + place->house_number = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_POSTAL_CODE)) { + if (place->postal_code == NULL) { + place->postal_code = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_CITY)) { + if (place->city == NULL) { + place->city = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_STATE)) { + if (place->state == NULL) { + place->state = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_COUNTRY)) { + if (place->country == NULL) { + place->country = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_SUBURB)) { + if (place->suburb == NULL) { + place->suburb = value; + } + } else if (string_equals(label, 
ADDRESS_PARSER_LABEL_CITY_DISTRICT)) { + if (place->city_district == NULL) { + place->city_district = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_STATE_DISTRICT)) { + if (place->state_district == NULL) { + place->state_district = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_COUNTRY_REGION)) { + if (place->country_region == NULL) { + place->country_region = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_ISLAND)) { + if (place->island == NULL) { + place->island = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_WORLD_REGION)) { + if (place->world_region == NULL) { + place->world_region = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_UNIT)) { + if (place->unit == NULL) { + place->unit = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_TELEPHONE)) { + if (place->telephone == NULL) { + place->telephone = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_WEBSITE)) { + if (place->website == NULL) { + place->website = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_LEVEL)) { + if (place->level == NULL) { + place->level = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_PO_BOX)) { + if (place->po_box == NULL) { + place->po_box = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_BUILDING)) { + if (place->building == NULL) { + place->building = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_STAIRCASE)) { + if (place->staircase == NULL) { + place->staircase = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_ENTRANCE)) { + if (place->entrance == NULL) { + place->entrance = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_METRO_STATION)) { + if (place->metro_station == NULL) { + place->metro_station = value; + } + } + } + + return place; +} diff --git a/src/place.h b/src/place.h new file mode 100644 index 00000000..88920582 --- /dev/null +++ 
b/src/place.h @@ -0,0 +1,43 @@ +#ifndef PLACE_H +#define PLACE_H + +#include +#include + +#include "libpostal.h" +#include "language_classifier.h" + +typedef struct place { + char *name; + char *house_number; + char *street; + char *building; + char *entrance; + char *staircase; + char *level; + char *unit; + char *po_box; + char *metro_station; + char *suburb; + char *city_district; + char *city; + char *state_district; + char *island; + char *state; + char *country_region; + char *country; + char *world_region; + char *postal_code; + char *telephone; + char *website; +} place_t; + +language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values); + +place_t *place_new(void); + +place_t *place_from_components(size_t num_components, char **labels, char **values); + +void place_destroy(place_t *place); + +#endif \ No newline at end of file diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c new file mode 100644 index 00000000..3c77be89 --- /dev/null +++ b/src/soft_tfidf.c @@ -0,0 +1,374 @@ +#include "soft_tfidf.h" +#include "address_dictionary.h" +#include "float_utils.h" +#include "string_similarity.h" +#include "string_utils.h" +#include "log/log.h" + +static soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = { + .jaro_winkler_min = 0.9, + .damerau_levenshtein_max = 1, + .damerau_levenshtein_min_length = 4, + .use_abbreviations = true +}; + + +soft_tfidf_options_t soft_tfidf_default_options(void) { + return DEFAULT_SOFT_TFIDF_OPTIONS; +} + +bool compare_canonical(address_expansion_t e1, char **tokens1, phrase_t match1, address_expansion_t e2, char **tokens2, phrase_t match2) { + bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX; + bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX; + + if (!e1_canonical && !e2_canonical) { + return e1.canonical_index == e2.canonical_index; + } else if (e1_canonical && e2_canonical) { + if (match1.len != match2.len || match1.len == 0) return false; + for (size_t i = 0; i < 
match1.len; i++) { + char *s1 = tokens1[match1.start + i]; + char *s2 = tokens2[match2.start + i]; + if (!string_equals(s1, s2)) return false; + } + return true; + } else { + char **canonical_tokens = e1_canonical ? tokens1 : tokens2; + char *other_canonical = e1_canonical ? address_dictionary_get_canonical(e2.canonical_index) : address_dictionary_get_canonical(e1.canonical_index); + phrase_t match = e1_canonical ? match1 : match2; + + size_t canonical_index = 0; + size_t canonical_len = strlen(other_canonical); + + for (size_t i = match.start; i < match.start + match.len; i++) { + char *canonical_token = canonical_tokens[i]; + size_t canonical_token_len = strlen(canonical_token); + + if (canonical_index + canonical_token_len <= canonical_len && strncmp(other_canonical + canonical_index, canonical_token, canonical_token_len) == 0) { + canonical_index += canonical_token_len; + + if (i < match.start + match.len - 1 && canonical_index < canonical_len && strncmp(other_canonical + canonical_index, " ", 1) == 0) { + canonical_index++; + } + } else { + return false; + } + } + return true; + } +} + +typedef enum { + CANONICAL_NO_MATCH = 0, + NEITHER_CANONICAL, + SECOND_CANONICAL, + FIRST_CANONICAL, + BOTH_CANONICAL +} canonical_match_t; + +bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2, phrase_t match1, phrase_t match2, canonical_match_t *response) { + address_expansion_value_t *val1 = address_dictionary_get_expansions(match1.data); + address_expansion_value_t *val2 = address_dictionary_get_expansions(match2.data); + + if (val1 == NULL || val2 == NULL) return false; + + address_expansion_array *expansions_array1 = val1->expansions; + address_expansion_array *expansions_array2 = val2->expansions; + + if (expansions_array1 == NULL || expansions_array2 == NULL) return false; + + address_expansion_t *expansions1 = expansions_array1->a; + address_expansion_t *expansions2 = expansions_array2->a; + + *response = 
CANONICAL_NO_MATCH; + + bool same_canonical = false; + for (size_t i = 0; i < expansions_array1->n; i++) { + address_expansion_t e1 = expansions1[i]; + + for (size_t j = 0; j < expansions_array2->n; j++) { + address_expansion_t e2 = expansions2[j]; + + same_canonical = compare_canonical(e1, tokens1, match1, e2, tokens2, match2); + if (same_canonical) { + bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX; + bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX; + + if (e1_canonical && e2_canonical) { + *response = BOTH_CANONICAL; + } else if (e1_canonical) { + *response = FIRST_CANONICAL; + } else if (e2_canonical) { + *response = SECOND_CANONICAL; + } else { + *response = NEITHER_CANONICAL; + } + break; + } + } + if (same_canonical) break; + } + + return same_canonical; +} + + +double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options) { + if (token_scores1 == NULL || token_scores2 == NULL) return 0.0; + + if (num_tokens1 > num_tokens2) { + double *tmp_scores = token_scores1; + token_scores1 = token_scores2; + token_scores2 = tmp_scores; + char **tmp_tokens = tokens1; + tokens1 = tokens2; + tokens2 = tmp_tokens; + + phrase_array *tmp_phrases = phrases1; + phrases1 = phrases2; + phrases2 = tmp_phrases; + + size_t tmp_num_tokens = num_tokens1; + num_tokens1 = num_tokens2; + num_tokens2 = tmp_num_tokens; + } + + size_t len1 = num_tokens1; + size_t len2 = num_tokens2; + + double total_sim = 0.0; + + uint32_array **t1_tokens_unicode = NULL; + uint32_array **t2_tokens_unicode = NULL; + + uint32_array *t1_unicode; + uint32_array *t2_unicode; + + int64_array *phrase_memberships_array1 = NULL; + int64_array *phrase_memberships_array2 = NULL; + int64_t *phrase_memberships1 = NULL; + int64_t *phrase_memberships2 = NULL; + + 
int64_array *acronym_memberships_array = NULL; + int64_t *acronym_memberships = NULL; + + t1_tokens_unicode = calloc(len1, sizeof(uint32_array *)); + if (t1_tokens_unicode == NULL) { + total_sim = -1.0; + goto return_soft_tfidf_score; + } + for (size_t i = 0; i < len1; i++) { + t1_unicode = unicode_codepoints(tokens1[i]); + if (t1_unicode == NULL) { + total_sim = -1.0; + goto return_soft_tfidf_score; + } + t1_tokens_unicode[i] = t1_unicode; + } + + t2_tokens_unicode = calloc(len2, sizeof(uint32_array *)); + if (t2_tokens_unicode == NULL) { + total_sim = -1.0; + goto return_soft_tfidf_score; + } + + for (size_t j = 0; j < len2; j++) { + t2_unicode = unicode_codepoints(tokens2[j]); + if (t2_unicode == NULL) { + total_sim = -1.0; + goto return_soft_tfidf_score; + } + t2_tokens_unicode[j] = t2_unicode; + } + + + if (phrases1 != NULL && phrases2 != NULL) { + phrase_memberships_array1 = int64_array_new(); + phrase_memberships_array2 = int64_array_new(); + token_phrase_memberships(phrases1, phrase_memberships_array1, len1); + token_phrase_memberships(phrases2, phrase_memberships_array2, len2); + + if (phrase_memberships_array1->n == len1) { + phrase_memberships1 = phrase_memberships_array1->a; + } + + if (phrase_memberships_array2->n == len2) { + phrase_memberships2 = phrase_memberships_array2->a; + } + } + + if (acronym_alignments != NULL) { + acronym_memberships_array = int64_array_new(); + token_phrase_memberships(acronym_alignments, acronym_memberships_array, len2); + if (acronym_memberships_array->n == len2) { + acronym_memberships = acronym_memberships_array->a; + } + } + + double jaro_winkler_min = options.jaro_winkler_min; + size_t damerau_levenshtein_max = options.damerau_levenshtein_max; + size_t damerau_levenshtein_min_length = options.damerau_levenshtein_min_length; + bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && len1 >= damerau_levenshtein_min_length; + + bool use_abbreviations = options.use_abbreviations; + + for (size_t i = 0; i < len1; i++) 
{ + uint32_array *t1u = t1_tokens_unicode[i]; + uint32_array *t2u; + double t1_score = token_scores1[i]; + + double max_sim = 0.0; + size_t min_dist = t1u->n; + size_t argmax_sim = 0; + size_t argmin_dist = 0; + double argmin_dist_sim = 0.0; + size_t last_abbreviation = 0; + double last_abbreviation_sim = 0.0; + bool have_abbreviation = false; + bool have_acronym_match = false; + phrase_t acronym_phrase = NULL_PHRASE; + bool have_phrase_match = false; + int64_t pm1 = phrase_memberships1 != NULL ? phrase_memberships1[i] : NULL_PHRASE_MEMBERSHIP; + phrase_t p1 = pm1 >= 0 ? phrases1->a[pm1] : NULL_PHRASE; + phrase_t argmax_phrase = NULL_PHRASE; + + canonical_match_t best_canonical_phrase_response = CANONICAL_NO_MATCH; + + double t2_score; + + for (size_t j = 0; j < len2; j++) { + t2u = t2_tokens_unicode[j]; + int64_t pm2 = phrase_memberships2 != NULL ? phrase_memberships2[j] : NULL_PHRASE_MEMBERSHIP; + phrase_t p2 = pm2 >= 0 ? phrases2->a[pm2] : NULL_PHRASE; + + canonical_match_t canonical_response = CANONICAL_NO_MATCH; + if (p1.len > 0 && p2.len > 0 && phrases_have_same_canonical(num_tokens1, tokens1, num_tokens2, tokens2, p1, p2, &canonical_response)) { + if (canonical_response > best_canonical_phrase_response) { + best_canonical_phrase_response = canonical_response; + argmax_sim = j; + argmax_phrase = p2; + max_sim = 1.0; + have_phrase_match = true; + continue; + } + } + + if (unicode_equals(t1u, t2u)) { + max_sim = 1.0; + argmax_sim = j; + break; + } + + if (acronym_memberships != NULL) { + int64_t acronym_membership = acronym_memberships[j]; + log_debug("acronym_membership = %zd\n", acronym_membership); + if (acronym_membership >= 0) { + acronym_phrase = acronym_alignments->a[acronym_membership]; + uint32_t acronym_match_index = acronym_phrase.data; + if (acronym_match_index == i) { + max_sim = 1.0; + argmax_sim = j; + have_acronym_match = true; + log_debug("have acronym match\n"); + break; + } + } + } + + double jaro_winkler = jaro_winkler_distance_unicode(t1u, 
t2u); + if (jaro_winkler > max_sim) { + max_sim = jaro_winkler; + argmax_sim = j; + } + + + if (use_damerau_levenshtein) { + size_t replace_cost = 0; + ssize_t dist = damerau_levenshtein_distance_unicode(t1u, t2u, replace_cost); + if (dist >= 0 && dist < min_dist) { + min_dist = (size_t)dist; + argmin_dist = j; + argmin_dist_sim = jaro_winkler; + } + } + + if (use_abbreviations) { + bool is_abbreviation = possible_abbreviation_unicode(t1u, t2u); + if (is_abbreviation) { + last_abbreviation = j; + last_abbreviation_sim = jaro_winkler; + have_abbreviation = true; + } + } + + } + + // Note: here edit distance, affine gap and abbreviations are only used in the thresholding process. + // Jaro-Winkler is still used to calculate similarity + + if (!have_acronym_match && !have_phrase_match) { + if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) { + log_debug("have max sim = %f\n", max_sim); + t2_score = token_scores2[argmax_sim]; + total_sim += max_sim * t1_score * t2_score; + } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) { + log_debug("levenshtein\n"); + t2_score = token_scores2[argmin_dist]; + total_sim += argmin_dist_sim * t1_score * t2_score; + } else if (use_abbreviations && have_abbreviation) { + log_debug("have abbreviation\n"); + t2_score = token_scores2[last_abbreviation]; + total_sim += last_abbreviation_sim * t1_score * t2_score; + } + } else if (have_phrase_match) { + for (size_t p = argmax_phrase.start; p < argmax_phrase.start + argmax_phrase.len; p++) { + t2_score = token_scores2[p]; + total_sim += max_sim * t1_score * t2_score; + } + } else { + for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) { + t2_score = token_scores2[p]; + total_sim += max_sim * t1_score * t2_score; + } + } + } + +return_soft_tfidf_score: + if (t1_tokens_unicode != NULL) { + for (size_t i = 0; i < len1; i++) { + t1_unicode = t1_tokens_unicode[i]; + if (t1_unicode != NULL) { + 
uint32_array_destroy(t1_unicode); + } + } + free(t1_tokens_unicode); + } + + if (t2_tokens_unicode != NULL) { + for (size_t i = 0; i < len2; i++) { + t2_unicode = t2_tokens_unicode[i]; + if (t2_unicode != NULL) { + uint32_array_destroy(t2_unicode); + } + } + free(t2_tokens_unicode); + } + + if (phrase_memberships_array1 != NULL) { + int64_array_destroy(phrase_memberships_array1); + } + + if (phrase_memberships_array2 != NULL) { + int64_array_destroy(phrase_memberships_array2); + } + + if (acronym_memberships_array != NULL) { + int64_array_destroy(acronym_memberships_array); + } + + return total_sim; +} + + +double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) { + return soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, NULL, num_tokens2, tokens2, token_scores2, NULL, NULL, options); +} \ No newline at end of file diff --git a/src/soft_tfidf.h b/src/soft_tfidf.h new file mode 100644 index 00000000..244578ba --- /dev/null +++ b/src/soft_tfidf.h @@ -0,0 +1,48 @@ +#ifndef SOFT_TFIDF_H +#define SOFT_TFIDF_H + +#include +#include "collections.h" +#include "libpostal.h" +#include "trie_search.h" + +/* +This is a variant of Soft-TFIDF as described in: + +Cohen, Ravikumar, and Fienberg. A comparison of string distance +metrics for name-matching tasks. (2003) +https://www.cs.cmu.edu/~wcohen/postscript/ijcai-ws-2003.pdf + +Soft TFIDF is a hybrid similarity function for strings, typically names, +which combines both global statistics (TF-IDF) and a local similarity +function (e.g. Jaro-Winkler, which the authors suggest performs best). + +Given two strings, s1 and s2, each token t1 in s1 is matched with its most +similar counterpart t2 in s2 according to the local distance function. 
+ +The Soft-TFIDF similarity is then the dot product of the max token +similarities and the cosine similarity of the TF-IDF vectors for all tokens +if the max similarity is >= a given threshold theta. + +This version is a modified Soft-TFIDF. Jaro-Winkler is used as the secondary +distance metric. However, the defintion of two tokens being "similar" is +defined as either: + +1. Jaro-Winkler distance >= theta +2. Damerau-Levenshtein edit distance <= max_edit_distance +3. Affine gap edit counts indicate a possible abbreviation (# matches == min(len1, len2)) +*/ + +typedef struct soft_tfidf_options { + double jaro_winkler_min; + size_t damerau_levenshtein_max; + size_t damerau_levenshtein_min_length; + bool use_abbreviations; +} soft_tfidf_options_t; + +soft_tfidf_options_t soft_tfidf_default_options(void); + +double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options); +double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options); + +#endif \ No newline at end of file diff --git a/src/string_similarity.c b/src/string_similarity.c new file mode 100644 index 00000000..6a16518d --- /dev/null +++ b/src/string_similarity.c @@ -0,0 +1,586 @@ +#include "string_similarity.h" +#include "string_utils.h" + +#include + +static affine_gap_edits_t NULL_AFFINE_GAP_EDITS = { + .num_matches = 0, + .num_mismatches = 0, + .num_transpositions = 0, + .num_gap_opens = 0, + .num_gap_extensions = 0 +}; + +typedef enum { + AFFINE_CHAR_MATCH, + AFFINE_CHAR_MISMATCH, + AFFINE_TRANSPOSITION, + AFFINE_GAP_OPEN, + AFFINE_GAP_EXTEND +} affine_gap_op; + +static inline bool space_or_equivalent(int32_t c) { + int cat = utf8proc_category(c); + return 
utf8_is_whitespace(c) || utf8_is_hyphen(c) || utf8_is_punctuation(cat); +} + +affine_gap_edits_t affine_gap_distance_unicode_costs(uint32_array *u1_array, uint32_array *u2_array, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) { + if (u1_array->n < u2_array->n) { + uint32_array *tmp_array = u1_array; + u1_array = u2_array; + u2_array = tmp_array; + } + + size_t m = u1_array->n; + size_t n = u2_array->n; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + + affine_gap_edits_t edits = NULL_AFFINE_GAP_EDITS; + + if (unicode_equals(u1_array, u2_array)) { + edits.num_matches = n; + return edits; + } + + size_t num_bytes = (m + 1) * sizeof(size_t); + + size_t *C = malloc(num_bytes); + if (C == NULL) { + return NULL_AFFINE_GAP_EDITS; + } + + size_t *D = malloc(num_bytes); + if (D == NULL) { + free(C); + return NULL_AFFINE_GAP_EDITS; + } + + affine_gap_edits_t *E = malloc((m + 1) * sizeof(affine_gap_edits_t)); + if (E == NULL) { + free(C); + free(D); + return NULL_AFFINE_GAP_EDITS; + } + + affine_gap_edits_t *ED = malloc((m + 1) * sizeof(affine_gap_edits_t)); + if (ED == NULL) { + free(C); + free(D); + free(E); + return NULL_AFFINE_GAP_EDITS; + } + + size_t e = 0, c = 0, s = 0; + + C[0] = 0; + E[0] = NULL_AFFINE_GAP_EDITS; + size_t t = start_gap_cost; + + affine_gap_edits_t base_edits = NULL_AFFINE_GAP_EDITS; + base_edits.num_gap_opens++; + + for (size_t j = 1; j < m + 1; j++) { + t += extend_gap_cost; + C[j] = t; + D[j] = t + start_gap_cost; + base_edits.num_gap_extensions++; + E[j] = base_edits; + ED[j] = base_edits; + } + + t = start_gap_cost; + base_edits = NULL_AFFINE_GAP_EDITS; + base_edits.num_gap_opens++; + + affine_gap_edits_t current_edits = NULL_AFFINE_GAP_EDITS; + affine_gap_edits_t prev_char_edits = NULL_AFFINE_GAP_EDITS; + affine_gap_edits_t prev_row_prev_char_edits = NULL_AFFINE_GAP_EDITS; + + bool in_gap = false; + + for (size_t i = 1; i < n + 1; i++) { + // s = CC[0] + s = C[0]; + 
uint32_t c2 = u2[i - 1]; + // CC[0] = c = t = t + h + t += extend_gap_cost; + c = t; + C[0] = c; + + prev_row_prev_char_edits = E[0]; + base_edits.num_gap_extensions++; + prev_char_edits = base_edits; + E[0] = prev_char_edits; + + // e = t + g + e = t + start_gap_cost; + + affine_gap_op op = AFFINE_GAP_OPEN; + + ssize_t match_at = -1; + + size_t min_at = 0; + size_t min_cost = SIZE_MAX; + + for (size_t j = 1; j < m + 1; j++) { + // insertion + // e = min(e, c + g) + h + size_t min = e; + uint32_t c1 = u1[j - 1]; + + affine_gap_op insert_op = AFFINE_GAP_OPEN; + + if ((c + start_gap_cost) < min) { + min = c + start_gap_cost; + insert_op = AFFINE_GAP_OPEN; + } else { + insert_op = AFFINE_GAP_EXTEND; + } + + e = min + extend_gap_cost; + + // deletion + // DD[j] = min(DD[j], CC[j] + g) + h + + affine_gap_op delete_op = AFFINE_GAP_OPEN; + + min = D[j]; + affine_gap_edits_t delete_edits = ED[j]; + affine_gap_edits_t delete_edits_stored = delete_edits; + delete_op = AFFINE_GAP_OPEN; + if (C[j] + start_gap_cost < min) { + min = C[j] + start_gap_cost; + + delete_edits = delete_edits_stored = E[j]; + delete_edits_stored.num_gap_opens++; + } + + D[j] = min + extend_gap_cost; + delete_edits_stored.num_gap_extensions++; + ED[j] = delete_edits_stored; + + // Cost + // c = min(DD[j], e, s + w(a, b)) + + affine_gap_op current_op = delete_op; + + + min = D[j]; + + // Delete transition + current_edits = delete_edits; + + if (e < min) { + min = e; + // Insert transition + current_op = insert_op; + current_edits = prev_char_edits; + } + + bool both_separators = space_or_equivalent((int32_t)c1) && space_or_equivalent((int32_t)c2); + + bool is_transpose = false; + size_t w = c1 != c2 && !both_separators ? 
mismatch_cost : match_cost; + + if (c1 != c2 && utf8_is_letter(utf8proc_category(c2)) && utf8_is_letter(utf8proc_category(c1)) && j < m && c2 == u1[j] && i < n && c1 == u2[i]) { + w = transpose_cost; + is_transpose = true; + } + + if (s + w < min) { + min = s + w; + + // Match/mismatch/transpose transition + current_edits = prev_row_prev_char_edits; + + if ((c1 == c2 || both_separators) && !is_transpose) { + current_op = AFFINE_CHAR_MATCH; + } else if (!is_transpose) { + current_op = AFFINE_CHAR_MISMATCH; + } else if (is_transpose) { + current_op = AFFINE_TRANSPOSITION; + } + } + + if (current_op == AFFINE_CHAR_MATCH) { + current_edits.num_matches++; + } else if (current_op == AFFINE_CHAR_MISMATCH) { + current_edits.num_mismatches++; + } else if (current_op == AFFINE_GAP_EXTEND) { + current_edits.num_gap_extensions++; + } else if (current_op == AFFINE_GAP_OPEN) { + current_edits.num_gap_opens++; + current_edits.num_gap_extensions++; + } else if (current_op == AFFINE_TRANSPOSITION) { + current_edits.num_transpositions++; + } + + if (min < min_cost) { + op = current_op; + min_cost = min; + min_at = j; + } + + c = min; + s = C[j]; + C[j] = c; + + prev_char_edits = current_edits; + prev_row_prev_char_edits = E[j]; + E[j] = prev_char_edits; + + // In the case of a transposition, duplicate costs for next character and advance by 2 + if (current_op == AFFINE_TRANSPOSITION) { + E[j + 1] = E[j]; + C[j + 1] = C[j]; + j++; + } + } + + if (op == AFFINE_TRANSPOSITION) { + i++; + } + + } + + edits = E[m]; + free(C); + free(D); + free(E); + free(ED); + + return edits; + +} + +affine_gap_edits_t affine_gap_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) { + return affine_gap_distance_unicode_costs(u1_array, u2_array, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST); +} + +affine_gap_edits_t affine_gap_distance_costs(const char *s1, const char *s2, 
size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) { + if (s1 == NULL || s2 == NULL) return NULL_AFFINE_GAP_EDITS; + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return NULL_AFFINE_GAP_EDITS; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return NULL_AFFINE_GAP_EDITS; + } + + affine_gap_edits_t edits = affine_gap_distance_unicode_costs(u1_array, u2_array, start_gap_cost, extend_gap_cost, match_cost, mismatch_cost, transpose_cost); + + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + + return edits; +} + + +affine_gap_edits_t affine_gap_distance(const char *s1, const char *s2) { + return affine_gap_distance_costs(s1, s2, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST); +} + + +bool possible_abbreviation_unicode_with_edits(uint32_array *u1_array, uint32_array *u2_array, affine_gap_edits_t edits) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + if (len1 == 0 || len2 == 0) return false; + + size_t min_len = len1 < len2 ? len1 : len2; + + return edits.num_matches == min_len && u1_array->a[0] == u2_array->a[0]; +} + +inline bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array) { + affine_gap_edits_t edits = affine_gap_distance_unicode(u1_array, u2_array); + + return possible_abbreviation_unicode_with_edits(u1_array, u2_array, edits); +} + + +bool possible_abbreviation_unicode_strict(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + if (len1 == 0 || len2 == 0) return false; + + size_t min_len = len1 < len2 ? 
len1 : len2; + + ssize_t prefix_len = unicode_common_prefix(u1_array, u2_array); + if (prefix_len == min_len) return true; + ssize_t suffix_len = unicode_common_suffix(u1_array, u2_array); + return suffix_len > 0 && prefix_len > 0 && possible_abbreviation_unicode(u1_array, u2_array); +} + +static bool possible_abbreviation_options(const char *s1, const char *s2, bool strict) { + if (s1 == NULL || s2 == NULL) return false; + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return false; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return false; + } + + bool abbrev = false; + if (!strict) { + abbrev = possible_abbreviation_unicode(u1_array, u2_array); + } else { + abbrev = possible_abbreviation_unicode_strict(u1_array, u2_array); + } + + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + + return abbrev; +} + +inline bool possible_abbreviation(const char *s1, const char *s2) { + return possible_abbreviation_options(s1, s2, false); +} + +inline bool possible_abbreviation_strict(const char *s1, const char *s2) { + return possible_abbreviation_options(s1, s2, true); +} + + +ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + + size_t num_bytes = (len1 + 1) * sizeof(size_t); + + size_t *column = malloc(num_bytes); + if (column == NULL) { + return -1.0; + } + + for (size_t y = 1; y <= len1; y++) { + column[y] = y; + } + + size_t transpose_diag = 0; + size_t last_diag = 0; + + for (size_t x = 1; x <= len2; x++) { + column[0] = x; + for (size_t y = 1, last_diag = x - 1; y <= len1; y++) { + size_t old_diag = column[y]; + size_t cost = (u1[y - 1] == u2[x - 1] ? 
0 : 1); + + size_t v1 = column[y] + 1; + size_t v2 = column[y - 1] + 1; + size_t v3 = last_diag + cost; + + size_t min = v1; + if (v2 < min) min = v2; + if (v3 < min) min = v3; + + if (x > 1 && y > 1 && u1[y - 1] == u2[x - 2] && u1[y - 2] == u2[x - 1]) { + size_t v4 = transpose_diag + cost; + if (v4 < min) min = v4; + } + + column[y] = min; + + last_diag = old_diag; + } + transpose_diag = last_diag; + } + + size_t dist = column[len1]; + free(column); + return (ssize_t)dist; +} + +ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) { + if (s1 == NULL || s2 == NULL) return -1; + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return -1.0; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return -1.0; + } + + ssize_t lev = damerau_levenshtein_distance_unicode(u1_array, u2_array, replace_cost); + + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + return lev; +} + +ssize_t damerau_levenshtein_distance(const char *s1, const char *s2) { + return damerau_levenshtein_distance_replace_cost(s1, s2, 0); +} + +double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) { + if (u1_array == NULL || u2_array == NULL) return -1.0; + + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + // If both strings are zero-length, return 1. If only one is, return 0 + if (len1 == 0) return len2 == 0 ? 1.0 : 0.0; + + size_t max_len = len1 > len2 ? len1 : len2; + size_t match_distance = (max_len / 2) - 1; + + uint8_t *u1_matches = calloc(len1, sizeof(uint8_t)); + uint8_t *u2_matches = calloc(len2, sizeof(uint8_t)); + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + + double matches = 0.0; + double transpositions = 0.0; + + size_t i = 0; + + // count matches + for (size_t i = 0; i < len1; i++) { + // start and end take into account the match distance + size_t start = i > match_distance ? 
i - match_distance : 0; + size_t end = (i + match_distance + 1) < len2 ? i + match_distance + 1 : len2; + + for (size_t k = start; k < end; k++) { + // already a match at k + if (u2_matches[k]) continue; + // codepoints not equal + if (u1[i] != u2[k]) continue; + // otherwise record a match on both sides and increment counter + u1_matches[i] = true; + u2_matches[k] = true; + matches++; + break; + } + } + + if (matches == 0) { + free(u1_matches); + free(u2_matches); + return 0.0; + } + + + // count transpositions + size_t k = 0; + for (size_t i = 0; i < len1; i++) { + // wait for a match in u1 + if (!u1_matches[i]) continue; + // get the next matched character in u2 + while (!u2_matches[k]) k++; + // it's a transposition + if (u1[i] != u2[k]) transpositions++; + k++; + } + + // transpositions double-count transposed characters, so divide by 2 + transpositions /= 2.0; + + free(u1_matches); + free(u2_matches); + + // Jaro distance + return ((matches / len1) + + (matches / len2) + + ((matches - transpositions) / matches)) / 3.0; +} + +double jaro_distance(const char *s1, const char *s2) { + if (s1 == NULL || s2 == NULL) { + return -1.0; + } + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return -1.0; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return -1.0; + } + + double jaro = jaro_distance_unicode(u1_array, u2_array); + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + return jaro; +} + +#define MAX_JARO_WINKLER_PREFIX 4 + +double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) { + double jaro = jaro_distance_unicode(u1_array, u2_array); + + double j; + + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + + size_t m = len1 < len2 ? 
len1 : len2; + + size_t shared_prefix = 0; + for (size_t i = 0; i < m; i++) { + if (u1[i] != u2[i]) break; + shared_prefix++; + if (shared_prefix > MAX_JARO_WINKLER_PREFIX) { + shared_prefix = MAX_JARO_WINKLER_PREFIX; + break; + } + } + + double jaro_winkler = jaro; + + if (jaro >= bonus_threshold) { + jaro_winkler += (1.0 - jaro) * shared_prefix * prefix_scale; + } + + return jaro_winkler > 1.0 ? 1.0 : jaro_winkler; +} + +double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) { + if (s1 == NULL || s2 == NULL) { + return -1.0; + } + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return -1.0; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return -1.0; + } + + double jaro_winkler = jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, prefix_scale, bonus_threshold); + + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + + return jaro_winkler; +} + +inline double jaro_winkler_distance(const char *s1, const char *s2) { + return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD); +} + +inline double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) { + return jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD); +} diff --git a/src/string_similarity.h b/src/string_similarity.h new file mode 100644 index 00000000..51c25208 --- /dev/null +++ b/src/string_similarity.h @@ -0,0 +1,47 @@ +#ifndef STRING_SIMILARITY_H +#define STRING_SIMILARITY_H + +#include +#include + +#include "collections.h" + +#define DEFAULT_AFFINE_GAP_OPEN_COST 3 +#define DEFAULT_AFFINE_GAP_EXTEND_COST 2 +#define DEFAULT_AFFINE_GAP_MATCH_COST 0 +#define DEFAULT_AFFINE_GAP_MISMATCH_COST 6 +#define DEFAULT_AFFINE_GAP_TRANSPOSE_COST 4 
+ +typedef struct affine_gap_edits { + size_t num_matches; + size_t num_mismatches; + size_t num_transpositions; + size_t num_gap_opens; + size_t num_gap_extensions; +} affine_gap_edits_t; + +affine_gap_edits_t affine_gap_distance(const char *s1, const char *s2); +affine_gap_edits_t affine_gap_distance_unicode(uint32_array *u1_array, uint32_array *u2_array); + +bool possible_abbreviation(const char *s1, const char *s2); +bool possible_abbreviation_strict(const char *s1, const char *s2); +bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array); +bool possible_abbreviation_unicode_strict(uint32_array *u1_array, uint32_array *u2_array); +bool possible_abbreviation_unicode_with_edits(uint32_array *u1_array, uint32_array *u2_array, affine_gap_edits_t edits); + +ssize_t damerau_levenshtein_distance(const char *s1, const char *s2); +ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost); +ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost); + +#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1 +#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7 + +double jaro_distance(const char *s1, const char *s2); +double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array); +double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold); +double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold); +double jaro_winkler_distance(const char *s1, const char *s2); +double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array); + + +#endif diff --git a/src/string_utils.c b/src/string_utils.c index b8f3abf0..45cc1373 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -294,6 +294,10 @@ inline bool utf8_is_letter(int cat) { || cat == UTF8PROC_CATEGORY_LM; } +inline bool 
utf8_is_digit(int cat) { + return cat == UTF8PROC_CATEGORY_ND; +} + inline bool utf8_is_number(int cat) { return cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO; } @@ -310,6 +314,12 @@ inline bool utf8_is_hyphen(int32_t ch) { return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212; } +#define PERIOD_CODEPOINT 46 + +inline bool utf8_is_period(int32_t codepoint) { + return codepoint == PERIOD_CODEPOINT; +} + inline bool utf8_is_punctuation(int cat) { return cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PE \ || cat == UTF8PROC_CATEGORY_PF || cat == UTF8PROC_CATEGORY_PI \ @@ -337,6 +347,111 @@ inline bool utf8_is_whitespace(int32_t ch) { ; } + +ssize_t utf8_len(const char *str, size_t len) { + if (str == NULL) return -1; + if (len == 0) return 0; + + int32_t ch = 0; + ssize_t num_utf8_chars = 0; + ssize_t char_len; + + uint8_t *ptr = (uint8_t *)str; + + size_t remaining = len; + + while (1) { + char_len = utf8proc_iterate(ptr, -1, &ch); + + if (ch == 0) break; + remaining -= char_len; + if (remaining == 0) break; + + ptr += char_len; + num_utf8_chars++; + } + + return num_utf8_chars; +} + +uint32_array *unicode_codepoints(const char *str) { + if (str == NULL) return NULL; + + uint32_array *a = uint32_array_new(); + + int32_t ch = 0; + ssize_t num_utf8_chars = 0; + ssize_t char_len; + + uint8_t *ptr = (uint8_t *)str; + + while (1) { + char_len = utf8proc_iterate(ptr, -1, &ch); + + if (ch == 0) break; + + uint32_array_push(a, (uint32_t)ch); + ptr += char_len; + } + + return a; +} + +bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + if (len1 != len2) return false; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + for (size_t i = 0; i < len1; i++) { + if (u1[i] != u2[i]) return false; + } + return true; +} + +size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = 
u2_array->n; + + size_t min_len = len1 <= len2 ? len1 : len2; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + size_t common_prefix = 0; + + for (size_t i = 0; i < min_len; i++) { + if (u1[i] == u2[i]) { + common_prefix++; + } else { + break; + } + } + return common_prefix; +} + +size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + + size_t min_len = len1 <= len2 ? len1 : len2; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + size_t common_suffix = 0; + + for (size_t i = 0; i < min_len; i++) { + if (u1[len1 - i - 1] == u2[len2 - i - 1]) { + common_suffix++; + } else { + break; + } + } + return common_suffix; +} + + + int utf8_compare_len(const char *str1, const char *str2, size_t len) { if (len == 0) return 0; @@ -482,6 +597,61 @@ inline size_t utf8_common_prefix_ignore_separators(const char *str1, const char return utf8_common_prefix_len_ignore_separators(str1, str2, strlen(str2)); } +bool utf8_equal_ignore_separators_len(const char *str1, const char *str2, size_t len) { + if (len == 0) return false; + + int32_t c1 = -1, c2 = -1; + ssize_t len1, len2; + + uint8_t *ptr1 = (uint8_t *)str1; + uint8_t *ptr2 = (uint8_t *)str2; + + size_t remaining = len; + + while (1) { + len1 = utf8proc_iterate(ptr1, -1, &c1); + len2 = utf8proc_iterate(ptr2, -1, &c2); + + if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) { + ptr1++; + ptr2++; + remaining--; + if (remaining == 0) return true; + continue; + } + + if (c1 != 0 && c2 != 0 && c1 == c2) { + ptr1 += len1; + ptr2 += len2; + remaining -= len1; + } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) { + ptr1 += len1; + if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) { + ptr2 += len2; + } + remaining -= len1; + } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) { + ptr2 += len2; + remaining -= len2; + } else { + break; + } + + if (remaining == 0) return 
true; + + } + + return false; +} + +inline bool utf8_equal_ignore_separators(const char *str1, const char *str2) { + size_t len1 = strlen(str1); + size_t len2 = strlen(str2); + size_t len = len1 > len2 ? len1 : len2; + + return utf8_equal_ignore_separators_len(str1, str2, len); +} + bool string_is_digit(char *str, size_t len) { uint8_t *ptr = (uint8_t *)str; size_t idx = 0; @@ -559,6 +729,43 @@ inline bool string_contains_hyphen(char *str) { return string_next_hyphen_index(str, strlen(str)) >= 0; } +ssize_t string_next_codepoint_len(char *str, uint32_t codepoint, size_t len) { + uint8_t *ptr = (uint8_t *)str; + int32_t ch; + ssize_t idx = 0; + + while (idx < len) { + ssize_t char_len = utf8proc_iterate(ptr, len, &ch); + + if (char_len <= 0 || ch == 0) break; + + if ((uint32_t)ch == codepoint) return idx; + ptr += char_len; + idx += char_len; + } + return -1; +} + +ssize_t string_next_codepoint(char *str, uint32_t codepoint) { + return string_next_codepoint_len(str, codepoint, strlen(str)); +} + +ssize_t string_next_period_len(char *str, size_t len) { + return string_next_codepoint_len(str, PERIOD_CODEPOINT, len); +} + +ssize_t string_next_period(char *str) { + return string_next_codepoint(str, PERIOD_CODEPOINT); +} + +inline bool string_contains_period_len(char *str, size_t len) { + return string_next_codepoint_len(str, PERIOD_CODEPOINT, len) >= 0; +} + +inline bool string_contains_period(char *str) { + return string_next_codepoint(str, string_next_codepoint(str, PERIOD_CODEPOINT)) >= 0; +} + size_t string_right_spaces_len(char *str, size_t len) { size_t spaces = 0; @@ -583,6 +790,28 @@ size_t string_right_spaces_len(char *str, size_t len) { } +inline size_t string_hyphen_prefix_len(char *str, size_t len) { + // Strip beginning hyphens + int32_t unichr; + uint8_t *ptr = (uint8_t *)str; + ssize_t char_len = utf8proc_iterate(ptr, len, &unichr); + if (utf8_is_hyphen(unichr)) { + return (size_t)char_len; + } + return 0; +} + +inline size_t string_hyphen_suffix_len(char 
*str, size_t len) { + // Strip ending hyphens + int32_t unichr; + uint8_t *ptr = (uint8_t *)str; + ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr); + if (utf8_is_hyphen(unichr)) { + return (size_t)char_len; + } + return 0; +} + size_t string_left_spaces_len(char *str, size_t len) { size_t spaces = 0; @@ -881,6 +1110,18 @@ cstring_array *cstring_array_from_strings(char **strings, size_t n) { return array; } +bool cstring_array_extend(cstring_array *array, cstring_array *other) { + if (array == NULL || other == NULL) return false; + size_t n = cstring_array_num_strings(other); + + for (size_t i = 0; i < n; i++) { + char *s_i = cstring_array_get_string(other, i); + cstring_array_add_string(array, s_i); + } + return true; +} + + inline size_t cstring_array_capacity(cstring_array *self) { return self->str->m; } @@ -1089,6 +1330,12 @@ inline void string_tree_finalize_token(string_tree_t *self) { uint32_array_push(self->token_indices, (uint32_t)cstring_array_num_strings(self->strings)); } +void string_tree_clear(string_tree_t *self) { + uint32_array_clear(self->token_indices); + uint32_array_push(self->token_indices, 0); + cstring_array_clear(self->strings); +} + // terminated inline void string_tree_add_string(string_tree_t *self, char *str) { cstring_array_add_string(self->strings, str); diff --git a/src/string_utils.h b/src/string_utils.h index 0e7dd235..1ddcc626 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -74,7 +74,7 @@ ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *ds char *utf8_lower_options(const char *s, utf8proc_option_t options); char *utf8_lower(const char *s); char *utf8_upper_options(const char *s, utf8proc_option_t options); -char *utf8_lower(const char *s); +char *utf8_upper(const char *s); int utf8_compare(const char *str1, const char *str2); int utf8_compare_len(const char *str1, const char *str2, size_t len); @@ -83,9 +83,20 @@ size_t utf8_common_prefix_len(const char *str1, const char *str2, 
size_t len); size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2); size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len); +bool utf8_equal_ignore_separators(const char *str1, const char *str2); + +ssize_t utf8_len(const char *str, size_t len); + +uint32_array *unicode_codepoints(const char *str); +bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array); +size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array); +size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array); + bool utf8_is_hyphen(int32_t ch); +bool utf8_is_period(int32_t ch); bool utf8_is_letter(int cat); bool utf8_is_number(int cat); +bool utf8_is_digit(int cat); bool utf8_is_letter_or_number(int cat); bool utf8_is_punctuation(int cat); bool utf8_is_symbol(int cat); @@ -99,8 +110,22 @@ ssize_t string_next_hyphen_index(char *str, size_t len); bool string_contains_hyphen(char *str); bool string_contains_hyphen_len(char *str, size_t len); +ssize_t string_next_codepoint_len(char *str, uint32_t codepoint, size_t len); +ssize_t string_next_codepoint(char *str, uint32_t codepoint); + +ssize_t string_next_period_len(char *str, size_t len); +ssize_t string_next_period(char *str); + +bool string_contains_period_len(char *str, size_t len); +bool string_contains_period(char *str); + +size_t string_left_spaces_len(char *str, size_t len); +size_t string_right_spaces_len(char *str, size_t len); char *string_trim(char *str); +size_t string_hyphen_prefix_len(char *str, size_t len); +size_t string_hyphen_suffix_len(char *str, size_t len); + /* char_array is a dynamic character array defined in collections.h but has a few additional methods related to string manipulation. 
@@ -183,6 +208,8 @@ void cstring_array_clear(cstring_array *self); cstring_array *cstring_array_from_char_array(char_array *str); cstring_array *cstring_array_from_strings(char **strings, size_t n); +bool cstring_array_extend(cstring_array *array, cstring_array *other); + // Convert cstring_array to an array of n C strings and destroy the cstring_array char **cstring_array_to_strings(cstring_array *self); @@ -260,6 +287,8 @@ void string_tree_add_string_len(string_tree_t *self, char *str, size_t len); void string_tree_append_string(string_tree_t *self, char *str); void string_tree_append_string_len(string_tree_t *self, char *str, size_t len); +void string_tree_clear(string_tree_t *self); + uint32_t string_tree_num_tokens(string_tree_t *self); uint32_t string_tree_num_strings(string_tree_t *self); diff --git a/src/token_types.h b/src/token_types.h index d746ae89..31cc2ba9 100644 --- a/src/token_types.h +++ b/src/token_types.h @@ -1,64 +1,60 @@ #ifndef TOKEN_TYPES_H #define TOKEN_TYPES_H +#include "libpostal.h" + // Doing these as #defines so we can duplicate the values exactly in Python -#define END 0 // Null byte -// Word types -#define WORD 1 // Any letter-only word (includes all unicode letters) -#define ABBREVIATION 2 // Loose abbreviations (roughly anything containing a "." as we don't care about sentences in addresses) -#define IDEOGRAPHIC_CHAR 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character -#define HANGUL_SYLLABLE 4 // Hangul syllable sequences which contain more than one codepoint -#define ACRONYM 5 // Specifically things like U.N. 
where we may delete internal periods +#define END LIBPOSTAL_TOKEN_TYPE_END -#define PHRASE 10 // Not part of the first stage tokenizer, but may be used after phrase parsing +#define WORD LIBPOSTAL_TOKEN_TYPE_WORD +#define ABBREVIATION LIBPOSTAL_TOKEN_TYPE_ABBREVIATION +#define IDEOGRAPHIC_CHAR LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_CHAR +#define HANGUL_SYLLABLE LIBPOSTAL_TOKEN_TYPE_HANGUL_SYLLABLE +#define ACRONYM LIBPOSTAL_TOKEN_TYPE_ACRONYM +#define PHRASE LIBPOSTAL_TOKEN_TYPE_PHRASE -// Special tokens -#define EMAIL 20 // Make sure emails are tokenized altogether -#define URL 21 // Make sure urls are tokenized altogether -#define US_PHONE 22 // US phone number (with or without country code) -#define INTL_PHONE 23 // A non-US phone number (must have country code) +#define EMAIL LIBPOSTAL_TOKEN_TYPE_EMAIL +#define URL LIBPOSTAL_TOKEN_TYPE_URL +#define US_PHONE LIBPOSTAL_TOKEN_TYPE_US_PHONE +#define INTL_PHONE LIBPOSTAL_TOKEN_TYPE_INTL_PHONE -// Numbers and numeric types -#define NUMERIC 50 // Any sequence containing a digit -#define ORDINAL 51 // 1st, 2nd, 1er, 1 etc. -#define ROMAN_NUMERAL 52 // II, III, VI, etc. -#define IDEOGRAPHIC_NUMBER 53 // All numeric ideographic characters, includes e.g. 
Han numbers and chars like "²" +#define NUMERIC LIBPOSTAL_TOKEN_TYPE_NUMERIC +#define ORDINAL LIBPOSTAL_TOKEN_TYPE_ORDINAL +#define ROMAN_NUMERAL LIBPOSTAL_TOKEN_TYPE_ROMAN_NUMERAL +#define IDEOGRAPHIC_NUMBER LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_NUMBER +#define PERIOD LIBPOSTAL_TOKEN_TYPE_PERIOD +#define EXCLAMATION LIBPOSTAL_TOKEN_TYPE_EXCLAMATION +#define QUESTION_MARK LIBPOSTAL_TOKEN_TYPE_QUESTION_MARK +#define COMMA LIBPOSTAL_TOKEN_TYPE_COMMA +#define COLON LIBPOSTAL_TOKEN_TYPE_COLON +#define SEMICOLON LIBPOSTAL_TOKEN_TYPE_SEMICOLON +#define PLUS LIBPOSTAL_TOKEN_TYPE_PLUS +#define AMPERSAND LIBPOSTAL_TOKEN_TYPE_AMPERSAND +#define AT_SIGN LIBPOSTAL_TOKEN_TYPE_AT_SIGN +#define POUND LIBPOSTAL_TOKEN_TYPE_POUND +#define ELLIPSIS LIBPOSTAL_TOKEN_TYPE_ELLIPSIS +#define DASH LIBPOSTAL_TOKEN_TYPE_DASH +#define BREAKING_DASH LIBPOSTAL_TOKEN_TYPE_BREAKING_DASH +#define HYPHEN LIBPOSTAL_TOKEN_TYPE_HYPHEN +#define PUNCT_OPEN LIBPOSTAL_TOKEN_TYPE_PUNCT_OPEN +#define PUNCT_CLOSE LIBPOSTAL_TOKEN_TYPE_PUNCT_CLOSE +#define DOUBLE_QUOTE LIBPOSTAL_TOKEN_TYPE_DOUBLE_QUOTE +#define SINGLE_QUOTE LIBPOSTAL_TOKEN_TYPE_SINGLE_QUOTE +#define OPEN_QUOTE LIBPOSTAL_TOKEN_TYPE_OPEN_QUOTE +#define CLOSE_QUOTE LIBPOSTAL_TOKEN_TYPE_CLOSE_QUOTE +#define SLASH LIBPOSTAL_TOKEN_TYPE_SLASH +#define BACKSLASH LIBPOSTAL_TOKEN_TYPE_BACKSLASH +#define GREATER_THAN LIBPOSTAL_TOKEN_TYPE_GREATER_THAN +#define LESS_THAN LIBPOSTAL_TOKEN_TYPE_LESS_THAN -// Punctuation types, may separate a phrase -#define PERIOD 100 -#define EXCLAMATION 101 -#define QUESTION_MARK 102 -#define COMMA 103 -#define COLON 104 -#define SEMICOLON 105 -#define PLUS 106 -#define AMPERSAND 107 -#define AT_SIGN 108 -#define POUND 109 -#define ELLIPSIS 110 -#define DASH 111 -#define BREAKING_DASH 112 -#define HYPHEN 113 -#define PUNCT_OPEN 114 -#define PUNCT_CLOSE 115 -#define DOUBLE_QUOTE 119 -#define SINGLE_QUOTE 120 -#define OPEN_QUOTE 121 -#define CLOSE_QUOTE 122 -#define SLASH 124 -#define BACKSLASH 125 -#define GREATER_THAN 126 
-#define LESS_THAN 127 +#define OTHER LIBPOSTAL_TOKEN_TYPE_OTHER +#define WHITESPACE LIBPOSTAL_TOKEN_TYPE_WHITESPACE +#define NEWLINE LIBPOSTAL_TOKEN_TYPE_NEWLINE -// Non-letters and whitespace -#define OTHER 200 -#define WHITESPACE 300 -#define NEWLINE 301 - -#define INVALID_CHAR 500 +#define INVALID_CHAR LIBPOSTAL_TOKEN_TYPE_INVALID_CHAR #define is_word_token(type) ((type) == WORD || (type) == ABBREVIATION || (type) == ACRONYM || (type) == IDEOGRAPHIC_CHAR || (type) == HANGUL_SYLLABLE) diff --git a/src/tokens.h b/src/tokens.h index 6b314417..bf61f5bc 100644 --- a/src/tokens.h +++ b/src/tokens.h @@ -12,11 +12,9 @@ #include "token_types.h" #include "vector.h" -typedef struct token { - size_t offset; - size_t len; - uint16_t type; -} token_t; +typedef libpostal_token_t token_t; + +#define NULL_TOKEN (token_t){0, 0, END} VECTOR_INIT(token_array, token_t) diff --git a/src/trie_search.c b/src/trie_search.c index 8518db89..fa78adf8 100644 --- a/src/trie_search.c +++ b/src/trie_search.c @@ -633,6 +633,8 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u trie_data_node_t data_node; trie_node_t terminal_node; + bool phrase_at_hyphen = false; + while (idx < len) { char_len = utf8proc_iterate(ptr, len, &codepoint); log_debug("char_len = %zu, char=%d\n", char_len, codepoint); @@ -653,7 +655,7 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u for (i = 0; i < char_len; i++) { node_id = trie_get_transition_index(self, last_node, *char_ptr); node = trie_get_node(self, node_id); - log_debug("At idx=%zu, char=%.*s\n", i, (int)char_len, char_ptr); + log_debug("At idx=%u, i=%zu, char=%.*s\n", idx, i, (int)char_len, char_ptr); if (node.check != last_node_id) { log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id); @@ -665,7 +667,12 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u } if (is_hyphen && node.check != last_node_id) { - log_debug("No space 
transition\n"); + log_debug("No space transition, phrase_len=%zu\n", phrase_len); + if (phrase_len > 0 && phrase_len == idx) { + log_debug("phrase_at_hyphen\n"); + phrase_at_hyphen = true; + } + ptr += char_len; idx += char_len; separator_char_len = char_len; @@ -720,10 +727,20 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u log_debug("match_len=%zu\n", match_len); if (tail_match_len == current_tail_len - tail_pos) { + if (phrase_at_hyphen) { + char_len = utf8proc_iterate(ptr + char_len, len, &codepoint); + if (char_len > 0 && utf8proc_codepoint_valid(codepoint)) { + int cat = utf8proc_category(codepoint); + + if (codepoint != 0 && !utf8_is_hyphen(codepoint) && !utf8_is_separator(cat) && !utf8_is_punctuation(cat)) { + return (phrase_t){phrase_start, phrase_len, value}; + } + } + } if (first_char) phrase_start = idx; phrase_len = (uint32_t)(idx + match_len) - phrase_start; - log_debug("tail match! phrase_len=%u\n", phrase_len); + log_debug("tail match! 
phrase_len=%u, len=%zu\n", phrase_len, len); value = data_node.data; return (phrase_t){phrase_start, phrase_len, value}; } else { diff --git a/test/test_expand.c b/test/test_expand.c index d97838ae..2b451295 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -4,18 +4,26 @@ #include #include "greatest.h" +#include "../src/string_utils.h" #include "../src/libpostal.h" SUITE(libpostal_expansion_tests); -static greatest_test_res test_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { +static greatest_test_res test_expansion_contains_phrase_option(char *input, char *output, libpostal_normalize_options_t options, bool root) { size_t num_expansions; - char **expansions = libpostal_expand_address(input, options, &num_expansions); + + char **expansions = NULL; + if (!root) { + expansions = libpostal_expand_address(input, options, &num_expansions); + } else { + expansions = libpostal_expand_address_root(input, options, &num_expansions); + } bool contains_expansion = false; char *expansion; for (size_t i = 0; i < num_expansions; i++) { expansion = expansions[i]; + printf("expansion = %s\n", expansion); if (string_equals(output, expansion)) { contains_expansion = true; break; @@ -38,15 +46,26 @@ static greatest_test_res test_expansion_contains(char *input, char *output, libp PASS(); } -static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) 
{ +static greatest_test_res test_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { + bool root = false; + CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); + + PASS(); +} + +static greatest_test_res test_root_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { + bool root = true; + CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); + + PASS(); +} + +static greatest_test_res test_expansion_contains_phrase_option_with_languages(char *input, char *output, libpostal_normalize_options_t options, bool root, size_t num_languages, va_list args) { char **languages = NULL; size_t i; if (num_languages > 0) { - va_list args; - - va_start(args, num_languages); languages = malloc(sizeof(char *) * num_languages); char *lang; @@ -56,8 +75,6 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha languages[i] = strdup(lang); } - va_end(args); - options.num_languages = num_languages; options.languages = (char **)languages; } else { @@ -65,7 +82,7 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha options.num_languages = 0; } - CHECK_CALL(test_expansion_contains(input, output, options)); + CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); if (languages != NULL) { for (i = 0; i < num_languages; i++) { free(languages[i]); @@ -76,6 +93,36 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha } + +static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) 
{ + bool root = false; + if (num_languages > 0) { + va_list args; + va_start(args, num_languages); + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args)); + va_end(args); + } else { + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, NULL)); + } + PASS(); +} + + +static greatest_test_res test_root_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) { + bool root = true; + if (num_languages > 0) { + va_list args; + va_start(args, num_languages); + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args)); + va_end(args); + } else { + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, NULL)); + } + PASS(); +} + + + TEST test_expansions(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); @@ -91,6 +138,129 @@ TEST test_expansions(void) { PASS(); } +TEST test_street_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; + + // English - normal cases + CHECK_CALL(test_root_expansion_contains("Malcolm X Blvd", "malcolm x", options)); + CHECK_CALL(test_root_expansion_contains("E 106th St", "106", options)); + CHECK_CALL(test_root_expansion_contains("S Park Ave", "park", options)); + CHECK_CALL(test_root_expansion_contains("Park South", "park", options)); + CHECK_CALL(test_root_expansion_contains("Rev Dr. MLK Dr S", "martin luther king junior", options)); + CHECK_CALL(test_root_expansion_contains("Rev Dr. 
Martin Luther King Jr Dr S", "martin luther king junior", options)); + CHECK_CALL(test_root_expansion_contains("East 6th Street", "6th", options)); + + // English - edge cases + CHECK_CALL(test_root_expansion_contains("Avenue B", "b", options)); + CHECK_CALL(test_root_expansion_contains("Avenue C", "c", options)); + CHECK_CALL(test_root_expansion_contains("Avenue D", "d", options)); + CHECK_CALL(test_root_expansion_contains("Avenue E", "e", options)); + CHECK_CALL(test_root_expansion_contains("Avenue N", "n", options)); + CHECK_CALL(test_root_expansion_contains("U St SE", "u", options)); + CHECK_CALL(test_root_expansion_contains("S Park", "park", options)); + CHECK_CALL(test_root_expansion_contains("Park S", "park", options)); + CHECK_CALL(test_root_expansion_contains("Avenue Rd", "avenue", options)); + CHECK_CALL(test_root_expansion_contains("Broadway", "broadway", options)); + CHECK_CALL(test_root_expansion_contains("E Broadway", "east", options)); + CHECK_CALL(test_root_expansion_contains("E Center St", "center", options)); + CHECK_CALL(test_root_expansion_contains("E Ctr St", "center", options)); + CHECK_CALL(test_root_expansion_contains("E Center Street", "center", options)); + CHECK_CALL(test_root_expansion_contains("E Ctr Street", "center", options)); + CHECK_CALL(test_root_expansion_contains("Center St E", "center", options)); + CHECK_CALL(test_root_expansion_contains("Ctr St E", "center", options)); + CHECK_CALL(test_root_expansion_contains("Center Street E", "center", options)); + CHECK_CALL(test_root_expansion_contains("Ctr Street E", "center", options)); + + // Spanish + CHECK_CALL(test_root_expansion_contains("C/ Ocho", "8", options)); + PASS(); +} + + +TEST test_house_number_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; + + // English - normal cases + CHECK_CALL(test_root_expansion_contains("1A", "1 a", options)); 
+ CHECK_CALL(test_root_expansion_contains("A1", "a 1", options)); + CHECK_CALL(test_root_expansion_contains("1", "1", options)); + CHECK_CALL(test_root_expansion_contains_with_languages("# 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("No. 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("House No. 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("House #1", "1", options, 1, "en")); + + PASS(); +} + +TEST test_level_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; + + // English - normal cases + CHECK_CALL(test_root_expansion_contains_with_languages("1st Fl", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1st Floor", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("First Fl", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("First Floor", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("2nd Fl", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("2nd Floor", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Second Fl", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Second Floor", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Fl #1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Fl No. 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Floor No. 
1", "1", options, 1, "en")); + + // Specifiers + CHECK_CALL(test_root_expansion_contains_with_languages("SB 1", "sub basement 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Bsmt", "basement", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Bsmt 1", "basement 1", options, 1, "en")); + + CHECK_CALL(test_root_expansion_contains_with_languages("1G", "1 ground", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("G", "ground", options, 1, "en")); + + PASS(); +} + +TEST test_unit_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; + + // English - normal cases + CHECK_CALL(test_root_expansion_contains_with_languages("1A", "1 a", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("A1", "a 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apt 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apt No 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apt #101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apartment 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apartment #101", "101", options, 1, "en")); + + // Specifiers + CHECK_CALL(test_root_expansion_contains_with_languages("PH 1", "penthouse 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("PH1", "penthouse 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Penthouse 1", "penthouse 1", options, 1, "en")); + + CHECK_CALL(test_root_expansion_contains_with_languages("1L", "1l", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1L", "1 left", options, 1, "en")); + 
CHECK_CALL(test_root_expansion_contains_with_languages("1F", "1f", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1F", "1f", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1R", "1r", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1R", "1r", options, 1, "en")); + + PASS(); +} + + +TEST test_po_box_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; + + CHECK_CALL(test_root_expansion_contains_with_languages("PO Box 1234", "1234", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("PO Box #1234", "1234", options, 1, "en")); + + PASS(); +} + TEST test_expansions_language_classifier(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); @@ -132,6 +302,11 @@ SUITE(libpostal_expansion_tests) { } RUN_TEST(test_expansions); + RUN_TEST(test_street_root_expansions); + RUN_TEST(test_house_number_root_expansions); + RUN_TEST(test_level_root_expansions); + RUN_TEST(test_unit_root_expansions); + RUN_TEST(test_po_box_root_expansions); RUN_TEST(test_expansions_language_classifier); RUN_TEST(test_expansions_no_options); diff --git a/test/test_string_utils.c b/test/test_string_utils.c index 7ded5a4e..1fbf310b 100644 --- a/test/test_string_utils.c +++ b/test/test_string_utils.c @@ -60,6 +60,26 @@ TEST test_utf8_compare_ignore_separators(void) { PASS(); } +TEST test_utf8_equal_ignore_separators(void) { + char *str1 = "Bünderstraße "; + char *str2 = "Bünder-straße"; + + bool equal = utf8_equal_ignore_separators(str1, str2); + ASSERT(equal); + + str1 = " Bünder-straße "; + str2 = "Bünder straße"; + equal = utf8_equal_ignore_separators(str1, str2); + ASSERT(equal); + + str1 = "Bünder-straße-a"; + str2 = "Bünder straße aa"; + equal = utf8_equal_ignore_separators(str1, str2); + ASSERT_FALSE(equal);
+ + PASS(); +} + TEST test_feature_array_add(void) { cstring_array *features = cstring_array_new(); if (features == NULL) {