[expand] adding a function to check whether two place names/addresses are equivalent after token normalization (replacing hyphens, deleting final periods, lowercasing, simple transliteration, etc.), taking into account abbreviations from any specified libpostal dictionaries. In conjunction with place name affixes, this is useful in data sets like GeoPlanet or GeoNames for determining whether a name variant is related to the original or not.
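As a rough sketch of the intended behavior, the calls below compare street-name variants. The abbreviations_gazetteer import path comes from the new module's own imports; the example strings, language code, and the assumption that the abbreviations dictionaries map 'ave' to the canonical 'avenue' are illustrative only and not part of this commit.

    # Illustrative sketch only: example strings and language code are assumptions.
    from geodata.address_expansions.gazetteers import abbreviations_gazetteer
    from geodata.address_expansions.equivalence import equivalent

    equivalent(u'Sixth Avenue', u'Sixth Ave', abbreviations_gazetteer, 'en')     # expected: True ("Ave" abbreviates "Avenue")
    equivalent(u'Sixth Avenue', u'Sixth Street', abbreviations_gazetteer, 'en')  # expected: False (no shared canonical form)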
scripts/geodata/address_expansions/equivalence.py | 56 lines (new file)
@@ -0,0 +1,56 @@
import random
import re
import six

from itertools import izip

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex


def canonicals_for_language(data, language):
    canonicals = set()

    for d in data:
        # Each entry is a pipe-delimited byte string: language|dictionary|is_canonical|canonical
        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
        if language is None or lang == language:
            canonicals.add(canonical)

    return canonicals


def equivalent(s1, s2, gazetteer, language):
    '''
    Address/place equivalence
    -------------------------

    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''

    tokens_s1 = normalized_tokens(s1)
    tokens_s2 = normalized_tokens(s2)

    # Group normalized tokens into known phrases using the specified gazetteer
    abbreviated_s1 = list(gazetteer.filter(tokens_s1))
    abbreviated_s2 = list(gazetteer.filter(tokens_s2))

    if len(abbreviated_s1) != len(abbreviated_s2):
        return False

    for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
        if c1 != token_types.PHRASE and c2 != token_types.PHRASE:
            # Ordinary tokens must match exactly
            if t1 != t2:
                return False
        elif c1 == token_types.PHRASE and c2 == token_types.PHRASE:
            # Phrases are equivalent if they share at least one canonical form
            canonicals_s1 = canonicals_for_language(d1, language)
            canonicals_s2 = canonicals_for_language(d2, language)

            if not canonicals_s1 & canonicals_s2:
                return False
        else:
            # One side is a dictionary phrase and the other is not
            return False

    return True
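For reference, canonicals_for_language expects each phrase data entry to be a pipe-delimited byte string whose fields are language, dictionary, is_canonical flag, and canonical form, as implied by the split above. The entries below are hypothetical; only the field layout comes from the code.

    # Hypothetical dictionary entries, used only to show the shape of the data
    import six
    data = [six.b('en|street_types|0|avenue'), six.b('es|street_types|0|avenida')]
    canonicals_for_language(data, six.b('en'))  # only the entry whose language matches is kept
    canonicals_for_language(data, None)         # None matches all languages, so both canonicals are returned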