[osm/parsing] Making tagged training data for address parser more robust to the types of partial input we see in geocoding by randomly eliminating components subject to some constraints (e.g. house number cannot be used without a street name)

2015-10-02 19:52:13 -04:00
parent c790a2b87f
commit 83aecb9f2c
1 changed files with 108 additions and 16 deletions
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -46,7 +46,7 @@ import HTMLParser

 from collections import defaultdict, OrderedDict
 from lxml import etree
-from itertools import ifilter, chain
+from itertools import ifilter, chain, combinations

 this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
@@ -86,6 +86,78 @@ ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv'
 TOPONYM_LANGUAGE_DATA_FILENAME = 'toponyms_by_language.tsv'


+class AddressComponent(object):
+    '''
+    Declare an address component and its dependencies e.g.
+    a house_number cannot be used in the absence of a road name.
+    '''
+    ANY = 'any'
+
+    def __init__(self, name, dependencies=tuple(), method=ANY):
+        self.name = name
+        self.dependencies = dependencies
+
+    def __hash__(self):
+        return hash(self.name)
+
+    def __cmp__(self, other):
+        return cmp(self.name, other.name)
+
+
+OSM_ADDRESS_COMPONENTS = OrderedDict.fromkeys([
+    AddressComponent(AddressFormatter.HOUSE),
+    AddressComponent(AddressFormatter.ROAD),
+    AddressComponent(AddressFormatter.HOUSE_NUMBER, dependencies=(AddressFormatter.ROAD,)),
+    AddressComponent(AddressFormatter.SUBURB, dependencies=(AddressFormatter.CITY, AddressFormatter.STATE,
+                                                            AddressFormatter.POSTCODE)),
+    AddressComponent(AddressFormatter.CITY),
+    AddressComponent(AddressFormatter.STATE, dependencies=(AddressFormatter.SUBURB, AddressFormatter.CITY,
+                                                           AddressFormatter.POSTCODE, AddressFormatter.COUNTRY)),
+    AddressComponent(AddressFormatter.POSTCODE),
+    AddressComponent(AddressFormatter.COUNTRY),
+])
+
+
+def num_deps(c):
+    return len(c.dependencies)
+
+
+OSM_ADDRESS_COMPONENTS_SORTED = sorted(OSM_ADDRESS_COMPONENTS, key=num_deps)
+
+OSM_ADDRESS_COMPONENT_COMBINATIONS = []
+
+'''
+The following statements create a bitset of address components
+for quickly testing whether or not a candidate set of
+address components can be considered a full geographic string
+suitable for formatting (i.e. would be a valid geocoder query).
+For instance, a house number by itself is not sufficient
+to be considered a valid address for this purpose unless it
+has a road name as well. Using bitsets we can easily answer
+questions like "Is house/house_number/road/city valid?"
+'''
+OSM_ADDRESS_COMPONENT_VALUES = {
+    c.name: 1 << i
+    for i, c in enumerate(OSM_ADDRESS_COMPONENTS.keys())
+}
+
+OSM_ADDRESS_COMPONENTS_VALID = set()
+
+
+def component_bitset(components):
+    return reduce(operator.or_, [OSM_ADDRESS_COMPONENT_VALUES[c] for c in components])
+
+
+for i in xrange(1, len(OSM_ADDRESS_COMPONENTS.keys())):
+    for perm in combinations(OSM_ADDRESS_COMPONENTS.keys(), i):
+        perm_set = set([p.name for p in perm])
+        valid = all((not p.dependencies or any(d in perm_set for d in p.dependencies) for p in perm))
+        if valid:
+            components = [c.name for c in perm]
+            OSM_ADDRESS_COMPONENT_COMBINATIONS.append(tuple(components))
+            OSM_ADDRESS_COMPONENTS_VALID.add(component_bitset(components))
+
+
 class OSMField(object):
    def __init__(self, name, c_constant, alternates=None):
        self.name = name
@@ -156,7 +228,6 @@ def read_osm_json(filename):
        yield key, json.loads(attrs)


-
 def normalize_osm_name_tag(tag, script=False):
    norm = tag.rsplit(':', 1)[-1]
    if not script:
@@ -346,6 +417,20 @@ def strip_keys(value, ignore_keys):
        value.pop(key, None)


+def write_formatted_address(writer, formatter, country, components, tag_components=True, minimal_only=-True):
+    formatted_address = formatter.format_address(country, components, tag_components=tag_components, minimal_only=minimal_only)
+    if formatted_address is not None:
+        formatted_address = tsv_string(formatted_address)
+        if not formatted_address or not formatted_address.strip():
+            return
+        if tag_components:
+            row = (language, country, formatted_address)
+        else:
+            row = (formatted_address,)
+
+        writer.writerow(row)
+
+
 def build_address_format_training_data(language_rtree, infile, out_dir, tag_components=True):
    '''
    Creates formatted address training data for supervised sequence labeling (or potentially 
@@ -359,7 +444,12 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
    {language, country, data}. The data field here is a sequence of labeled tokens similar
    to what we might see in part-of-speech tagging.

+
    This format uses a special character "|" to denote possible breaks in the input (comma, newline).
+
+    Note that for the address parser, we'd like it to be robust to many different types
+    of input, so we may selectively eliminate components.
+
    This information can potentially be used downstream by the sequence model as these
    breaks may be present at prediction time.

@@ -406,23 +496,25 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
                    continue
                language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])

-        formatted_address = formatter.format_address(country, value, tag_components=tag_components)
-        if formatted_address is not None:
-            formatted_address = tsv_string(formatted_address)
-            if not formatted_address or not formatted_address.strip():
-                continue
-            if tag_components:
-                row = (language, country, formatted_address)
-            else:
-                row = (formatted_address,)
+        address_components = {k: v for k, v in value.iteritems() if k.startswith('addr:')}
+        formatter.replace_aliases(address_components)

-            writer.writerow(row)
+        # Version with all components
+        write_formatted_address(writer, formatter, country, address_components, tag_components=tag_components, minimal_only=not tag_components)

-        if formatted_address is not None:
-            i += 1
-            if i % 1000 == 0 and i > 0:
-                print 'did', i, 'formatted addresses'
+        current_components = component_bitset(address_components.keys())

+        if tag_components:
+            for component in address_components.keys():
+                if component in OSM_ADDRESS_COMPONENTS_VALID and current_components ^ OSM_ADDRESS_COMPONENTS_VALID[component] and random.random() > 0.5:
+                    address_components.pop(component)
+                    if not address_components:
+                        break
+                    write_formatted_address(writer, formatter, country, address_components, tag_components=tag_components, minimal_only=False)
+
+        i += 1
+        if i % 1000 == 0 and i > 0:
+            print 'did', i, 'formatted addresses'

 NAME_KEYS = (
    'name',