diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index daf8be69..1e015929 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -46,7 +46,7 @@ import HTMLParser
 
 from collections import defaultdict, OrderedDict
 from lxml import etree
-from itertools import ifilter, chain
+from itertools import ifilter, chain, combinations
 
 this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
@@ -86,6 +86,90 @@ ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv'
 TOPONYM_LANGUAGE_DATA_FILENAME = 'toponyms_by_language.tsv'
 
 
+class AddressComponent(object):
+    '''
+    Declare an address component and its dependencies e.g.
+    a house_number cannot be used in the absence of a road name.
+
+    A component with dependencies is only usable when at least one
+    of the components named in dependencies is also present.
+    '''
+    ANY = 'any'
+
+    def __init__(self, name, dependencies=tuple(), method=ANY):
+        self.name = name
+        self.dependencies = dependencies
+        # Keep the matching method instead of silently discarding it
+        self.method = method
+
+    def __hash__(self):
+        # Identity is the component name only
+        return hash(self.name)
+
+    def __cmp__(self, other):
+        return cmp(self.name, other.name)
+
+
+OSM_ADDRESS_COMPONENTS = OrderedDict.fromkeys([
+    AddressComponent(AddressFormatter.HOUSE),
+    AddressComponent(AddressFormatter.ROAD),
+    AddressComponent(AddressFormatter.HOUSE_NUMBER, dependencies=(AddressFormatter.ROAD,)),
+    AddressComponent(AddressFormatter.SUBURB, dependencies=(AddressFormatter.CITY, AddressFormatter.STATE,
+                                                            AddressFormatter.POSTCODE)),
+    AddressComponent(AddressFormatter.CITY),
+    AddressComponent(AddressFormatter.STATE, dependencies=(AddressFormatter.SUBURB, AddressFormatter.CITY,
+                                                           AddressFormatter.POSTCODE, AddressFormatter.COUNTRY)),
+    AddressComponent(AddressFormatter.POSTCODE),
+    AddressComponent(AddressFormatter.COUNTRY),
+])
+
+
+def num_deps(c):
+    '''Sort key: the number of dependencies declared by component c.'''
+    return len(c.dependencies)
+
+
+OSM_ADDRESS_COMPONENTS_SORTED = sorted(OSM_ADDRESS_COMPONENTS, key=num_deps)
+
+OSM_ADDRESS_COMPONENT_COMBINATIONS = []
+
+'''
+The following statements create a bitset of address components
+for quickly testing whether or not a candidate set of
+address components can be considered a full geographic string
+suitable for formatting (i.e. would be a valid geocoder query).
+For instance, a house number by itself is not sufficient
+to be considered a valid address for this purpose unless it
+has a road name as well. Using bitsets we can easily answer
+questions like "Is house/house_number/road/city valid?"
+'''
+OSM_ADDRESS_COMPONENT_VALUES = {
+    c.name: 1 << i
+    for i, c in enumerate(OSM_ADDRESS_COMPONENTS.keys())
+}
+
+OSM_ADDRESS_COMPONENTS_VALID = set()
+
+
+def component_bitset(components):
+    '''
+    OR together the bit values of the given component names.
+    Unknown names contribute no bits and an empty input yields 0.
+    '''
+    return reduce(operator.or_, (OSM_ADDRESS_COMPONENT_VALUES.get(c, 0) for c in components), 0)
+
+
+# Combination sizes run from 1 up to and including the full component set
+for i in xrange(1, len(OSM_ADDRESS_COMPONENTS) + 1):
+    for perm in combinations(OSM_ADDRESS_COMPONENTS.keys(), i):
+        perm_set = set([p.name for p in perm])
+        valid = all((not p.dependencies or any(d in perm_set for d in p.dependencies) for p in perm))
+        if valid:
+            components = [c.name for c in perm]
+            OSM_ADDRESS_COMPONENT_COMBINATIONS.append(tuple(components))
+            OSM_ADDRESS_COMPONENTS_VALID.add(component_bitset(components))
+
+
 class OSMField(object):
     def __init__(self, name, c_constant, alternates=None):
         self.name = name
@@ -156,7 +240,6 @@ def read_osm_json(filename):
         yield key, json.loads(attrs)
 
 
-
 def normalize_osm_name_tag(tag, script=False):
     norm = tag.rsplit(':', 1)[-1]
     if not script:
@@ -346,6 +429,27 @@ def strip_keys(value, ignore_keys):
         value.pop(key, None)
 
 
+def write_formatted_address(writer, formatter, country, components, tag_components=True, minimal_only=True, language=None):
+    '''
+    Format components for the given country and write the result as one row.
+
+    Nothing is written when the formatter returns None or an empty/blank string.
+    With tag_components the row is (language, country, formatted_address),
+    otherwise it is just (formatted_address,).
+    '''
+    formatted_address = formatter.format_address(country, components, tag_components=tag_components, minimal_only=minimal_only)
+    if formatted_address is not None:
+        formatted_address = tsv_string(formatted_address)
+    if not formatted_address or not formatted_address.strip():
+        return
+    if tag_components:
+        row = (language, country, formatted_address)
+    else:
+        row = (formatted_address,)
+
+    writer.writerow(row)
+
+
 def build_address_format_training_data(language_rtree, infile, out_dir, tag_components=True):
     '''
     Creates formatted address training data for supervised sequence labeling (or potentially
@@ -359,7 +463,12 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
     {language, country, data}. The data field here is a sequence of labeled tokens similar
     to what we might see in part-of-speech tagging.
 
+
     This format uses a special character "|" to denote possible breaks in the input (comma, newline).
+
+    Note that for the address parser, we'd like it to be robust to many different types
+    of input, so we may selectively eliminate components.
+
     This information can potentially be used downstream by the sequence model as these
     breaks may be present at prediction time.
 
@@ -406,23 +515,30 @@ def build_address_format_training_data(language_rtree, infile, out_dir, tag_comp
                     continue
                 language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
 
-        formatted_address = formatter.format_address(country, value, tag_components=tag_components)
-        if formatted_address is not None:
-            formatted_address = tsv_string(formatted_address)
-            if not formatted_address or not formatted_address.strip():
-                continue
-            if tag_components:
-                row = (language, country, formatted_address)
-            else:
-                row = (formatted_address,)
+        address_components = {k: v for k, v in value.iteritems() if k.startswith('addr:')}
+        formatter.replace_aliases(address_components)
 
-            writer.writerow(row)
+        # Version with all components
+        write_formatted_address(writer, formatter, country, address_components, tag_components=tag_components,
+                                minimal_only=not tag_components, language=language)
 
-        if formatted_address is not None:
-            i += 1
-            if i % 1000 == 0 and i > 0:
-                print 'did', i, 'formatted addresses'
+        current_components = component_bitset(address_components)
+        if tag_components:
+            # Randomly drop components so the parser sees partial addresses, but only
+            # when the components that remain still form a valid combination
+            for component in list(address_components):
+                bit = OSM_ADDRESS_COMPONENT_VALUES.get(component, 0)
+                remaining = current_components ^ bit
+                if bit and remaining in OSM_ADDRESS_COMPONENTS_VALID and random.random() > 0.5:
+                    address_components.pop(component)
+                    current_components = remaining
+                    write_formatted_address(writer, formatter, country, address_components, tag_components=tag_components,
+                                            minimal_only=False, language=language)
+
+        i += 1
+        if i % 1000 == 0 and i > 0:
+            print 'did', i, 'formatted addresses'
 
 
 NAME_KEYS = (
     'name',