[intersections] intersections training data

2016-05-30 21:50:45 -04:00
parent 5075128ada
commit 8aada7086f
3 changed files with 116 additions and 2 deletions
--- a/scripts/geodata/osm/extract.py
+++ b/scripts/geodata/osm/extract.py
@@ -44,6 +44,10 @@ OSM_NAME_TAGS = (
    'short_name',
 )
 OSM_BASE_NAME_TAGS = (
    'tiger:name_base',
 )
 def parse_osm(filename, allowed_types=ALL_OSM_TAGS, dependencies=False):
    '''
--- a/scripts/geodata/osm/formatter.py
+++ b/scripts/geodata/osm/formatter.py
@@ -6,6 +6,7 @@ import sys
 import yaml
 from collections import OrderedDict
 from six.itertools import combinations, ifilter
 this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
@@ -24,6 +25,7 @@ from geodata.configs.utils import nested_get
 from geodata.countries.country_names import *
 from geodata.language_id.disambiguation import *
 from geodata.i18n.languages import *
 from geodata.intersections.query import Intersection, IntersectionQuery
 from geodata.address_formatting.formatter import AddressFormatter
 from geodata.osm.extract import *
 from geodata.polygons.language_polys import *
@@ -39,6 +41,8 @@ OSM_PARSER_DATA_DEFAULT_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os
 ADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'formatted_addresses_tagged.tsv'
 ADDRESS_FORMAT_DATA_FILENAME = 'formatted_addresses.tsv'
 ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv'
 INTERSECTIONS_FILENAME = 'intersections.tsv'
 INTERSECTIONS_TAGGED_FILENAME = 'intersections_tagged.tsv'
 ALL_LANGUAGES = 'all'
@@ -113,7 +117,7 @@ class OSMAddressFormatter(object):
        }
    }
-    def __init__(self, components, subdivisions_rtree, buildings_rtree):
+    def __init__(self, components, subdivisions_rtree=None, buildings_rtree=None):
        # Instance of AddressComponents, contains structures for reverse geocoding, etc.
        self.components = components
        self.language_rtree = components.language_rtree
@@ -543,7 +547,7 @@ class OSMAddressFormatter(object):
                    if tag_components:
                        row = (language, country, formatted_address)
                    else:
-                        row = formatted_address
+                        row = (formatted_address,)
                    writer.writerow(row)
@@ -551,6 +555,104 @@ class OSMAddressFormatter(object):
            if i % 1000 == 0 and i > 0:
                print('did {} formatted addresses'.format(i))
    def build_intersections_training_data(self, infile, out_dir, tag_components=True):
        '''
        Intersection addresses like "4th & Main Street" are represented in OSM
        by ways that share at least one node.
        This creates formatted strings using the name of each way (sometimes the base name
        for US addresses thanks to Tiger tags).
        Example:
        en  us  34th/road Street/road &/intersection 8th/road Ave/road
        '''
        i = 0
        if tag_components:
            formatted_tagged_file = open(os.path.join(out_dir, INTERSECTIONS_TAGGED_FILENAME), 'w')
            writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
        else:
            formatted_file = open(os.path.join(out_dir, INTERSECTIONS_FILENAME), 'w')
            writer = csv.writer(formatted_file, 'tsv_no_quote')
        all_name_tags = set(OSM_NAME_TAGS)
        base_name_tags = set(OSM_BASE_NAME_TAGS)
        replace_with_base_name_prob = float(nested_get(self.config, ('intersections', 'replace_with_base_name_probability'), default=0.0))
        reader = OSMIntersectionsReader(infile)
        for node_id, latitude, longitude, ways in reader.intersections():
            if not ways or len(ways) < 2:
                continue
            tags = ways[0]
            namespaced_language = None
            language_components = {}
            base_name_tags = [t for t in all_base_name_tags if t in tags]
            if not base_names:
                base_name_tag = None
            else:
                base_name_tag = base_name_tags[0]
            for tag in tags:
                if tag.rsplit(':', 1)[0] in all_name_tags and all((tag in w for w in ways)):
                    way_names = [(w[tag], w.get(base_name_tag) if base_name_tag else None) for w in ways]
                    if ':' in tag:
                        namespaced_language = tag.rsplit(':')[-1]
                    if namespaced_language not in language_components:
                        address_components, country, language = self.components.expanded({}, latitude, longitude, language=namespaced_language)
                        language_components[namespaced_language] = (address_components, country, language)
                    else:
                        address_components, country, language = language_components[namespaced_language]
                    intersection_phrase = Intersection.phrase(language, country=country)
                    if not intersection_phrase:
                        continue
                    formatted_intersections = []
                    for (w1, w1_base), (w2, w2_base) in combinations(way_names, 2):
                        intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2)
                        formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components)
                        formatted_intersections.append(formatted)
                        if w1_base and random.random() < replace_with_base_name_prob:
                            w1 = w1_base
                            intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2)
                            formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components)
                            formatted_intersections.append(formatted)
                        if w2_base and random.random() < replace_with_base_name_prob:
                            w2 = w2_base
                            intersection = IntersectionQuery(road1=w1, intersection_phrase=intersection_phrase, road2=w2)
                            formatted = self.formatter.format_intersection(intersection, address_components, country, language, tag_components=tag_components)
                            formatted_intersections.append(formatted)
                    for formatted in formatted_intersections:
                        if not formatted or not formatted.strip():
                            continue
                        formatted = tsv_string(formatted)
                        if not formatted or not formatted.strip():
                            continue
                        if tag_components:
                            row = (language, country, formatted)
                        else:
                            row = (formatted,)
                        writer.writerow(row)
            i += 1
            if i % 1000 == 0 and i > 0:
                print('did {} intersections'.format(i))
    def build_limited_training_data(self, infile, out_dir):
        '''
        Creates a special kind of formatted address training data from OSM's addr:* tags
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -444,6 +444,9 @@ if __name__ == '__main__':
                        default=tempfile.gettempdir(),
                        help='Temp directory to use')
    parser.add_argument('-x', '--intersections-file',
                        help='Path to planet-ways-latlons.osm')
    parser.add_argument('--language-rtree-dir',
                        required=True,
                        help='Language RTree directory')
@@ -530,3 +533,8 @@ if __name__ == '__main__':
        osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
    if args.venues_file:
        build_venue_training_data(language_rtree, args.venues_file, args.out_dir)
    if args.intersections_file and args.format:
        components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
        osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree)
        osm_formatter.build_intersections_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)