Initial fork commit

commit 2d238cd339
2025-09-06 22:03:29 -04:00
1748 changed files with 932506 additions and 0 deletions


@@ -0,0 +1,333 @@
'''
admin_boundaries.py
-------------------
Generates polygons from OpenStreetMap relations
'''
import array
import logging
import six
from bisect import bisect_left
from collections import defaultdict, OrderedDict
from itertools import izip, combinations
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir
from geodata.graph.scc import strongly_connected_components
from geodata.i18n.languages import osm_admin1_ids
from geodata.math.floats import isclose
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import *
class OSMPolygonReader(object):
'''
OSM relations are stored with pointers to their bounding ways,
which in turn store pointers to their constituent nodes, and the
planet XML file is far too large to be parsed in memory.
For the purposes of constructing (multi)polygons, we need lists
of lat/lon coordinates for the edges of each outer and inner polygon
that form the overall boundary (this allows for holes, e.g.
Lesotho/South Africa, and multiple disjoint polygons such as islands).
This class creates a compact representation of the intermediate
lookup tables and coordinates using Python's typed array module,
which stores C-sized ints, doubles, etc. in a dynamic array. It's like
a list but smaller and faster for arrays of numbers, and doesn't require
pulling in numpy as a dependency when all we want is the space savings.
One nice property of the .osm files generated by osmfilter is that
nodes/ways/relations are stored in sorted order, so we don't have to
pre-sort the lookup arrays before performing binary search.
'''
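# A minimal sketch of the compressed layout used below (hypothetical values):
# way_indptr = [0, 3, 5] means the node ids of way 0 live at way_deps[0:3]
# and those of way 1 at way_deps[3:5]; way_coords holds the matching
# interleaved lon/lat doubles, so the coordinates of way i occupy
# way_coords[2 * way_indptr[i]:2 * way_indptr[i + 1]].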
def __init__(self, filename):
self.filename = filename
self.node_ids = array.array('l')
self.way_ids = array.array('l')
self.coords = array.array('d')
self.nodes = {}
self.way_deps = array.array('l')
self.way_coords = array.array('d')
self.way_indptr = array.array('i', [0])
self.logger = logging.getLogger('osm_admin_polys')
def binary_search(self, a, x):
'''Locate the leftmost value exactly equal to x'''
i = bisect_left(a, x)
if i != len(a) and a[i] == x:
return i
raise ValueError
def node_coordinates(self, coords, indptr, idx):
start_index = indptr[idx] * 2
end_index = indptr[idx + 1] * 2
node_coords = coords[start_index:end_index]
return zip(node_coords[::2], node_coords[1::2])
def sparse_deps(self, data, indptr, idx):
return [data[i] for i in xrange(indptr[idx], indptr[idx + 1])]
def create_polygons(self, ways):
'''
Polygons (relations) are effectively stored as lists of
line segments (ways) and there may be more than one polygon
(island chains, overseas territories).
If we view the line segments as a graph (any two ways which
share a terminal node are connected), then the process of
constructing polygons reduces to finding strongly connected
components in a graph.
https://en.wikipedia.org/wiki/Strongly_connected_component
Note that even though there may be hundreds of thousands of
points in a complex polygon like a country boundary, we only
need to build a graph of connected ways, which will be many
times smaller and take much less time to traverse.
'''
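# A sketch of the graph construction with hypothetical ids: ways
# A = [n1, n2], B = [n2, n3] and C = [n3, n1] share terminal nodes, so the
# adjacency becomes {A: [B, C], B: [A, C], C: [A, B]}, one strongly
# connected component whose ways stitch together into a single closed ring.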
end_nodes = defaultdict(list)
polys = []
way_indices = {}
start_end_nodes = {}
for way_id in ways:
# Find the way position via binary search
try:
way_index = self.binary_search(self.way_ids, way_id)
except ValueError:
continue
# Cache the way index
way_indices[way_id] = way_index
# way_indptr is a compressed index into way_deps/way_coords
# way_index i is stored at indices way_indptr[i]:way_indptr[i+1]
# in way_deps
start_node_id = self.way_deps[self.way_indptr[way_index]]
end_node_id = self.way_deps[self.way_indptr[way_index + 1] - 1]
start_end_nodes[way_id] = (start_node_id, end_node_id)
if start_node_id == end_node_id:
way_node_points = self.node_coordinates(self.way_coords, self.way_indptr, way_index)
polys.append(way_node_points)
continue
end_nodes[start_node_id].append(way_id)
end_nodes[end_node_id].append(way_id)
# Way graph for a single polygon; no need to be as concerned about storage here
way_graph = defaultdict(OrderedDict)
for node_id, ways in end_nodes.iteritems():
for w1, w2 in combinations(ways, 2):
way_graph[w1][w2] = None
way_graph[w2][w1] = None
way_graph = {v: w.keys() for v, w in way_graph.iteritems()}
for component in strongly_connected_components(way_graph):
poly_nodes = []
seen = set()
if not component:
continue
q = [(c, False) for c in component[:1]]
while q:
way_id, reverse = q.pop()
way_index = way_indices[way_id]
node_coords = self.node_coordinates(self.way_coords, self.way_indptr, way_index)
head, tail = start_end_nodes[way_id]
if reverse:
node_coords = node_coords[::-1]
head, tail = tail, head
for neighbor in way_graph[way_id]:
if neighbor in seen:
continue
neighbor_head, neighbor_tail = start_end_nodes[neighbor]
neighbor_reverse = neighbor_head == head or neighbor_tail == tail
q.append((neighbor, neighbor_reverse))
way_start = 0 if q else 1
poly_nodes.extend(node_coords[way_start:-1])
seen.add(way_id)
polys.append(poly_nodes)
return polys
def include_polygon(self, props):
raise NotImplementedError('Children must implement')
def polygons(self, properties_only=False):
'''
Generator which yields tuples like:
(relation_id, properties, admin_center, outer_polygons, inner_polygons)
admin_center is the properties of the relation's admin_centre node
when exactly one is present (an empty dict otherwise). At this point
a polygon is a list of coordinate tuples, suitable for passing to
shapely's Polygon constructor but which may be used for other purposes.
outer_polygons is a list of the exterior polygons for this
boundary. inner_polygons is a list of "holes" in the exterior
polygons, although donuts and donut-holes need to be matched
by the caller using something like shapely's contains.
'''
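# A usage sketch, assuming shapely is available (OSMAdminPolygonReader is
# defined near the bottom of this module):
# >>> from shapely.geometry import Polygon
# >>> reader = OSMAdminPolygonReader('planet-admin-borders.osm')
# >>> for elem_id, props, admin_center, outers, inners in reader.polygons():
# ...     exteriors = [Polygon(outer) for outer in outers]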
i = 0
for element_id, props, deps in parse_osm(self.filename, dependencies=True):
props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
if element_id.startswith('node'):
node_id = long(element_id.split(':')[-1])
lat = props.get('lat')
lon = props.get('lon')
if lat is None or lon is None:
continue
lat, lon = latlon_to_decimal(lat, lon)
if lat is None or lon is None:
continue
if isclose(lat, 90.0):
lat = 89.999
if isclose(lon, 180.0):
lon = 179.999
if 'name' in props and 'place' in props:
self.nodes[node_id] = props
# Nodes are stored in a sorted array, coordinate indices are simply
# [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
# Note that the pairs are lon, lat instead of lat, lon for geometry purposes
self.coords.append(lon)
self.coords.append(lat)
self.node_ids.append(node_id)
elif element_id.startswith('way'):
way_id = long(element_id.split(':')[-1])
# Get node indices by binary search
try:
node_indices = [self.binary_search(self.node_ids, node_id) for node_id in deps]
except ValueError:
continue
# Way ids stored in a sorted array
self.way_ids.append(way_id)
# way_deps is the list of dependent node ids
# way_coords is a copy of coords indexed by way ids
for node_id, node_index in izip(deps, node_indices):
self.way_deps.append(node_id)
self.way_coords.append(self.coords[node_index * 2])
self.way_coords.append(self.coords[node_index * 2 + 1])
self.way_indptr.append(len(self.way_deps))
if deps[0] == deps[-1] and self.include_polygon(props):
way_id_offset = WAY_OFFSET + way_id
if not properties_only:
outer_polys = self.create_polygons([way_id])
inner_polys = []
yield way_id_offset, props, {}, outer_polys, inner_polys
else:
yield way_id_offset, props, {}
elif element_id.startswith('relation'):
if self.node_ids is not None:
self.node_ids = None
if self.coords is not None:
self.coords = None
relation_id = long(element_id.split(':')[-1])
if len(deps) == 0 or not self.include_polygon(props) or props.get('type', '').lower() == 'multilinestring':
continue
outer_ways = []
inner_ways = []
admin_centers = []
for elem_id, elem_type, role in deps:
if role in ('outer', '') and elem_type == 'way':
outer_ways.append(elem_id)
elif role == 'inner' and elem_type == 'way':
inner_ways.append(elem_id)
elif role == 'admin_centre' and elem_type == 'node':
val = self.nodes.get(long(elem_id))
if val is not None:
val['type'] = 'node'
val['id'] = long(elem_id)
admin_centers.append(val)
elif role == 'label' and elem_type == 'node':
val = self.nodes.get(long(elem_id))
if val is not None and val.get('name', six.u('')).lower() == props.get('name', six.u('')).lower():
props.update({k: v for k, v in six.iteritems(val)
if k not in props})
admin_center = {}
if len(admin_centers) == 1:
admin_center = admin_centers[0]
relation_id_offset = RELATION_OFFSET + relation_id
if not properties_only:
outer_polys = self.create_polygons(outer_ways)
inner_polys = self.create_polygons(inner_ways)
yield relation_id_offset, props, admin_center, outer_polys, inner_polys
else:
yield relation_id_offset, props, admin_center
if i % 1000 == 0 and i > 0:
self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
i += 1
class OSMAdminPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'boundary' in props or 'place' in props
class OSMSubdivisionPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'landuse' in props or 'place' in props or 'amenity' in props
class OSMBuildingPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'building' in props or 'building:part' in props or props.get('type', None) == 'building'
class OSMCountryPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'ISO3166-1:alpha2' in props or 'ISO3166-2' in props or (props.get('type', 'relation'), safe_encode(props.get('id', ''))) in osm_admin1_ids
class OSMNeighborhoodPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return osm_definitions.meets_definition(props, osm_definitions.NEIGHBORHOOD)
class OSMPostalCodesPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return props.get('boundary') == 'postal_code'
class OSMAirportsPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'aerodrome' in props


@@ -0,0 +1,184 @@
import collections
import os
import six
import yaml
from copy import deepcopy
from geodata.address_formatting.formatter import AddressFormatter
from geodata.configs.utils import recursive_merge, DoesNotExist
from geodata.encoding import safe_encode
this_dir = os.path.realpath(os.path.dirname(__file__))
OSM_BOUNDARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'boundaries', 'osm')
class OSMAddressComponents(object):
'''
Keeps a map of OSM keys and values to the standard components
of an address like city, state, etc. used for address formatting.
When we reverse geocode a point, it will fall into a number of
polygons, and we simply need to assign the names of said polygons
to an address field.
'''
ADMIN_LEVEL = 'admin_level'
# These keys override country-level mappings
global_keys_override = {
'place': {
'island': AddressFormatter.ISLAND,
'islet': AddressFormatter.ISLAND,
'municipality': AddressFormatter.CITY,
'city': AddressFormatter.CITY,
'town': AddressFormatter.CITY,
'township': AddressFormatter.CITY,
'village': AddressFormatter.CITY,
'hamlet': AddressFormatter.CITY,
'suburb': AddressFormatter.SUBURB,
'quarter': AddressFormatter.SUBURB,
'neighbourhood': AddressFormatter.SUBURB
},
'border_type': {
'city': AddressFormatter.CITY
}
}
# These keys are fallbacks in case we haven't added a country config or there is no admin_level tag
global_keys = {
'place': {
'country': AddressFormatter.COUNTRY,
'state': AddressFormatter.STATE,
'region': AddressFormatter.STATE,
'province': AddressFormatter.STATE,
'county': AddressFormatter.STATE_DISTRICT,
},
'gnis:class': {
'populated place': AddressFormatter.CITY,
}
}
def __init__(self, boundaries_dir=OSM_BOUNDARIES_DIR):
self.config = {}
self.use_admin_center = {}
for filename in os.listdir(boundaries_dir):
if not filename.endswith('.yaml'):
continue
country_code = filename.rsplit('.yaml', 1)[0]
data = yaml.load(open(os.path.join(boundaries_dir, filename)))
for prop, values in six.iteritems(data):
if not hasattr(values, 'items'):
# non-dict key
continue
for k, v in values.iteritems():
if isinstance(v, six.string_types) and v not in AddressFormatter.address_formatter_fields:
raise ValueError(u'Invalid value in {} for prop={}, key={}: {}'.format(filename, prop, k, v))
if prop == 'overrides':
self.use_admin_center.update({(r['type'], safe_encode(r['id'])): r.get('probability', 1.0) for r in values.get('use_admin_center', [])})
containing_overrides = values.get('contained_by', {})
if not containing_overrides:
continue
for id_type, vals in six.iteritems(containing_overrides):
for element_id in vals:
override_config = vals[element_id]
config = deepcopy(data)
config.pop('overrides')
recursive_merge(config, override_config)
vals[element_id] = config
self.config[country_code] = data
def component(self, country, prop, value):
component = self.global_keys_override.get(prop, {}).get(value, None)
if component is not None:
return component
component = self.config.get(country, {}).get(prop, {}).get(value, None)
if component is not None:
return component
return self.global_keys.get(prop, {}).get(value, None)
def component_from_properties(self, country, properties, containing=(), global_keys=True):
country_config = self.config.get(country, {})
config = country_config
overrides = country_config.get('overrides')
if overrides:
id_overrides = overrides.get('id', {})
element_type = properties.get('type')
element_id = properties.get('id')
override_value = id_overrides.get(element_type, {})
element_id = six.binary_type(element_id or '')
if element_id in override_value:
return override_value[element_id]
contained_by_overrides = overrides.get('contained_by')
if contained_by_overrides and containing:
# Note, containing should be passed in from smallest to largest
for containing_type, containing_id in containing:
override_config = contained_by_overrides.get(containing_type, {}).get(six.binary_type(containing_id or ''), None)
if override_config:
config = override_config
break
values = [(k.lower(), v.lower()) for k, v in six.iteritems(properties) if isinstance(v, six.string_types)]
global_overrides_last = config.get('global_overrides_last', False)
# place=city, place=suburb, etc. override per-country boundaries
if not global_overrides_last:
for k, v in values:
containing_component = self.global_keys_override.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
if k != self.ADMIN_LEVEL and k in config:
containing_component = config.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
# admin_level tags are mapped per country
for k, v in values:
containing_component = config.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
# other place keys like place=state, etc. serve as a backup
# when no admin_level tags are available
for k, v in values:
containing_component = self.global_keys.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
if global_overrides_last:
for k, v in values:
containing_component = self.global_keys_override.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
return None
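# A precedence sketch with hypothetical input: for country='fr' and
# properties={'admin_level': '8', 'boundary': 'administrative'}, the global
# place/border_type overrides are consulted first, then the per-country
# mappings loaded from the country's YAML config, and finally the
# global_keys fallbacks; the first match wins (unless the config sets
# global_overrides_last, which defers the global overrides to the end).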
osm_address_components = OSMAddressComponents()


@@ -0,0 +1,89 @@
import os
import re
import six
from collections import defaultdict
from geodata.graph.topsort import topsort
this_dir = os.path.realpath(os.path.dirname(__file__))
DEFAULT_SCRIPT_PATH = os.path.join(this_dir, 'fetch_osm_address_data.sh')
valid_key_regex = re.compile('VALID_(.*?)_KEYS="(.*)"')
variable_regex = re.compile(r'\$VALID_(.*?)_KEYS(?=\b)')
kv_regex = re.compile('([^\s]*)=([^\s]*)')
class OSMDefinitions(object):
ALL = '*'
ADMIN_BORDER = 'admin_border'
ADMIN_NODE = 'admin_node'
AEROWAY = 'aeroway'
AMENITY = 'amenity'
BUILDING = 'building'
HISTORIC = 'historic'
LANDUSE = 'landuse'
NATURAL = 'natural'
LOCALITY = 'locality'
NEIGHBORHOOD = 'neighborhood'
EXTENDED_NEIGHBORHOOD = 'extended_neighborhood'
OFFICE = 'office'
PLACE = 'place'
POPULATED_PLACE = 'populated_place'
SHOP = 'shop'
TOURISM = 'tourism'
VENUE = 'venue'
WATERWAY = 'waterway'
def __init__(self, filename=DEFAULT_SCRIPT_PATH):
script = open(filename).read()
dependencies = defaultdict(list)
definitions = {}
matches = valid_key_regex.findall(script)
match_text = {d.lower(): t for d, t in matches}
for definition, text in matches:
variables = variable_regex.findall(text)
if not variables:
dependencies[definition.lower()] = []
for v in variables:
dependencies[definition.lower()].append(v.lower())
for definition in topsort(dependencies):
definition = definition.lower()
text = match_text[definition]
variables = variable_regex.findall(text)
for v in variables:
v = v.lower()
text = text.replace('$VALID_{}_KEYS'.format(v.upper()), match_text[v])
kvs = defaultdict(set)
for k, v in kv_regex.findall(text):
if v != '':
kvs[k].add(v.lower())
else:
kvs[k].add(self.ALL)
definitions[definition] = kvs
self.definitions = definitions
def meets_definition(self, props, category):
defs = self.definitions.get(category, {})
if not defs:
return False
elif self.ALL in defs:
return True
for k, v in six.iteritems(props):
if v.lower() in defs.get(k.lower(), set()):
return True
return False
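# Example with a hypothetical parsed definition: if definitions contains
# {'neighborhood': {'place': {'neighbourhood', 'neighborhood'}}}, then
# meets_definition({'place': 'Neighbourhood'}, 'neighborhood') is True,
# since keys and values are compared case-insensitively.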
osm_definitions = OSMDefinitions()


@@ -0,0 +1,207 @@
'''
geodata.osm.extract
-------------------
Extracts nodes/ways/relations, their metadata and dependencies
from .osm XML files.
'''
import re
import six
import urllib
import HTMLParser
from collections import OrderedDict
from lxml import etree
from geodata.csv_utils import unicode_csv_reader
from geodata.text.normalize import normalize_string, NORMALIZE_STRING_DECOMPOSE, NORMALIZE_STRING_LATIN_ASCII
from geodata.encoding import safe_decode, safe_encode
WAY_OFFSET = 10 ** 15
RELATION_OFFSET = 2 * 10 ** 15
NODE = 'node'
WAY = 'way'
RELATION = 'relation'
ALL_OSM_TAGS = set([NODE, WAY, RELATION])
WAYS_RELATIONS = set([WAY, RELATION])
OSM_NAME_TAGS = (
'name',
'alt_name',
'int_name',
'nat_name',
'reg_name',
'loc_name',
'official_name',
'commonname',
'common_name',
'place_name',
'short_name',
)
OSM_BASE_NAME_TAGS = (
'tiger:name_base',
)
def parse_osm(filename, allowed_types=ALL_OSM_TAGS, dependencies=False):
'''
Parse a file in .osm format iteratively, generating tuples like:
('node:1', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), []),
('node:2', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), []),
('node:3', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), []),
('node:4', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), []),
('way:4444', OrderedDict([('name', 'Main Street')]), [1, 2, 3, 4])
The third element holds the node/member dependencies when
dependencies=True and is None otherwise.
'''
f = open(filename)
parser = etree.iterparse(f)
single_type = len(allowed_types) == 1
for (_, elem) in parser:
elem_id = long(elem.attrib.pop('id', 0))
item_type = elem.tag
if elem_id >= WAY_OFFSET and elem_id < RELATION_OFFSET:
elem_id -= WAY_OFFSET
item_type = 'way'
elif elem_id >= RELATION_OFFSET:
elem_id -= RELATION_OFFSET
item_type = 'relation'
if item_type in allowed_types:
attrs = OrderedDict(elem.attrib)
attrs['type'] = item_type
attrs['id'] = safe_encode(elem_id)
top_level_attrs = set(attrs)
deps = [] if dependencies else None
for e in elem.getchildren():
if e.tag == 'tag':
# Prevent user-defined lat/lon keys from overriding the lat/lon on the node
key = e.attrib['k']
if key not in top_level_attrs:
attrs[key] = e.attrib['v']
elif dependencies and item_type == 'way' and e.tag == 'nd':
deps.append(long(e.attrib['ref']))
elif dependencies and item_type == 'relation' and e.tag == 'member' and 'role' in e.attrib:
deps.append((long(e.attrib['ref']), e.attrib.get('type'), e.attrib['role']))
key = elem_id if single_type else '{}:{}'.format(item_type, elem_id)
yield key, attrs, deps
if elem.tag in ALL_OSM_TAGS:
elem.clear()
while elem.getprevious() is not None:
del elem.getparent()[0]
def osm_type_and_id(element_id):
element_id = long(element_id)
if element_id >= RELATION_OFFSET:
id_type = RELATION
element_id -= RELATION_OFFSET
elif element_id >= WAY_OFFSET:
id_type = WAY
element_id -= WAY_OFFSET
else:
id_type = NODE
return id_type, element_id
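# Example: ids produced by osmconvert --all-to-nodes are offset by element
# type, so osm_type_and_id(10 ** 15 + 123) == ('way', 123) and
# osm_type_and_id(2 * 10 ** 15 + 123) == ('relation', 123).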
apposition_regex = re.compile('(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)
html_parser = HTMLParser.HTMLParser()
def normalize_wikipedia_title(title):
match = apposition_regex.match(title)
if match:
title = match.group(1)
title = safe_decode(title)
title = html_parser.unescape(title)
title = urllib.unquote_plus(title)
return title.replace(u'_', u' ').strip()
def osm_wikipedia_title_and_language(key, value):
language = None
if u':' in key:
key, language = key.rsplit(u':', 1)
if u':' in value:
possible_language = value.split(u':', 1)[0]
if len(possible_language) == 2 and language is None:
language = possible_language
value = value.rsplit(u':', 1)[-1]
return normalize_wikipedia_title(value), language
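# Example: osm_wikipedia_title_and_language('wikipedia', u'fr:Paris (ville)')
# returns (u'Paris', u'fr'); the language prefix is split off the value and
# normalize_wikipedia_title strips the parenthesized apposition.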
non_breaking_dash = six.u('[-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]')
simple_number = six.u('(?:{})?[0-9]+(?:\.[0-9]+)?').format(non_breaking_dash)
simple_number_regex = re.compile(simple_number, re.UNICODE)
non_breaking_dash_regex = re.compile(non_breaking_dash, re.UNICODE)
number_range_regex = re.compile(six.u('({}){}({})').format(simple_number, non_breaking_dash, simple_number), re.UNICODE)
letter_range_regex = re.compile(r'([^\W\d_]){}([^\W\d_])'.format(non_breaking_dash.encode('unicode-escape')), re.UNICODE)
number_split_regex = re.compile('[,;]')
def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
value = normalize_string(value, string_options=NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE)
numbers = []
values = number_split_regex.split(value)
for val in values:
val = val.strip()
match = number_range_regex.match(val)
if match:
start_num, end_num = match.groups()
start_num_len = len(start_num)
zfill = 0
if start_num.startswith('0'):
zfill = start_num_len
try:
start_num = int(start_num)
end_num = int(end_num)
if end_num > start_num:
if end_num - start_num > max_range:
end_num = start_num + max_range
for i in xrange(start_num, end_num + 1):
numbers.append(safe_decode(i).zfill(zfill))
else:
numbers.append(val.strip().zfill(zfill))
continue
except (TypeError, ValueError):
numbers.append(safe_decode(val).strip().zfill(zfill))
continue
else:
letter_match = letter_range_regex.match(val)
if letter_match and parse_letter_range:
start_num, end_num = letter_match.groups()
start_num = ord(start_num)
end_num = ord(end_num)
if end_num > start_num:
if end_num - start_num > max_range:
end_num = start_num + max_range
for i in xrange(start_num, end_num + 1):
numbers.append(six.unichr(i))
else:
numbers.extend([six.unichr(start_num), six.unichr(end_num)])
continue
else:
numbers.append(safe_decode(val.strip()))
return numbers
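# Examples: parse_osm_number_range(u'2-8') expands to [u'2', u'3', ..., u'8'],
# u'10;12' splits into [u'10', u'12'], and u'A-C' yields [u'A', u'B', u'C']
# when parse_letter_range=True (the default).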


@@ -0,0 +1,282 @@
#!/usr/bin/env bash
: '
fetch_osm_address_data.sh
-------------------------
Shell script to download OSM planet and derive inputs
for language detection and address parser training set
construction.
Usage: ./fetch_osm_address_data.sh out_dir
'
if [ "$#" -ge 1 ]; then
OUT_DIR=$1
else
OUT_DIR=`pwd`
fi
set -e
THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
RESOURCES_DIR=$THIS_DIR/../../../resources
ADMIN1_FILE=$RESOURCES_DIR/language/regional/adm1.tsv
# Check for osmfilter and osmconvert
if ! type -P osmfilter osmconvert > /dev/null; then
cat << EOF
ERROR: osmfilter and osmconvert are required
On Debian/Ubuntu:
sudo apt-get install osmctools
Or to compile:
wget -O - http://m.m.i24.cc/osmfilter.c |cc -x c - -O3 -o osmfilter
wget -O - http://m.m.i24.cc/osmconvert.c | cc -x c - -lz -O3 -o osmconvert
EOF
exit 127
fi
PREV_DIR=`pwd`
cd $OUT_DIR
# Download planet as PBF
# TODO: currently uses single mirror, randomly choose one instead
echo "Started OSM download: `date`"
PLANET_PBF="planet-latest.osm.pbf"
JAPAN_PBF="japan-latest.osm.pbf"
wget --quiet http://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-latest.osm.pbf -O $OUT_DIR/$PLANET_PBF &
wget --quiet http://download.geofabrik.de/asia/japan-latest.osm.pbf -O $OUT_DIR/$JAPAN_PBF &
wait
echo "Converting to o5m: `date`"
PLANET_O5M="planet-latest.o5m"
JAPAN_O5M="japan-latest.o5m"
# Needs to be in O5M format for some of the subsequent steps to work, whereas PBF is smaller for the download
osmconvert $PLANET_PBF -o=$PLANET_O5M &
osmconvert $JAPAN_PBF -o=$JAPAN_O5M &
wait
rm $PLANET_PBF
rm $JAPAN_PBF
VALID_AEROWAY_KEYS="aeroway=aerodrome"
VALID_AMENITY_KEYS="amenity=ambulance_station or amenity=animal_boarding or amenity=animal_shelter or amenity=arts_centre or amenity=auditorium or amenity=baby_hatch or amenity=bank or amenity=bar or amenity=bbq or amenity=biergarten or amenity=boathouse or amenity=boat_rental or amenity=boat_sharing or amenity=boat_storage or amenity=brothel or amenity=bureau_de_change or amenity=bus_station or amenity=cafe or amenity=car_rental or amenity=car_sharing or amenity=car_wash or amenity=casino or amenity=cemetery or amenity=charging_station or amenity=cinema or amenity=childcare or amenity=clinic or amenity=club or amenity=clock or amenity=college or amenity=community_center or amenity=community_centre or amenity=community_hall or amenity=concert_hall or amenity=conference_centre or amenity=courthouse or amenity=coworking_space or amenity=crematorium or amenity=crypt or amenity=culture_center or amenity=dancing_school or amenity=dentist or amenity=dive_centre or amenity=doctors or amenity=dojo or amenity=dormitory or amenity=driving_school or amenity=embassy or amenity=emergency_service or amenity=events_venue or amenity=exhibition_centre or amenity=fast_food or amenity=ferry_terminal or amenity=festival_grounds or amenity=fire_station or amenity=food_count or amenity=fountain or amenity=gambling or amenity=game_feeding or amenity=grave_yard or amenity=greenhouse or amenity=gym or amenity=hall or amenity=health_centre or amenity=hospice or amenity=hospital or amenity=hotel or amenity=hunting_stand or amenity=ice_cream or amenity=internet_cafe or amenity=kindergarten or amenity=kiosk or amenity=kneipp_water_cure or amenity=language_school or amenity=lavoir or amenity=library or amenity=love_hotel or amenity=market or amenity=marketplace or amenity=medical_centre or amenity=mobile_money_agent or amenity=monastery or amenity=money_transfer or amenity=mortuary or amenity=mountain_rescue or amenity=music_school or amenity=music_venue or amenity=nightclub or amenity=nursery or amenity=nursing_home or amenity=office or amenity=parish_hall or amenity=park or amenity=pharmacy or amenity=planetarium or amenity=place_of_worship or amenity=police or amenity=post_office or amenity=preschool or amenity=prison or amenity=pub or amenity=public_bath or amenity=public_bookcase or amenity=public_building or amenity=public_facility or amenity=public_hall or amenity=public_market or amenity=ranger_station or amenity=refugee_housing or amenity=register_office or amenity=research_institute or amenity=rescue_station or amenity=residential or amenity=Residential or amenity=restaurant or amenity=retirement_home or amenity=sacco or amenity=sanitary_dump_station or amenity=sanitorium or amenity=sauna or amenity=school or amenity=shelter or amenity=shop or amenity=shopping or amenity=shower or amenity=ski_rental or amenity=ski_school or amenity=social_centre or amenity=social_club or amenity=social_facility or amenity=spa or amenity=stables or amenity=stripclub or amenity=studio or amenity=supermarket or amenity=swimming_pool or amenity=swingerclub or amenity=townhall or amenity=theatre or amenity=training or amenity=trolley_bay or amenity=university or amenity=vehicle_inspection or amenity=veterinary or amenity=village_hall or amenity=vivarium or amenity=waste_transfer_station or amenity=whirlpool or amenity=winery or amenity=youth_centre"
GENERIC_AMENITIES="amenity=atm or amenity=bench or amenity=bicycle_parking or amenity=bicycle_rental or amenity=bicycle_repair_station or amenity=compressed_air or amenity=drinking_water or amenity=emergency_phone or amenity=fire_hydrant or amenity=fuel or amenity=grit_bin or amenity=motorcycle_parking or amenity=parking or amenity=parking_space or amenity=post_box or amenity=reception_area or amenity=recycling or amenity=taxi or amenity=telephone or amenity=ticket_validator or amenity=toilets or amenity=vending_machine or amenity=waste_basket or amenity=waste_disposal or amenity=water_point or amenity=watering_place or amenity=wifi"
VALID_OFFICE_KEYS="office=accountant or office=administrative or office=administration or office=advertising_agency or office=architect or office=association or office=camping or office=charity or office=company or office=consulting or office=educational_institution or office=employment_agency or office=estate_agent or office=financial or office=forestry or office=foundation or office=government or office=insurance or office=it or office=lawyer or office=newspaper or office=ngo or office=notary or office=parish or office=physician or office=political_party or office=publisher or office=quango or office=real_estate_agent or office=realtor or office=register or office=religion or office=research or office=tax or office=tax_advisor or office=telecommunication or office=therapist or office=travel_agent or office=water_utility"
VALID_SHOP_KEYS="shop="
VALID_HISTORIC_KEYS="historic=archaeological_site or historic=castle or historic=fort or historic=memorial or historic=monument or historic=ruins or historic=tomb"
VALID_PLACE_KEYS="place=farm or place=isolated_dwelling or place=square"
VALID_TOURISM_KEYS="tourism=hotel or tourism=attraction or tourism=guest_house or tourism=museum or tourism=chalet or tourism=motel or tourism=hostel or tourism=alpine_hut or tourism=theme_park or tourism=zoo or tourism=apartment or tourism=wilderness_hut or tourism=gallery or tourism=bed_and_breakfast or tourism=hanami or tourism=wine_cellar or tourism=resort or tourism=aquarium or tourism=apartments or tourism=cabin or tourism=winery or tourism=hut"
VALID_LEISURE_KEYS="leisure=adult_gaming_centre or leisure=amusement_arcade or leisure=arena or leisure=bandstand or leisure=beach_resort or leisure=bbq or leisure=bird_hide or leisure=bowling_alley or leisure=casino or leisure=common or leisure=club or leisure=dance or leisure=dancing or leisure=disc_golf_course or leisure=dog_park or leisure=fishing or leisure=fitness_centre or leisure=gambling or leisure=garden or leisure=golf_course or leisure=hackerspace or leisure=horse_riding or leisure=hospital or leisure=hot_spring or leisure=ice_rink or leisure=landscape_reserve or leisure=marina or leisure=maze or leisure=miniature_golf or leisure=nature_reserve or leisure=paddling_pool or leisure=park or leisure=pitch or leisure=playground or leisure=recreation_ground or leisure=resort or leisure=sailing_club or leisure=sauna or leisure=social_club or leisure=sports_centre or leisure=stadium or leisure=summer_camp or leisure=swimming_pool or leisure=tanning_salon or leisure=track or leisure=trampoline_park or leisure=turkish_bath or leisure=video_arcade or leisure=water_park or leisure=wildlife_hide"
VALID_LANDUSE_KEYS="landuse=allotments or landuse=basin or landuse=cemetery or landuse=commercial or landuse=construction or landuse=farmland or landuse=forest or landuse=grass or landuse=greenhouse_horticulture or landuse=industrial or landuse=landfill or landuse=meadow or landuse=military or landuse=orchard or landuse=plant_nursery or landuse=port or landuse=quarry or landuse=recreation_ground or landuse=reservoir or landuse=residential or landuse=retail or landuse=village_green or landuse=vineyard"
VALID_VENUE_KEYS="( ( $VALID_AEROWAY_KEYS ) or ( $VALID_AMENITY_KEYS ) or ( $VALID_HISTORIC_KEYS ) or ( $VALID_OFFICE_KEYS ) or ( $VALID_PLACE_KEYS ) or ( $VALID_SHOP_KEYS ) or ( $VALID_TOURISM_KEYS ) or ( $VALID_LEISURE_KEYS ) or ( $VALID_LANDUSE_KEYS ) )"
# Address data set for use in parser, language detection
echo "Filtering for records with address tags: `date`"
PLANET_ADDRESSES_O5M="planet-addresses.o5m"
JAPAN_ADDRESSES_O5M="japan-addresses.o5m"
VALID_ADDRESSES="( ( ( name= or addr:housename= ) and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS ) ) ) or ( ( addr:street= or addr:place= ) and ( name= or building= or building:levels= or addr:housename= or addr:housenumber= ) )"
VALID_ADDRESSES_JAPAN="( addr:housenumber= or addr:street= ) or ( ( name= or name:ja= or addr:housename= ) and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS ) )"
osmfilter $PLANET_O5M --keep="$VALID_ADDRESSES" --drop-author --drop-version -o=$PLANET_ADDRESSES_O5M &
osmfilter $JAPAN_O5M --keep="$VALID_ADDRESSES_JAPAN" --drop-author --drop-version -o=$JAPAN_ADDRESSES_O5M &
wait
PLANET_ADDRESSES_LATLONS="planet-addresses-latlons.o5m"
JAPAN_ADDRESSES_LATLONS="japan-addresses-latlons.o5m"
osmconvert $PLANET_ADDRESSES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_ADDRESSES_LATLONS &
osmconvert $JAPAN_ADDRESSES_O5M --max-objects=1000000000 --all-to-nodes -o=$JAPAN_ADDRESSES_LATLONS &
wait
rm $PLANET_ADDRESSES_O5M
rm $JAPAN_ADDRESSES_O5M
PLANET_ADDRESSES="planet-addresses.osm"
osmfilter $PLANET_ADDRESSES_LATLONS --keep="$VALID_ADDRESSES" -o=$PLANET_ADDRESSES_O5M &
osmfilter $JAPAN_ADDRESSES_LATLONS --keep="$VALID_ADDRESSES_JAPAN" -o=$JAPAN_ADDRESSES_O5M &
wait
osmconvert $PLANET_ADDRESSES_O5M $JAPAN_ADDRESSES_O5M -o=$PLANET_ADDRESSES
rm $PLANET_ADDRESSES_O5M
rm $JAPAN_ADDRESSES_O5M
rm $PLANET_ADDRESSES_LATLONS
rm $JAPAN_ADDRESSES_LATLONS
# Border data set for use in R-tree index/reverse geocoding, parsing, language detection
echo "Filtering for borders: `date`"
PLANET_COUNTRIES="planet-countries.osm"
PLANET_BORDERS_O5M="planet-borders.o5m"
PLANET_BORDERS="planet-borders.osm"
PLANET_ADMIN_BORDERS_OSM="planet-admin-borders.osm"
VALID_COUNTRY_KEYS="ISO3166-1:alpha2="
VALID_ADMIN1_KEYS="ISO3166-2="
ADMIN1_LANGUAGE_EXCEPTION_IDS=$(grep "osm" $ADMIN1_FILE | sed 's/^.*relation:\([0-9][0-9]*\).*$/@id=\1/' | xargs echo | sed 's/\s/ or /g')
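# e.g. adm1.tsv rows mentioning relation:51477 and relation:62781
# (hypothetical ids) become the osmfilter expression "@id=51477 or @id=62781"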
VALID_ADMIN_BORDER_KEYS="boundary=administrative or boundary=town or boundary=city_limit or boundary=civil_parish or boundary=civil or boundary=ceremonial or boundary=postal_district or place=island or place=city or place=town or place=village or place=hamlet or place=municipality or place=settlement"
VALID_POPULATED_PLACE_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=locality or place=settlement or place=census-designated or place:ph=village"
VALID_NEIGHBORHOOD_KEYS="place=neighbourhood or place=neighborhood or place:ph=barangay"
VALID_EXTENDED_NEIGHBORHOOD_KEYS="place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place:ph=barangay"
VALID_LOCALITY_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place=locality or place=settlement or place=census-designated or place:ph=barangay or place:ph=village"
VALID_ADMIN_NODE_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place=island or place=islet or place=county or place=region or place=state or place=subdistrict or place=township or place=archipelago or place=department or place=country or place=district or place=census-designated or place=ward or place=subward or place=province or place=peninsula or place=settlement or place=subregion"
osmfilter $PLANET_O5M --keep="$VALID_ADMIN_BORDER_KEYS" --drop-author --drop-version -o=$PLANET_ADMIN_BORDERS_OSM &
osmfilter $PLANET_O5M --keep="$VALID_ADMIN_BORDER_KEYS or $VALID_LOCALITY_KEYS" --drop-author --drop-version -o=$PLANET_BORDERS_O5M &
wait
PLANET_ADMIN_NODES="planet-admin-nodes.osm"
osmfilter $PLANET_O5M --keep="$VALID_ADMIN_NODE_KEYS" --drop-ways --drop-relations --ignore-dependencies --drop-author --drop-version -o=$PLANET_ADMIN_NODES
PLANET_BORDERS_LATLONS="planet-borders-latlons.o5m"
osmconvert $PLANET_BORDERS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_BORDERS_LATLONS
rm $PLANET_BORDERS_O5M
osmfilter $PLANET_BORDERS_LATLONS --keep="$VALID_ADMIN_BORDER_KEYS or $VALID_LOCALITY_KEYS" -o=$PLANET_BORDERS
rm $PLANET_BORDERS_LATLONS
osmfilter $PLANET_O5M --keep="$VALID_COUNTRY_KEYS or $VALID_ADMIN1_KEYS or $ADMIN1_LANGUAGE_EXCEPTION_IDS" --drop-author --drop-version -o=$PLANET_COUNTRIES
echo "Filtering for neighborhoods"
PLANET_LOCALITIES="planet-localities.osm"
PLANET_NEIGHBORHOOD_BORDERS="planet-neighborhood-borders.osm"
osmfilter $PLANET_O5M --keep="$VALID_NEIGHBORHOOD_KEYS" --drop-author --drop-version -o=$PLANET_NEIGHBORHOOD_BORDERS
osmfilter $PLANET_O5M --keep="name= and ( $VALID_LOCALITY_KEYS )" --drop-relations --drop-ways --ignore-dependencies --drop-author --drop-version -o=$PLANET_LOCALITIES
echo "Filtering for rail stations"
VALID_RAIL_STATION_KEYS="railway=station"
PLANET_RAILWAYS_O5M="planet-rail-stations.o5m"
PLANET_RAILWAYS="planet-rail-stations.osm"
osmfilter $PLANET_O5M --keep="$VALID_RAIL_STATION_KEYS" --drop-author --drop-version -o=$PLANET_RAILWAYS_O5M
PLANET_RAILWAYS_LATLONS="planet-rail-stations-latlons.o5m"
osmconvert $PLANET_RAILWAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_RAILWAYS_LATLONS
rm $PLANET_RAILWAYS_O5M
osmfilter $PLANET_RAILWAYS_LATLONS --keep="$VALID_RAIL_STATION_KEYS" -o=$PLANET_RAILWAYS
rm $PLANET_RAILWAYS_LATLONS
echo "Filtering for airports and terminals"
VALID_AIRPORT_KEYS="aeroway=aerodrome or aeroway=terminal"
PLANET_AIRPORTS_O5M="planet-airports.o5m"
PLANET_AIRPORTS="planet-airports.osm"
osmfilter $PLANET_O5M --keep="$VALID_AIRPORT_KEYS" --drop-author --drop-version -o=$PLANET_AIRPORTS_O5M
PLANET_AIRPORTS_LATLONS="planet-airports-latlons.o5m"
osmconvert $PLANET_AIRPORTS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_AIRPORTS_LATLONS
PLANET_AIRPORT_POLYGONS="planet-airport-polygons.osm"
osmconvert $PLANET_AIRPORTS_O5M -o=$PLANET_AIRPORT_POLYGONS
rm $PLANET_AIRPORTS_O5M
osmfilter $PLANET_AIRPORTS_LATLONS --keep="$VALID_AIRPORT_KEYS" -o=$PLANET_AIRPORTS
rm $PLANET_AIRPORTS_LATLONS
echo "Filtering for subdivision polygons"
PLANET_SUBDIVISIONS="planet-subdivisions.osm"
SUBDIVISION_AMENITY_TYPES="amenity=university or amenity=college or amenity=school or amenity=hospital"
SUBDIVISION_LANDUSE_TYPES="landuse=residential or landuse=commercial or landuse=industrial or landuse=retail or landuse=military"
SUBDIVISION_PLACE_TYPES="place=allotments or place=city_block or place=block or place=plot or place=subdivision"
osmfilter $PLANET_O5M --keep="( $SUBDIVISION_AMENITY_TYPES or $SUBDIVISION_PLACE_TYPES or $SUBDIVISION_LANDUSE_TYPES )" --drop="( place= and not ( $SUBDIVISION_PLACE_TYPES ) ) or boundary=" --drop-author --drop-version -o=$PLANET_SUBDIVISIONS
echo "Filtering for postal_code polygons"
PLANET_POSTAL_CODES="planet-postcodes.osm"
osmfilter $PLANET_O5M --keep="boundary=postal_code" --drop-author --drop-version -o=$PLANET_POSTAL_CODES
# Venue data set for use in venue classification
echo "Filtering for venue records: `date`"
PLANET_VENUES_O5M="planet-venues.o5m"
osmfilter $PLANET_O5M --keep="( name= and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS or ( $VALID_RAIL_STATION_KEYS and addr:street= and ( wikipedia= or wikipedia:*= ) ) ) )" --drop-author --drop-version -o=$PLANET_VENUES_O5M
PLANET_VENUES_LATLONS="planet-venues-latlons.o5m"
osmconvert $PLANET_VENUES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_VENUES_LATLONS
rm $PLANET_VENUES_O5M
PLANET_VENUES="planet-venues.osm"
osmfilter $PLANET_VENUES_LATLONS --keep="name= and ( ( building= and building!=yes ) or ( $VALID_VENUE_KEYS or ( $VALID_RAIL_STATION_KEYS and addr:street= and ( wikipedia= or wikipedia:*= ) ) ) )" -o=$PLANET_VENUES
rm $PLANET_VENUES_LATLONS
# Categories for building generic queries like "restaurants in Brooklyn"
echo "Filtering for buildings: `date`"
PLANET_BUILDINGS_O5M="planet-buildings.o5m"
VALID_BUILDING_KEYS="building= or building:part="
VALID_BUILDINGS="( ( $VALID_BUILDING_KEYS ) and ( building!=yes or name= or addr:housename= or addr:street= or addr:housenumber= or addr:postcode= ) )"
osmfilter $PLANET_O5M --keep="$VALID_BUILDINGS" --drop-author --drop-version -o=$PLANET_BUILDINGS_O5M
PLANET_BUILDINGS_LATLONS="planet-buildings-latlons.o5m"
osmconvert $PLANET_BUILDINGS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_BUILDINGS_LATLONS
rm $PLANET_BUILDINGS_O5M
PLANET_BUILDINGS="planet-buildings.osm"
osmfilter $PLANET_BUILDINGS_LATLONS --keep="$VALID_BUILDINGS" -o=$PLANET_BUILDINGS
rm $PLANET_BUILDINGS_LATLONS
echo "Filtering for building polygons: `date`"
PLANET_BUILDING_POLYGONS="planet-building-polygons.osm"
osmfilter $PLANET_O5M --keep="( ( building= or building:part= or type=building ) and ( building:levels= or name= or addr:street= or addr:place= or addr:housename= or addr:housenumber= ) )" --drop-author --drop-version -o=$PLANET_BUILDING_POLYGONS
echo "Filtering for amenities: `date`"
PLANET_AMENITIES_O5M="planet-amenities.o5m"
ALL_AMENITIES="aeroway= or amenity= or emergency= or historic= or internet_access= or landuse= or leisure= or man_made= or mountain_pass= or office= or place= or railway= or shop= or tourism="
osmfilter $PLANET_O5M --keep="$ALL_AMENITIES" --drop-author --drop-version -o=$PLANET_AMENITIES_O5M
PLANET_AMENITIES_LATLONS="planet-amenities-latlons.o5m"
osmconvert $PLANET_AMENITIES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_AMENITIES_LATLONS
rm $PLANET_AMENITIES_O5M
PLANET_AMENITIES="planet-amenities.osm"
osmfilter $PLANET_AMENITIES_LATLONS --keep="$ALL_AMENITIES" -o=$PLANET_AMENITIES
rm $PLANET_AMENITIES_LATLONS
echo "Filtering for natural: `date`"
PLANET_NATURAL_O5M="planet-natural.o5m"
VALID_NATURAL_KEYS="natural="
osmfilter $PLANET_O5M --keep="$VALID_NATURAL_KEYS" --drop-author --drop-version -o=$PLANET_NATURAL_O5M
PLANET_NATURAL_LATLONS="planet-natural-latlons.o5m"
osmconvert $PLANET_NATURAL_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_NATURAL_LATLONS
rm $PLANET_NATURAL_O5M
PLANET_NATURAL="planet-natural.osm"
osmfilter $PLANET_NATURAL_LATLONS --keep="$VALID_NATURAL_KEYS" -o=$PLANET_NATURAL
rm $PLANET_NATURAL_LATLONS
echo "Filtering for waterways: `date`"
PLANET_WATERWAYS_O5M="planet-waterways.o5m"
VALID_WATERWAY_KEYS="waterway="
osmfilter $PLANET_O5M --keep="$VALID_WATERWAY_KEYS" --drop-author --drop-version -o=$PLANET_WATERWAYS_O5M
PLANET_WATERWAYS_LATLONS="planet-waterways-latlons.o5m"
osmconvert $PLANET_WATERWAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_WATERWAYS_LATLONS
rm $PLANET_WATERWAYS_O5M
PLANET_WATERWAYS="planet-waterways.osm"
osmfilter $PLANET_WATERWAYS_LATLONS --keep="$VALID_WATERWAY_KEYS" -o=$PLANET_WATERWAYS
rm $PLANET_WATERWAYS_LATLONS
# Streets data set for use in language classification
echo "Filtering ways: `date`"
PLANET_WAYS_O5M="planet-ways.o5m"
VALID_ROAD_TYPES="( highway=motorway or highway=motorway_link or highway=motorway_junction or highway=trunk or highway=trunk_link or highway=primary or highway=primary_link or highway=secondary or highway=secondary_link or highway=tertiary or highway=tertiary_link or highway=unclassified or highway=unclassified_link or highway=residential or highway=residential_link or highway=service or highway=service_link or highway=living_street or highway=pedestrian or highway=steps or highway=cycleway or highway=bridleway or highway=track or highway=road or ( highway=path and ( motorvehicle=yes or motorcar=yes ) ) )"
osmfilter planet-latest.o5m --keep="name= and $VALID_ROAD_TYPES" --drop-relations --drop-author --drop-version -o=$PLANET_WAYS_O5M
PLANET_WAYS_NODES_LATLON="planet-ways-nodes-latlons.o5m"
osmconvert $PLANET_WAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_WAYS_NODES_LATLON
# 10^15 is the offset used for ways and relations with --all-to-nodes; filtering below extracts just the ways
PLANET_WAYS_LATLONS="planet-ways-latlons.osm"
PLANET_WAYS="planet-ways.osm"
osmfilter $PLANET_WAYS_NODES_LATLON --keep="name= and ( $VALID_ROAD_TYPES )" -o=$PLANET_WAYS
osmfilter $PLANET_WAYS_O5M --keep="name= and ( $VALID_ROAD_TYPES )" -o=$PLANET_WAYS_LATLONS
rm $PLANET_WAYS_NODES_LATLON
rm $PLANET_WAYS_O5M
rm $PLANET_O5M
rm $JAPAN_O5M
echo "Completed : `date`"
cd $PREV_DIR

File diff suppressed because it is too large


@@ -0,0 +1,189 @@
import argparse
import array
import logging
import numpy
import os
import six
import sys
import ujson as json
from bisect import bisect_left
from leveldb import LevelDB
from itertools import izip, groupby
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.file_utils import ensure_dir
from geodata.osm.extract import *
from geodata.encoding import safe_decode, safe_encode
DEFAULT_INTERSECTIONS_FILENAME = 'intersections.json'
class OSMIntersectionReader(object):
def __init__(self, filename, db_dir):
self.filename = filename
self.node_ids = array.array('l')
self.logger = logging.getLogger('osm.intersections')
# Store these in a LevelDB
ensure_dir(db_dir)
ways_dir = os.path.join(db_dir, 'ways')
ensure_dir(ways_dir)
nodes_dir = os.path.join(db_dir, 'nodes')
ensure_dir(nodes_dir)
self.way_props = LevelDB(ways_dir)
self.node_props = LevelDB(nodes_dir)
# These form a graph and should always have the same length
self.intersection_edges_nodes = array.array('l')
self.intersection_edges_ways = array.array('l')
def binary_search(self, a, x):
'''Locate the leftmost value exactly equal to x'''
i = bisect_left(a, x)
if i != len(a) and a[i] == x:
return i
return None
def intersections(self):
'''
Generator which yields tuples like:
(node_id, node_props, ways)
where node_props is the tag dict for the intersection node and ways
is a list of property dicts for the distinctly-named ways meeting there
'''
i = 0
node_ids = array.array('l')
node_counts = array.array('i')
for element_id, props, deps in parse_osm(self.filename, dependencies=True):
props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
if element_id.startswith('node'):
node_id = long(element_id.split(':')[-1])
node_ids.append(node_id)
node_counts.append(0)
self.node_props.Put(safe_encode(node_id), json.dumps(props))
elif element_id.startswith('way'):
# Don't care about the ordering of the nodes, and want uniques e.g. for circular roads
deps = set(deps)
# Get node indices by binary search
for node_id in deps:
node_index = self.binary_search(node_ids, node_id)
if node_index is None:
continue
node_counts[node_index] += 1
if i % 1000 == 0 and i > 0:
self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
i += 1
for i, count in enumerate(node_counts):
if count > 1:
self.node_ids.append(node_ids[i])
del node_ids
del node_counts
i = 0
for element_id, props, deps in parse_osm(self.filename, dependencies=True):
if element_id.startswith('node'):
node_id = long(element_id.split(':')[-1])
node_index = self.binary_search(self.node_ids, node_id)
elif element_id.startswith('way'):
props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
way_id = long(element_id.split(':')[-1])
props['id'] = way_id
for node_id in deps:
node_index = self.binary_search(self.node_ids, node_id)
if node_index is not None:
self.intersection_edges_nodes.append(node_id)
self.intersection_edges_ways.append(way_id)
self.way_props.Put(safe_encode(way_id), json.dumps(props))
if i % 1000 == 0 and i > 0:
self.logger.info('second pass, doing {}s, at {}'.format(element_id.split(':')[0], i))
i += 1
i = 0
indices = numpy.argsort(self.intersection_edges_nodes)
self.intersection_edges_nodes = numpy.fromiter((self.intersection_edges_nodes[i] for i in indices), dtype=numpy.uint64)
self.intersection_edges_ways = numpy.fromiter((self.intersection_edges_ways[i] for i in indices), dtype=numpy.uint64)
del indices
idx = 0
# Walk the sorted edge arrays in order, grouping edges by node id
for node_id, g in groupby(self.intersection_edges_nodes):
group_len = sum((1 for j in g))
node_props = json.loads(self.node_props.Get(safe_encode(node_id)))
way_indices = self.intersection_edges_ways[idx:idx + group_len]
all_ways = [json.loads(self.way_props.Get(safe_encode(w))) for w in way_indices]
way_names = set()
ways = []
for way in all_ways:
if way['name'] in way_names:
continue
ways.append(way)
way_names.add(way['name'])
idx += group_len
if i % 1000 == 0 and i > 0:
self.logger.info('checking intersections, did {}'.format(i))
i += 1
if len(ways) > 1:
node_index = self.binary_search(self.node_ids, node_id)
yield self.node_ids[node_index], node_props, ways
def create_intersections(self, outfile):
out = open(outfile, 'w')
for node_id, node_props, ways in self.intersections():
d = {'id': safe_encode(node_id),
'node': node_props,
'ways': ways}
out.write(json.dumps(d) + six.u('\n'))
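# Each line written is a JSON object shaped like (hypothetical values):
# {"id": "123456", "node": {"name": "..."},
#  "ways": [{"id": 987, "name": "Main Street"}, ...]}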
@classmethod
def read_intersections(cls, infile):
f = open(infile)
for line in f:
data = json.loads(line.rstrip())
yield data['id'], data['node'], data['ways']
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input',
required=True,
help='Path to planet-ways-latlons.osm')
parser.add_argument('--db-dir',
required=True,
help='Path to temporary db')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
required=True,
help='Output directory')
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
reader = OSMIntersectionReader(args.input, args.db_dir)
reader.create_intersections(os.path.join(args.out_dir, DEFAULT_INTERSECTIONS_FILENAME))


@@ -0,0 +1,563 @@
# -*- coding: utf-8 -*-
'''
osm_address_training_data.py
----------------------------
This script generates several training sets from OpenStreetMap addresses,
streets, venues and toponyms.
Note: the combined size of all the files created by this script exceeds 100GB,
so if training these models, it is wise to use a server-grade machine with
plenty of disk space. The following commands can be used in parallel to create
all the training sets:
Ways:
python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Venues:
python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Limited formatted addresses:
python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) -o $(OUT_DIR)
Formatted addresses (tagged):
python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Formatted addresses (untagged):
python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f -u --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Intersections (after running intersections.py to create the JSON file):
python osm_address_training_data.py -x $(OSM_DIR)/intersections.json -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Toponyms:
python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) -o $(OUT_DIR)
'''
import argparse
import csv
import logging
import os
import operator
import random
import re
import sys
import tempfile
import urllib
import ujson as json
import HTMLParser
from collections import defaultdict, OrderedDict
from lxml import etree
from itertools import ifilter, chain, combinations
from shapely.geos import LOG as shapely_geos_logger
shapely_geos_logger.setLevel(logging.CRITICAL)
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.gazetteers import *
from geodata.addresses.components import AddressComponents
from geodata.coordinates.conversion import *
from geodata.language_id.disambiguation import *
from geodata.language_id.sample import sample_random_language
from geodata.i18n.languages import *
from geodata.metro_stations.reverse_geocode import MetroStationReverseGeocoder
from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder
from geodata.osm.extract import *
from geodata.osm.formatter import OSMAddressFormatter
from geodata.places.reverse_geocode import PlaceReverseGeocoder
from geodata.polygons.language_polys import *
from geodata.polygons.reverse_geocode import *
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.csv_utils import *
from geodata.file_utils import *
# Input files
PLANET_ADDRESSES_INPUT_FILE = 'planet-addresses.osm'
PLANET_WAYS_INPUT_FILE = 'planet-ways.osm'
PLANET_VENUES_INPUT_FILE = 'planet-venues.osm'
PLANET_BORDERS_INPUT_FILE = 'planet-borders.osm'
# Output files
WAYS_LANGUAGE_DATA_FILENAME = 'streets_by_language.tsv'
ADDRESS_LANGUAGE_DATA_FILENAME = 'address_streets_by_language.tsv'
TOPONYM_LANGUAGE_DATA_FILENAME = 'toponyms_by_language.tsv'
def normalize_osm_name_tag(tag, script=False):
norm = tag.rsplit(':', 1)[-1]
if not script:
return norm
return norm.split('_', 1)[0]
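# Examples: normalize_osm_name_tag('name:en') == 'en', and with script=True a
# tag like 'name:zh_pinyin' normalizes to 'zh' (the suffix after '_' is dropped).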
def get_language_names(country_rtree, key, value, tag_prefix='name'):
if not ('lat' in value and 'lon' in value):
return None, None
has_colon = ':' in tag_prefix
tag_first_component = tag_prefix.split(':')[0]
tag_last_component = tag_prefix.split(':')[-1]
try:
latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
except Exception:
return None, None
osm_country_components = country_rtree.point_in_poly(latitude, longitude, return_all=True)
country, candidate_languages = country_rtree.country_and_languages_from_components(osm_country_components)
if not (country and candidate_languages):
return None, None
num_langs = len(candidate_languages)
default_langs = set([l for l, d in candidate_languages if d])
num_defaults = len(default_langs)
name_language = defaultdict(list)
alternate_langs = []
equivalent_alternatives = defaultdict(list)
for k, v in value.iteritems():
if k.startswith(tag_prefix + ':') and normalize_osm_name_tag(k, script=True) in languages:
lang = k.rsplit(':', 1)[-1]
alternate_langs.append((lang, v))
equivalent_alternatives[v].append(lang)
has_alternate_names = len(alternate_langs)
# Some countries like Lebanon list things like name:en == name:fr == "Rue Abdel Hamid Karame"
# Those addresses should be disambiguated rather than taken for granted
ambiguous_alternatives = set([k for k, v in equivalent_alternatives.iteritems() if len(v) > 1])
regional_defaults = 0
country_defaults = 0
regional_langs = set()
country_langs = set()
for c in osm_country_components:
_, langs = country_rtree.country_and_languages_from_components([c])
if 'ISO3166-1:alpha2' not in c:
regional_defaults += sum((1 for l, d in langs if d))
regional_langs |= set([l for l, d in langs])
else:
country_defaults += sum((1 for l, d in langs if d))
country_langs |= set([l for l, d in langs])
ambiguous_already_seen = set()
for k, v in value.iteritems():
if k.startswith(tag_prefix + ':'):
if v not in ambiguous_alternatives:
norm = normalize_osm_name_tag(k)
norm_sans_script = normalize_osm_name_tag(k, script=True)
if norm in languages or norm_sans_script in languages:
name_language[norm].append(v)
elif v not in ambiguous_already_seen:
langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
lang = disambiguate_language(v, langs)
if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
name_language[lang].append(v)
ambiguous_already_seen.add(v)
elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component:
if num_langs == 1:
name_language[candidate_languages[0][0]].append(v)
else:
lang = disambiguate_language(v, candidate_languages)
default_lang = candidate_languages[0][0]
if lang == AMBIGUOUS_LANGUAGE:
return None, None
elif lang == UNKNOWN_LANGUAGE and num_defaults == 1:
name_language[default_lang].append(v)
elif lang != UNKNOWN_LANGUAGE:
if lang != default_lang and lang in country_langs and country_defaults > 1 and regional_defaults > 0 and lang in WELL_REPRESENTED_LANGUAGES:
return None, None
name_language[lang].append(v)
else:
return None, None
return country, name_language
def build_ways_training_data(country_rtree, infile, out_dir, abbreviate_streets=True):
    '''
    Creates a training set for language classification from most OSM ways
    (streets), selected by a fairly lengthy osmfilter definition that tries
    to capture all roads/ways designated for motor vehicle traffic, which is
    more or less what we'd expect to see in addresses.

    The fields are {language, country, street name}. Example:

    ar ma شارع فال ولد عمير
    '''
i = 0
f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w')
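    # 'tsv_no_quote' is a csv dialect assumed to be registered elsewhere in
    # this module; a minimal sketch of such a registration:
    #   csv.register_dialect('tsv_no_quote', delimiter='\t', quoting=csv.QUOTE_NONE)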
writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile, allowed_types=WAYS_RELATIONS):
country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
if not name_language:
continue
for lang, val in name_language.iteritems():
for v in val:
for s in v.split(';'):
if lang in languages:
writer.writerow((lang, country, tsv_string(s)))
if not abbreviate_streets:
continue
abbrev = abbreviate(street_and_synonyms_gazetteer, s, lang)
if abbrev != s:
writer.writerow((lang, country, tsv_string(abbrev)))
if i % 1000 == 0 and i > 0:
print('did {} ways'.format(i))
i += 1
f.close()
NAME_KEYS = (
'name',
'addr:housename',
)
HOUSE_NUMBER_KEYS = (
'addr:house_number',
'addr:housenumber',
'house_number'
)
COUNTRY_KEYS = (
'country',
'country_name',
'addr:country',
'is_in:country',
'addr:country_code',
'country_code',
'is_in:country_code'
)
POSTAL_KEYS = (
'postcode',
'postal_code',
'addr:postcode',
'addr:postal_code',
)
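# Each tuple lists the OSM tagging variants treated as interchangeable for a
# single address field when extracting components.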
def build_toponym_training_data(country_rtree, infile, out_dir):
    '''
    Data set of toponyms by language and country, which should assist in
    language classification. OSM tends to use the native language by default
    (e.g. Москва instead of Moscow). Toponyms get messy due to factors like
    colonialism, historical names, name borrowing and the general shortness
    of the names, so here we're stricter about what constitutes a valid
    language for a given country.

    Example:

    ja jp 東京都
    '''
i = 0
f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile):
if not any((k.startswith('name') for k, v in value.iteritems())):
continue
try:
latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
except Exception:
continue
osm_country_components = country_rtree.point_in_poly(latitude, longitude, return_all=True)
country, candidate_languages = country_rtree.country_and_languages_from_components(osm_country_components)
if not (country and candidate_languages):
continue
name_language = defaultdict(list)
official = official_languages[country]
default_langs = set([l for l, default in official.iteritems() if default])
        _, regional_langs = country_rtree.country_and_languages_from_components(
            [c for c in osm_country_components if 'ISO3166-1:alpha2' not in c])
top_lang = None
if len(official) > 0:
top_lang = official.iterkeys().next()
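            # Assumes official_languages preserves insertion order (e.g. an
            # OrderedDict) with the most prominent language first.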
# E.g. Hindi in India, Urdu in Pakistan
if top_lang is not None and top_lang not in WELL_REPRESENTED_LANGUAGES and len(default_langs) > 1:
default_langs -= WELL_REPRESENTED_LANGUAGES
valid_languages = set([l for l, d in candidate_languages])
        '''
        WELL_REPRESENTED_LANGUAGES are languages like English, French, etc.
        for which we have a lot of data. WELL_REPRESENTED_LANGUAGE_COUNTRIES
        are more-or-less the "origin" countries for those languages, where we
        can take place names as examples of the language itself (e.g. place
        names in France are examples of French, whereas place names in much of
        Francophone Africa tend to get their names from languages other than
        French, even though French is the official language).
        '''
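        # e.g. if 'in' (India) is not in WELL_REPRESENTED_LANGUAGE_COUNTRIES['en'],
        # English is dropped as a candidate there unless it is among the
        # default languages added back just below.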
        valid_languages -= set([lang for lang in valid_languages
                                if lang in WELL_REPRESENTED_LANGUAGES and
                                country not in WELL_REPRESENTED_LANGUAGE_COUNTRIES[lang]])
valid_languages |= default_langs
if not valid_languages:
continue
have_qualified_names = False
for k, v in value.iteritems():
if not k.startswith('name:'):
continue
norm = normalize_osm_name_tag(k)
norm_sans_script = normalize_osm_name_tag(k, script=True)
if norm in languages:
lang = norm
elif norm_sans_script in languages:
lang = norm_sans_script
else:
continue
if lang in valid_languages:
have_qualified_names = True
name_language[lang].append(v)
        if (not have_qualified_names and top_lang is not None and
                len(regional_langs) <= 1 and 'name' in value and
                len(valid_languages) == 1):
            name_language[top_lang].append(value['name'])
for k, v in name_language.iteritems():
for s in v:
s = s.strip()
if not s:
continue
writer.writerow((k, country, tsv_string(s)))
if i % 1000 == 0 and i > 0:
print('did {} toponyms'.format(i))
i += 1
f.close()
def build_address_training_data(country_rtree, infile, out_dir, format=False):
    '''
    Creates a training set similar to the ways data but using addr:street
    tags instead. These may be slightly closer to what we'd see in real-world
    addresses, containing variations, some abbreviations (although this is
    discouraged in OSM), etc.

    Example record:

    eu es Errebal kalea
    '''
i = 0
f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile):
country, street_language = get_language_names(country_rtree, key, value, tag_prefix='addr:street')
if not street_language:
continue
for k, v in street_language.iteritems():
for s in v:
s = s.strip()
if not s:
continue
if k in languages:
writer.writerow((k, country, tsv_string(s)))
if i % 1000 == 0 and i > 0:
print('did {} streets'.format(i))
i += 1
f.close()
VENUE_LANGUAGE_DATA_FILENAME = 'names_by_language.tsv'
def build_venue_training_data(country_rtree, infile, out_dir):
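    '''
    Training set of venue names by language, country and venue type
    (e.g. amenity:restaurant), drawn from OSM elements carrying both a
    name and an amenity or building tag.
    '''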
i = 0
f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile):
country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
if not name_language:
continue
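        # Derive a coarse venue type from the amenity/building tags, skipping
        # bare 'yes'/'y' values which carry no type information; a distinct
        # loop variable avoids shadowing the outer `key` from parse_osm.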
        venue_type = None
        for venue_key in (u'amenity', u'building'):
            amenity = value.get(venue_key, u'').strip()
            if amenity in ('yes', 'y'):
                continue
            if amenity:
                venue_type = u':'.join([venue_key, amenity])
                break
if venue_type is None:
continue
for k, v in name_language.iteritems():
for s in v:
s = s.strip()
if k in languages:
writer.writerow((k, country, safe_encode(venue_type), tsv_string(s)))
if i % 1000 == 0 and i > 0:
            print('did {} venues'.format(i))
i += 1
f.close()
if __name__ == '__main__':
# Handle argument parsing here
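    # Example invocation (script and directory names are illustrative):
    #   python osm_training_data.py --country-rtree-dir /data/country_rtree \
    #       -s planet-ways.osm -o /data/training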
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--streets-file',
help='Path to planet-ways.osm')
parser.add_argument('--unabbreviated',
action='store_true',
default=False,
help='Use unabbreviated street names for token counts')
parser.add_argument('-a', '--address-file',
help='Path to planet-addresses.osm')
parser.add_argument('-v', '--venues-file',
help='Path to planet-venues.osm')
parser.add_argument('-b', '--borders-file',
help='Path to planet-borders.osm')
parser.add_argument('-f', '--format',
action='store_true',
default=False,
help='Save formatted addresses (slow)')
parser.add_argument('-u', '--untagged',
action='store_true',
default=False,
help='Save untagged formatted addresses (slow)')
parser.add_argument('-l', '--limited-addresses',
action='store_true',
default=False,
help='Save formatted addresses without house names or country (slow)')
parser.add_argument('-p', '--place-nodes-file',
help='Path to planet-admin-nodes.osm')
parser.add_argument('-t', '--temp-dir',
default=tempfile.gettempdir(),
help='Temp directory to use')
parser.add_argument('-x', '--intersections-file',
help='Path to planet-ways-latlons.osm')
parser.add_argument('--country-rtree-dir',
required=True,
help='Country RTree directory')
parser.add_argument('--rtree-dir',
default=None,
help='OSM reverse geocoder RTree directory')
parser.add_argument('--places-index-dir',
default=None,
help='Places index directory')
parser.add_argument('--metro-stations-index-dir',
default=None,
help='Metro stations reverse geocoder directory')
parser.add_argument('--subdivisions-rtree-dir',
default=None,
help='Subdivisions reverse geocoder RTree directory')
parser.add_argument('--buildings-rtree-dir',
default=None,
help='Buildings reverse geocoder RTree directory')
parser.add_argument('--neighborhoods-rtree-dir',
default=None,
help='Neighborhoods reverse geocoder RTree directory')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
help='Output directory')
args = parser.parse_args()
country_rtree = OSMCountryReverseGeocoder.load(args.country_rtree_dir)
osm_rtree = None
if args.rtree_dir:
osm_rtree = OSMReverseGeocoder.load(args.rtree_dir)
neighborhoods_rtree = None
if args.neighborhoods_rtree_dir:
neighborhoods_rtree = NeighborhoodReverseGeocoder.load(args.neighborhoods_rtree_dir)
places_index = None
if args.places_index_dir:
places_index = PlaceReverseGeocoder.load(args.places_index_dir)
metro_stations_index = None
if args.metro_stations_index_dir:
metro_stations_index = MetroStationReverseGeocoder.load(args.metro_stations_index_dir)
subdivisions_rtree = None
if args.subdivisions_rtree_dir:
subdivisions_rtree = OSMSubdivisionReverseGeocoder.load(args.subdivisions_rtree_dir)
buildings_rtree = None
if args.buildings_rtree_dir:
buildings_rtree = OSMBuildingReverseGeocoder.load(args.buildings_rtree_dir)
    # These build steps are independent of one another and could be
    # parallelized across processes.
if args.streets_file and not args.format:
build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
if args.borders_file:
build_toponym_training_data(country_rtree, args.borders_file, args.out_dir)
if args.venues_file:
build_venue_training_data(country_rtree, args.venues_file, args.out_dir)
if args.address_file or args.intersections_file:
if osm_rtree is None:
parser.error('--rtree-dir required for formatted addresses')
elif neighborhoods_rtree is None:
parser.error('--neighborhoods-rtree-dir required for formatted addresses')
elif places_index is None:
parser.error('--places-index-dir required for formatted addresses')
if args.address_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
if args.address_file and args.limited_addresses:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
if args.place_nodes_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged)
if args.intersections_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged)
if args.streets_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_ways_training_data(args.streets_file, args.out_dir, tag_components=not args.untagged)