Initial fork commit

0 scripts/geodata/osm/__init__.py Normal file

333 scripts/geodata/osm/admin_boundaries.py Normal file
@@ -0,0 +1,333 @@
'''
admin_boundaries.py
-------------------

Generates polygons from OpenStreetMap relations
'''

import array
import logging
import six

from bisect import bisect_left
from collections import defaultdict, OrderedDict
from itertools import izip, combinations

from geodata.coordinates.conversion import latlon_to_decimal
from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir
from geodata.graph.scc import strongly_connected_components
from geodata.i18n.languages import osm_admin1_ids
from geodata.math.floats import isclose
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import *


class OSMPolygonReader(object):
    '''
    OSM relations are stored with pointers to their bounding ways,
    which in turn store pointers to their constituent nodes, and the
    XML file for the planet is far too large to be parsed in memory.

    For the purposes of constructing (multi)polygons, we need lists
    of lat/lon coordinates for the edges of each outer and inner polygon
    that form the overall boundary (this allows for holes, e.g.
    Lesotho/South Africa, and multiple disjoint polygons such as islands).

    This class creates a compact representation of the intermediate
    lookup tables and coordinates using Python's typed array module,
    which stores C-sized ints, doubles, etc. in a dynamic array. It's like
    a list but smaller and faster for arrays of numbers and doesn't require
    pulling in numpy as a dependency when all we want is the space savings.

    One nice property of the .osm files generated by osmfilter is that
    nodes/ways/relations are stored in sorted order, so we don't have to
    pre-sort the lookup arrays before performing binary search.
    '''
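
    # Illustrative note (not part of the original file): way_indptr/way_deps/
    # way_coords follow a CSR-like layout. If way 0 has 3 nodes and way 1 has 2:
    #   way_indptr = [0, 3, 5]
    #   way_deps   = [n0, n1, n2, n3, n4]           # node ids, one per node
    #   way_coords = [lon0, lat0, ..., lon4, lat4]  # two doubles per node
    # so way i's node ids live at way_deps[way_indptr[i]:way_indptr[i + 1]].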

    def __init__(self, filename):
        self.filename = filename

        self.node_ids = array.array('l')
        self.way_ids = array.array('l')

        self.coords = array.array('d')

        self.nodes = {}

        self.way_deps = array.array('l')
        self.way_coords = array.array('d')
        self.way_indptr = array.array('i', [0])

        self.logger = logging.getLogger('osm_admin_polys')

    def binary_search(self, a, x):
        '''Locate the leftmost value exactly equal to x'''
        i = bisect_left(a, x)
        if i != len(a) and a[i] == x:
            return i
        raise ValueError

    def node_coordinates(self, coords, indptr, idx):
        start_index = indptr[idx] * 2
        end_index = indptr[idx + 1] * 2
        node_coords = coords[start_index:end_index]
        return zip(node_coords[::2], node_coords[1::2])

    def sparse_deps(self, data, indptr, idx):
        return [data[i] for i in xrange(indptr[idx], indptr[idx + 1])]

    def create_polygons(self, ways):
        '''
        Polygons (relations) are effectively stored as lists of
        line segments (ways) and there may be more than one polygon
        (island chains, overseas territories).

        If we view the line segments as a graph (any two ways which
        share a terminal node are connected), then the process of
        constructing polygons reduces to finding strongly connected
        components in a graph.

        https://en.wikipedia.org/wiki/Strongly_connected_component

        Note that even though there may be hundreds of thousands of
        points in a complex polygon like a country boundary, we only
        need to build a graph of connected ways, which will be many
        times smaller and take much less time to traverse.
        '''
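        # Sketch of the reduction (hypothetical ids, not in the original file):
        # ways w1 (a->b), w2 (b->c) and w3 (c->a) share terminal nodes, so
        # {w1, w2, w3} form one component and yield one closed ring; a disjoint
        # island way lands in its own component and yields a separate polygon.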
        end_nodes = defaultdict(list)
        polys = []

        way_indices = {}
        start_end_nodes = {}

        for way_id in ways:
            # Find the way position via binary search
            try:
                way_index = self.binary_search(self.way_ids, way_id)
            except ValueError:
                continue

            # Cache the way index
            way_indices[way_id] = way_index

            # way_indptr is a compressed index into way_deps/way_coords:
            # way i is stored at indices way_indptr[i]:way_indptr[i + 1]
            # in way_deps
            start_node_id = self.way_deps[self.way_indptr[way_index]]
            end_node_id = self.way_deps[self.way_indptr[way_index + 1] - 1]

            start_end_nodes[way_id] = (start_node_id, end_node_id)

            if start_node_id == end_node_id:
                way_node_points = self.node_coordinates(self.way_coords, self.way_indptr, way_index)
                polys.append(way_node_points)
                continue

            end_nodes[start_node_id].append(way_id)
            end_nodes[end_node_id].append(way_id)

        # Way graph for a single polygon; don't need to be as concerned about storage
        way_graph = defaultdict(OrderedDict)

        for node_id, ways in end_nodes.iteritems():
            for w1, w2 in combinations(ways, 2):
                way_graph[w1][w2] = None
                way_graph[w2][w1] = None

        way_graph = {v: w.keys() for v, w in way_graph.iteritems()}

        for component in strongly_connected_components(way_graph):
            poly_nodes = []

            seen = set()

            if not component:
                continue

            q = [(c, False) for c in component[:1]]
            while q:
                way_id, reverse = q.pop()
                way_index = way_indices[way_id]

                node_coords = self.node_coordinates(self.way_coords, self.way_indptr, way_index)

                head, tail = start_end_nodes[way_id]

                if reverse:
                    node_coords = node_coords[::-1]
                    head, tail = tail, head

                for neighbor in way_graph[way_id]:
                    if neighbor in seen:
                        continue
                    neighbor_head, neighbor_tail = start_end_nodes[neighbor]
                    neighbor_reverse = neighbor_head == head or neighbor_tail == tail
                    q.append((neighbor, neighbor_reverse))

                way_start = 0 if q else 1
                poly_nodes.extend(node_coords[way_start:-1])

                seen.add(way_id)

            polys.append(poly_nodes)

        return polys

    def include_polygon(self, props):
        raise NotImplementedError('Children must implement')

    def polygons(self, properties_only=False):
        '''
        Generator which yields tuples like:

        (relation_id, properties, admin_center, outer_polygons, inner_polygons)

        or, when properties_only=True, just (relation_id, properties, admin_center).

        At this point a polygon is a list of coordinate tuples,
        suitable for passing to shapely's Polygon constructor
        but may be used for other purposes.

        outer_polygons is a list of the exterior polygons for this
        boundary. inner_polygons is a list of "holes" in the exterior
        polygons, although donuts and donut-holes need to be matched
        by the caller using something like shapely's contains.
        '''
        i = 0

        for element_id, props, deps in parse_osm(self.filename, dependencies=True):
            props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                lat = props.get('lat')
                lon = props.get('lon')
                if lat is None or lon is None:
                    continue
                lat, lon = latlon_to_decimal(lat, lon)
                if lat is None or lon is None:
                    continue

                if isclose(lat, 90.0):
                    lat = 89.999

                if isclose(lon, 180.0):
                    lon = 179.999

                if 'name' in props and 'place' in props:
                    self.nodes[node_id] = props

                # Nodes are stored in a sorted array; coordinate indices are simply
                # [lon, lat, lon, lat, ...] so the index can be calculated as 2 * i.
                # Note that the pairs are lon, lat instead of lat, lon for geometry purposes
                self.coords.append(lon)
                self.coords.append(lat)
                self.node_ids.append(node_id)
            elif element_id.startswith('way'):
                way_id = long(element_id.split(':')[-1])

                # Get node indices by binary search
                try:
                    node_indices = [self.binary_search(self.node_ids, node_id) for node_id in deps]
                except ValueError:
                    continue

                # Way ids are stored in a sorted array
                self.way_ids.append(way_id)

                # way_deps is the list of dependent node ids;
                # way_coords is a copy of coords indexed by way ids
                for node_id, node_index in izip(deps, node_indices):
                    self.way_deps.append(node_id)
                    self.way_coords.append(self.coords[node_index * 2])
                    self.way_coords.append(self.coords[node_index * 2 + 1])

                self.way_indptr.append(len(self.way_deps))

                if deps[0] == deps[-1] and self.include_polygon(props):
                    way_id_offset = WAY_OFFSET + way_id
                    if not properties_only:
                        outer_polys = self.create_polygons([way_id])
                        inner_polys = []
                        yield way_id_offset, props, {}, outer_polys, inner_polys
                    else:
                        yield way_id_offset, props, {}

            elif element_id.startswith('relation'):
                # Once we reach the first relation, the node lookup arrays are
                # no longer needed and can be freed
                if self.node_ids is not None:
                    self.node_ids = None
                if self.coords is not None:
                    self.coords = None

                relation_id = long(element_id.split(':')[-1])
                if len(deps) == 0 or not self.include_polygon(props) or props.get('type', '').lower() == 'multilinestring':
                    continue

                outer_ways = []
                inner_ways = []
                admin_centers = []

                for elem_id, elem_type, role in deps:
                    if role in ('outer', '') and elem_type == 'way':
                        outer_ways.append(elem_id)
                    elif role == 'inner' and elem_type == 'way':
                        inner_ways.append(elem_id)
                    elif role == 'admin_centre' and elem_type == 'node':
                        val = self.nodes.get(long(elem_id))
                        if val is not None:
                            val['type'] = 'node'
                            val['id'] = long(elem_id)
                            admin_centers.append(val)
                    elif role == 'label' and elem_type == 'node':
                        val = self.nodes.get(long(elem_id))
                        if val is not None and val.get('name', six.u('')).lower() == props.get('name', six.u('')).lower():
                            props.update({k: v for k, v in six.iteritems(val)
                                          if k not in props})

                admin_center = {}
                if len(admin_centers) == 1:
                    admin_center = admin_centers[0]

                relation_id_offset = RELATION_OFFSET + relation_id
                if not properties_only:
                    outer_polys = self.create_polygons(outer_ways)
                    inner_polys = self.create_polygons(inner_ways)
                    yield relation_id_offset, props, admin_center, outer_polys, inner_polys
                else:
                    yield relation_id_offset, props, admin_center
            if i % 1000 == 0 and i > 0:
                self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
            i += 1


class OSMAdminPolygonReader(OSMPolygonReader):
    def include_polygon(self, props):
        return 'boundary' in props or 'place' in props


class OSMSubdivisionPolygonReader(OSMPolygonReader):
    def include_polygon(self, props):
        return 'landuse' in props or 'place' in props or 'amenity' in props


class OSMBuildingPolygonReader(OSMPolygonReader):
    def include_polygon(self, props):
        return 'building' in props or 'building:part' in props or props.get('type', None) == 'building'


class OSMCountryPolygonReader(OSMPolygonReader):
    def include_polygon(self, props):
        return 'ISO3166-1:alpha2' in props or 'ISO3166-2' in props or (props.get('type', 'relation'), safe_encode(props.get('id', ''))) in osm_admin1_ids


class OSMNeighborhoodPolygonReader(OSMPolygonReader):
    def include_polygon(self, props):
        return osm_definitions.meets_definition(props, osm_definitions.NEIGHBORHOOD)


class OSMPostalCodesPolygonReader(OSMPolygonReader):
    def include_polygon(self, props):
        return props.get('boundary') == 'postal_code'


class OSMAirportsPolygonReader(OSMPolygonReader):
    def include_polygon(self, props):
        return 'aerodrome' in props
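
# Hypothetical usage sketch (not part of the original commit):
#
#   reader = OSMAdminPolygonReader('planet-admin-borders.osm')
#   for element_id, props, admin_center, outer_polys, inner_polys in reader.polygons():
#       # build shapely (Multi)Polygons here, matching holes to their
#       # containing exteriors with something like Polygon.contains
#       pass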

184 scripts/geodata/osm/components.py Normal file
@@ -0,0 +1,184 @@
import collections
import os
import six
import yaml

from copy import deepcopy

from geodata.address_formatting.formatter import AddressFormatter
from geodata.configs.utils import recursive_merge, DoesNotExist

from geodata.encoding import safe_encode

this_dir = os.path.realpath(os.path.dirname(__file__))

OSM_BOUNDARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                  'resources', 'boundaries', 'osm')


class OSMAddressComponents(object):
    '''
    Keeps a map of OSM keys and values to the standard components
    of an address like city, state, etc. used for address formatting.
    When we reverse geocode a point, it will fall into a number of
    polygons, and we simply need to assign the names of said polygons
    to an address field.
    '''
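
    # Illustrative example (not in the original file): a point in a large city
    # might fall inside polygons tagged place=suburb, admin_level=8 and
    # place=city; the maps below, combined with per-country YAML config,
    # turn those tags into AddressFormatter.SUBURB, AddressFormatter.CITY, etc.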

    ADMIN_LEVEL = 'admin_level'

    # These keys override country-level config
    global_keys_override = {
        'place': {
            'island': AddressFormatter.ISLAND,
            'islet': AddressFormatter.ISLAND,
            'municipality': AddressFormatter.CITY,
            'city': AddressFormatter.CITY,
            'town': AddressFormatter.CITY,
            'township': AddressFormatter.CITY,
            'village': AddressFormatter.CITY,
            'hamlet': AddressFormatter.CITY,
            'suburb': AddressFormatter.SUBURB,
            'quarter': AddressFormatter.SUBURB,
            'neighbourhood': AddressFormatter.SUBURB
        },
        'border_type': {
            'city': AddressFormatter.CITY
        }
    }

    # These keys are a fallback in case we haven't added a country or there is no admin_level
    global_keys = {
        'place': {
            'country': AddressFormatter.COUNTRY,
            'state': AddressFormatter.STATE,
            'region': AddressFormatter.STATE,
            'province': AddressFormatter.STATE,
            'county': AddressFormatter.STATE_DISTRICT,
        },
        'gnis:class': {
            'populated place': AddressFormatter.CITY,
        }
    }

    def __init__(self, boundaries_dir=OSM_BOUNDARIES_DIR):
        self.config = {}

        self.use_admin_center = {}

        for filename in os.listdir(boundaries_dir):
            if not filename.endswith('.yaml'):
                continue

            country_code = filename.rsplit('.yaml', 1)[0]
            data = yaml.load(open(os.path.join(boundaries_dir, filename)))

            for prop, values in six.iteritems(data):
                if not hasattr(values, 'items'):
                    # non-dict key
                    continue

                for k, v in values.iteritems():
                    if isinstance(v, six.string_types) and v not in AddressFormatter.address_formatter_fields:
                        raise ValueError(u'Invalid value in {} for prop={}, key={}: {}'.format(filename, prop, k, v))

                if prop == 'overrides':
                    self.use_admin_center.update({(r['type'], safe_encode(r['id'])): r.get('probability', 1.0) for r in values.get('use_admin_center', [])})

                    containing_overrides = values.get('contained_by', {})

                    if not containing_overrides:
                        continue

                    for id_type, vals in six.iteritems(containing_overrides):
                        for element_id in vals:
                            override_config = vals[element_id]

                            config = deepcopy(data)
                            config.pop('overrides')

                            recursive_merge(config, override_config)

                            vals[element_id] = config

            self.config[country_code] = data

    def component(self, country, prop, value):
        component = self.global_keys_override.get(prop, {}).get(value, None)
        if component is not None:
            return component

        component = self.config.get(country, {}).get(prop, {}).get(value, None)
        if component is not None:
            return component

        return self.global_keys.get(prop, {}).get(value, None)
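
    # Hypothetical lookups (illustrative, not in the original file):
    #   component('us', 'place', 'town')    -> AddressFormatter.CITY (global override)
    #   component('us', 'place', 'county')  -> AddressFormatter.STATE_DISTRICT,
    #                                          assuming us.yaml has no place mapping
    #   component('de', 'admin_level', '4') -> whatever de.yaml maps it to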

    def component_from_properties(self, country, properties, containing=(), global_keys=True):
        country_config = self.config.get(country, {})

        config = country_config

        overrides = country_config.get('overrides')
        if overrides:
            id_overrides = overrides.get('id', {})
            element_type = properties.get('type')
            element_id = properties.get('id')

            override_value = id_overrides.get(element_type, {})
            element_id = six.binary_type(element_id or '')
            if element_id in override_value:
                return override_value[element_id]

            contained_by_overrides = overrides.get('contained_by')
            if contained_by_overrides and containing:
                # Note: containing should be passed in from smallest to largest
                for containing_type, containing_id in containing:
                    override_config = contained_by_overrides.get(containing_type, {}).get(six.binary_type(containing_id or ''), None)
                    if override_config:
                        config = override_config
                        break

        values = [(k.lower(), v.lower()) for k, v in six.iteritems(properties) if isinstance(v, six.string_types)]

        global_overrides_last = config.get('global_overrides_last', False)

        # place=city, place=suburb, etc. override per-country boundaries
        if not global_overrides_last:
            for k, v in values:
                containing_component = self.global_keys_override.get(k, {}).get(v, DoesNotExist)

                if containing_component is not DoesNotExist:
                    return containing_component

                if k != self.ADMIN_LEVEL and k in config:
                    containing_component = config.get(k, {}).get(v, DoesNotExist)
                    if containing_component is not DoesNotExist:
                        return containing_component

        # admin_level tags are mapped per country
        for k, v in values:
            containing_component = config.get(k, {}).get(v, DoesNotExist)

            if containing_component is not DoesNotExist:
                return containing_component

        # other place keys like place=state, etc. serve as a backup
        # when no admin_level tags are available
        for k, v in values:
            containing_component = self.global_keys.get(k, {}).get(v, DoesNotExist)

            if containing_component is not DoesNotExist:
                return containing_component

        if global_overrides_last:
            for k, v in values:
                containing_component = self.global_keys_override.get(k, {}).get(v, DoesNotExist)

                if containing_component is not DoesNotExist:
                    return containing_component

        return None


osm_address_components = OSMAddressComponents()

89 scripts/geodata/osm/definitions.py Normal file
@@ -0,0 +1,89 @@
import os
import re
import six

from collections import defaultdict

from geodata.graph.topsort import topsort

this_dir = os.path.realpath(os.path.dirname(__file__))

DEFAULT_SCRIPT_PATH = os.path.join(this_dir, 'fetch_osm_address_data.sh')

valid_key_regex = re.compile(r'VALID_(.*?)_KEYS="(.*)"')
variable_regex = re.compile(r'\$VALID_(.*?)_KEYS(?=\b)')
kv_regex = re.compile(r'([^\s]*)=([^\s]*)')


class OSMDefinitions(object):
    ALL = '*'

    ADMIN_BORDER = 'admin_border'
    ADMIN_NODE = 'admin_node'
    AEROWAY = 'aeroway'
    AMENITY = 'amenity'
    BUILDING = 'building'
    HISTORIC = 'historic'
    LANDUSE = 'landuse'
    NATURAL = 'natural'
    LOCALITY = 'locality'
    NEIGHBORHOOD = 'neighborhood'
    EXTENDED_NEIGHBORHOOD = 'extended_neighborhood'
    OFFICE = 'office'
    PLACE = 'place'
    POPULATED_PLACE = 'populated_place'
    SHOP = 'shop'
    TOURISM = 'tourism'
    VENUE = 'venue'
    WATERWAY = 'waterway'

    def __init__(self, filename=DEFAULT_SCRIPT_PATH):
        script = open(filename).read()

        dependencies = defaultdict(list)

        definitions = {}

        matches = valid_key_regex.findall(script)

        match_text = {d.lower(): t for d, t in matches}

        for definition, text in matches:
            variables = variable_regex.findall(text)
            if not variables:
                dependencies[definition.lower()] = []
            for v in variables:
                dependencies[definition.lower()].append(v.lower())

        for definition in topsort(dependencies):
            definition = definition.lower()
            text = match_text[definition]
            variables = variable_regex.findall(text)
            for v in variables:
                v = v.lower()
                text = text.replace('$VALID_{}_KEYS'.format(v.upper()), match_text[v])

            kvs = defaultdict(set)

            for k, v in kv_regex.findall(text):
                if v != '':
                    kvs[k].add(v.lower())
                else:
                    kvs[k].add(self.ALL)

            definitions[definition] = kvs

        self.definitions = definitions
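
    # Illustrative expansion (abridged; values come from fetch_osm_address_data.sh):
    # given
    #   VALID_AEROWAY_KEYS="aeroway=aerodrome"
    #   VALID_VENUE_KEYS="( ( $VALID_AEROWAY_KEYS ) or ... )"
    # topsort resolves 'aeroway' before 'venue', the $VALID_AEROWAY_KEYS
    # reference is inlined, and kv_regex parses the result into
    #   {'aeroway': {'aerodrome'}, ...}; a bare "shop=" maps to {'shop': {'*'}}.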

    def meets_definition(self, props, category):
        defs = self.definitions.get(category, {})
        if not defs:
            return False
        elif self.ALL in defs:
            return True
        for k, v in six.iteritems(props):
            if v.lower() in defs.get(k.lower(), set()):
                return True
        return False
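
    # Hypothetical check (illustrative):
    #   osm_definitions.meets_definition({'place': 'neighbourhood'}, OSMDefinitions.NEIGHBORHOOD)
    # returns True when the shell script's neighborhood keys include place=neighbourhood.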


osm_definitions = OSMDefinitions()

207 scripts/geodata/osm/extract.py Normal file
@@ -0,0 +1,207 @@
'''
geodata.osm.extract
-------------------

Extracts nodes/ways/relations, their metadata and dependencies
from .osm XML files.
'''

import re
import six
import urllib
import HTMLParser

from collections import OrderedDict
from lxml import etree

from geodata.csv_utils import unicode_csv_reader
from geodata.text.normalize import normalize_string, NORMALIZE_STRING_DECOMPOSE, NORMALIZE_STRING_LATIN_ASCII
from geodata.encoding import safe_decode, safe_encode

WAY_OFFSET = 10 ** 15
RELATION_OFFSET = 2 * 10 ** 15

NODE = 'node'
WAY = 'way'
RELATION = 'relation'

ALL_OSM_TAGS = set([NODE, WAY, RELATION])
WAYS_RELATIONS = set([WAY, RELATION])

OSM_NAME_TAGS = (
    'name',
    'alt_name',
    'int_name',
    'nat_name',
    'reg_name',
    'loc_name',
    'official_name',
    'commonname',
    'common_name',
    'place_name',
    'short_name',
)

OSM_BASE_NAME_TAGS = (
    'tiger:name_base',
)


def parse_osm(filename, allowed_types=ALL_OSM_TAGS, dependencies=False):
    '''
    Parse a file in .osm format iteratively, generating 3-tuples like:

    ('node:1', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), None),
    ('node:2', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), None),
    ('way:4444', OrderedDict([('name', 'Main Street')]), [1, 2, 3, 4])

    The third element (dependencies) is None unless dependencies=True.
    '''
    f = open(filename)
    parser = etree.iterparse(f)

    single_type = len(allowed_types) == 1

    for (_, elem) in parser:
        elem_id = long(elem.attrib.pop('id', 0))
        item_type = elem.tag
        if elem_id >= WAY_OFFSET and elem_id < RELATION_OFFSET:
            elem_id -= WAY_OFFSET
            item_type = 'way'
        elif elem_id >= RELATION_OFFSET:
            elem_id -= RELATION_OFFSET
            item_type = 'relation'

        if item_type in allowed_types:
            attrs = OrderedDict(elem.attrib)
            attrs['type'] = item_type
            attrs['id'] = safe_encode(elem_id)

            top_level_attrs = set(attrs)
            deps = [] if dependencies else None

            for e in elem.getchildren():
                if e.tag == 'tag':
                    # Prevent user-defined lat/lon keys from overriding the lat/lon on the node
                    key = e.attrib['k']
                    if key not in top_level_attrs:
                        attrs[key] = e.attrib['v']
                elif dependencies and item_type == 'way' and e.tag == 'nd':
                    deps.append(long(e.attrib['ref']))
                elif dependencies and item_type == 'relation' and e.tag == 'member' and 'role' in e.attrib:
                    deps.append((long(e.attrib['ref']), e.attrib.get('type'), e.attrib['role']))

            key = elem_id if single_type else '{}:{}'.format(item_type, elem_id)
            yield key, attrs, deps

        if elem.tag in ALL_OSM_TAGS:
            # Free memory as we go; without this, iterparse would keep the whole tree
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
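
# Hypothetical usage (not in the original file):
#   for element_id, attrs, deps in parse_osm('planet-admin-borders.osm', dependencies=True):
#       if element_id.startswith('relation'):
#           # deps is a list of (ref, member_type, role) tuples for relations
#           pass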


def osm_type_and_id(element_id):
    element_id = long(element_id)
    if element_id >= RELATION_OFFSET:
        id_type = RELATION
        element_id -= RELATION_OFFSET
    elif element_id >= WAY_OFFSET:
        id_type = WAY
        element_id -= WAY_OFFSET
    else:
        id_type = NODE

    return id_type, element_id
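
# Illustrative: osm_type_and_id(2 * 10 ** 15 + 42) -> ('relation', 42),
# matching the id offsets that osmconvert --all-to-nodes assigns to
# ways (10^15) and relations (2 * 10^15).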


apposition_regex = re.compile('(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)

html_parser = HTMLParser.HTMLParser()


def normalize_wikipedia_title(title):
    match = apposition_regex.match(title)
    if match:
        title = match.group(1)

    title = safe_decode(title)
    title = html_parser.unescape(title)
    title = urllib.unquote_plus(title)

    return title.replace(u'_', u' ').strip()


def osm_wikipedia_title_and_language(key, value):
    language = None
    if u':' in key:
        key, language = key.rsplit(u':', 1)

    if u':' in value:
        possible_language = value.split(u':', 1)[0]
        if len(possible_language) == 2 and language is None:
            language = possible_language
            value = value.rsplit(u':', 1)[-1]

    return normalize_wikipedia_title(value), language


non_breaking_dash = six.u('[-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]')
simple_number = six.u('(?:{})?[0-9]+(?:\.[0-9]+)?').format(non_breaking_dash)
simple_number_regex = re.compile(simple_number, re.UNICODE)

non_breaking_dash_regex = re.compile(non_breaking_dash, re.UNICODE)
number_range_regex = re.compile(six.u('({}){}({})').format(simple_number, non_breaking_dash, simple_number), re.UNICODE)
letter_range_regex = re.compile(r'([^\W\d_]){}([^\W\d_])'.format(non_breaking_dash.encode('unicode-escape')), re.UNICODE)

number_split_regex = re.compile('[,;]')


def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
    value = normalize_string(value, string_options=NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE)
    numbers = []
    values = number_split_regex.split(value)
    for val in values:
        val = val.strip()
        match = number_range_regex.match(val)
        if match:
            start_num, end_num = match.groups()
            start_num_len = len(start_num)

            zfill = 0
            if start_num.startswith('0'):
                zfill = start_num_len

            try:
                start_num = int(start_num)
                end_num = int(end_num)

                if end_num > start_num:
                    if end_num - start_num > max_range:
                        end_num = start_num + max_range

                    for i in xrange(start_num, end_num + 1):
                        numbers.append(safe_decode(i).zfill(zfill))
                else:
                    numbers.append(val.strip().zfill(zfill))
                    continue
            except (TypeError, ValueError):
                numbers.append(safe_decode(val).strip().zfill(zfill))
                continue
        else:
            letter_match = letter_range_regex.match(val)
            if letter_match and parse_letter_range:
                start_num, end_num = letter_match.groups()
                start_num = ord(start_num)
                end_num = ord(end_num)
                if end_num > start_num:
                    if end_num - start_num > max_range:
                        end_num = start_num + max_range
                    for i in xrange(start_num, end_num + 1):
                        numbers.append(six.unichr(i))
                else:
                    numbers.extend([six.unichr(start_num), six.unichr(end_num)])
                continue
            else:
                numbers.append(safe_decode(val.strip()))
    return numbers
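
# Illustrative results (not in the original file):
#   parse_osm_number_range(u'2-6')   -> [u'2', u'3', u'4', u'5', u'6']
#   parse_osm_number_range(u'01-03') -> [u'01', u'02', u'03']  (zero-padding preserved)
#   parse_osm_number_range(u'A-C')   -> [u'A', u'B', u'C']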

282 scripts/geodata/osm/fetch_osm_address_data.sh Executable file
@@ -0,0 +1,282 @@
#!/usr/bin/env bash

: '
fetch_osm_address_data.sh
-------------------------

Shell script to download the OSM planet and derive inputs
for language detection and address parser training set
construction.

Usage: ./fetch_osm_address_data.sh out_dir
'

if [ "$#" -ge 1 ]; then
    OUT_DIR=$1
else
    OUT_DIR=`pwd`
fi

set -e

THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
RESOURCES_DIR=$THIS_DIR/../../../resources
ADMIN1_FILE=$RESOURCES_DIR/language/regional/adm1.tsv

# Check for osmfilter and osmconvert
if ! type -P osmfilter osmconvert > /dev/null; then
    cat << EOF
ERROR: osmfilter and osmconvert are required

On Debian/Ubuntu:
sudo apt-get install osmctools

Or to compile:
wget -O - http://m.m.i24.cc/osmfilter.c | cc -x c - -O3 -o osmfilter
wget -O - http://m.m.i24.cc/osmconvert.c | cc -x c - -lz -O3 -o osmconvert
EOF
    exit 127
fi

PREV_DIR=`pwd`

cd $OUT_DIR

# Download planet as PBF
# TODO: currently uses a single mirror; randomly choose one instead (see sketch below)
echo "Started OSM download: `date`"

PLANET_PBF="planet-latest.osm.pbf"
JAPAN_PBF="japan-latest.osm.pbf"

wget --quiet http://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-latest.osm.pbf -O $OUT_DIR/$PLANET_PBF &
wget --quiet http://download.geofabrik.de/asia/japan-latest.osm.pbf -O $OUT_DIR/$JAPAN_PBF &

wait
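
# Sketch of the mirror-choice TODO above (hypothetical mirror list, not part
# of the original commit):
#   PLANET_MIRRORS=(
#       "http://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-latest.osm.pbf"
#       "https://planet.openstreetmap.org/pbf/planet-latest.osm.pbf"
#   )
#   PLANET_URL=${PLANET_MIRRORS[$((RANDOM % ${#PLANET_MIRRORS[@]}))]}
#   wget --quiet "$PLANET_URL" -O $OUT_DIR/$PLANET_PBF &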

echo "Converting to o5m: `date`"
PLANET_O5M="planet-latest.o5m"
JAPAN_O5M="japan-latest.o5m"

# Needs to be in o5m for some of the subsequent steps to work, whereas PBF is smaller for download
osmconvert $PLANET_PBF -o=$PLANET_O5M &
osmconvert $JAPAN_PBF -o=$JAPAN_O5M &

wait

rm $PLANET_PBF
rm $JAPAN_PBF

VALID_AEROWAY_KEYS="aeroway=aerodrome"
VALID_AMENITY_KEYS="amenity=ambulance_station or amenity=animal_boarding or amenity=animal_shelter or amenity=arts_centre or amenity=auditorium or amenity=baby_hatch or amenity=bank or amenity=bar or amenity=bbq or amenity=biergarten or amenity=boathouse or amenity=boat_rental or amenity=boat_sharing or amenity=boat_storage or amenity=brothel or amenity=bureau_de_change or amenity=bus_station or amenity=cafe or amenity=car_rental or amenity=car_sharing or amenity=car_wash or amenity=casino or amenity=cemetery or amenity=charging_station or amenity=cinema or amenity=childcare or amenity=clinic or amenity=club or amenity=clock or amenity=college or amenity=community_center or amenity=community_centre or amenity=community_hall or amenity=concert_hall or amenity=conference_centre or amenity=courthouse or amenity=coworking_space or amenity=crematorium or amenity=crypt or amenity=culture_center or amenity=dancing_school or amenity=dentist or amenity=dive_centre or amenity=doctors or amenity=dojo or amenity=dormitory or amenity=driving_school or amenity=embassy or amenity=emergency_service or amenity=events_venue or amenity=exhibition_centre or amenity=fast_food or amenity=ferry_terminal or amenity=festival_grounds or amenity=fire_station or amenity=food_court or amenity=fountain or amenity=gambling or amenity=game_feeding or amenity=grave_yard or amenity=greenhouse or amenity=gym or amenity=hall or amenity=health_centre or amenity=hospice or amenity=hospital or amenity=hotel or amenity=hunting_stand or amenity=ice_cream or amenity=internet_cafe or amenity=kindergarten or amenity=kiosk or amenity=kneipp_water_cure or amenity=language_school or amenity=lavoir or amenity=library or amenity=love_hotel or amenity=market or amenity=marketplace or amenity=medical_centre or amenity=mobile_money_agent or amenity=monastery or amenity=money_transfer or amenity=mortuary or amenity=mountain_rescue or amenity=music_school or amenity=music_venue or amenity=nightclub or amenity=nursery or amenity=nursing_home or amenity=office or amenity=parish_hall or amenity=park or amenity=pharmacy or amenity=planetarium or amenity=place_of_worship or amenity=police or amenity=post_office or amenity=preschool or amenity=prison or amenity=pub or amenity=public_bath or amenity=public_bookcase or amenity=public_building or amenity=public_facility or amenity=public_hall or amenity=public_market or amenity=ranger_station or amenity=refugee_housing or amenity=register_office or amenity=research_institute or amenity=rescue_station or amenity=residential or amenity=Residential or amenity=restaurant or amenity=retirement_home or amenity=sacco or amenity=sanitary_dump_station or amenity=sanitorium or amenity=sauna or amenity=school or amenity=shelter or amenity=shop or amenity=shopping or amenity=shower or amenity=ski_rental or amenity=ski_school or amenity=social_centre or amenity=social_club or amenity=social_facility or amenity=spa or amenity=stables or amenity=stripclub or amenity=studio or amenity=supermarket or amenity=swimming_pool or amenity=swingerclub or amenity=townhall or amenity=theatre or amenity=training or amenity=trolley_bay or amenity=university or amenity=vehicle_inspection or amenity=veterinary or amenity=village_hall or amenity=vivarium or amenity=waste_transfer_station or amenity=whirlpool or amenity=winery or amenity=youth_centre"
GENERIC_AMENITIES="amenity=atm or amenity=bench or amenity=bicycle_parking or amenity=bicycle_rental or amenity=bicycle_repair_station or amenity=compressed_air or amenity=drinking_water or amenity=emergency_phone or amenity=fire_hydrant or amenity=fuel or amenity=grit_bin or amenity=motorcycle_parking or amenity=parking or amenity=parking_space or amenity=post_box or amenity=reception_area or amenity=recycling or amenity=taxi or amenity=telephone or amenity=ticket_validator or amenity=toilets or amenity=vending_machine or amenity=waste_basket or amenity=waste_disposal or amenity=water_point or amenity=watering_place or amenity=wifi"

VALID_OFFICE_KEYS="office=accountant or office=administrative or office=administration or office=advertising_agency or office=architect or office=association or office=camping or office=charity or office=company or office=consulting or office=educational_institution or office=employment_agency or office=estate_agent or office=financial or office=forestry or office=foundation or office=government or office=insurance or office=it or office=lawyer or office=newspaper or office=ngo or office=notary or office=parish or office=physician or office=political_party or office=publisher or office=quango or office=real_estate_agent or office=realtor or office=register or office=religion or office=research or office=tax or office=tax_advisor or office=telecommunication or office=therapist or office=travel_agent or office=water_utility"
VALID_SHOP_KEYS="shop="
VALID_HISTORIC_KEYS="historic=archaeological_site or historic=castle or historic=fort or historic=memorial or historic=monument or historic=ruins or historic=tomb"
VALID_PLACE_KEYS="place=farm or place=isolated_dwelling or place=square"
VALID_TOURISM_KEYS="tourism=hotel or tourism=attraction or tourism=guest_house or tourism=museum or tourism=chalet or tourism=motel or tourism=hostel or tourism=alpine_hut or tourism=theme_park or tourism=zoo or tourism=apartment or tourism=wilderness_hut or tourism=gallery or tourism=bed_and_breakfast or tourism=hanami or tourism=wine_cellar or tourism=resort or tourism=aquarium or tourism=apartments or tourism=cabin or tourism=winery or tourism=hut"
VALID_LEISURE_KEYS="leisure=adult_gaming_centre or leisure=amusement_arcade or leisure=arena or leisure=bandstand or leisure=beach_resort or leisure=bbq or leisure=bird_hide or leisure=bowling_alley or leisure=casino or leisure=common or leisure=club or leisure=dance or leisure=dancing or leisure=disc_golf_course or leisure=dog_park or leisure=fishing or leisure=fitness_centre or leisure=gambling or leisure=garden or leisure=golf_course or leisure=hackerspace or leisure=horse_riding or leisure=hospital or leisure=hot_spring or leisure=ice_rink or leisure=landscape_reserve or leisure=marina or leisure=maze or leisure=miniature_golf or leisure=nature_reserve or leisure=paddling_pool or leisure=park or leisure=pitch or leisure=playground or leisure=recreation_ground or leisure=resort or leisure=sailing_club or leisure=sauna or leisure=social_club or leisure=sports_centre or leisure=stadium or leisure=summer_camp or leisure=swimming_pool or leisure=tanning_salon or leisure=track or leisure=trampoline_park or leisure=turkish_bath or leisure=video_arcade or leisure=water_park or leisure=wildlife_hide"
VALID_LANDUSE_KEYS="landuse=allotments or landuse=basin or landuse=cemetery or landuse=commercial or landuse=construction or landuse=farmland or landuse=forest or landuse=grass or landuse=greenhouse_horticulture or landuse=industrial or landuse=landfill or landuse=meadow or landuse=military or landuse=orchard or landuse=plant_nursery or landuse=port or landuse=quarry or landuse=recreation_ground or landuse=reservoir or landuse=residential or landuse=retail or landuse=village_green or landuse=vineyard"

VALID_VENUE_KEYS="( ( $VALID_AEROWAY_KEYS ) or ( $VALID_AMENITY_KEYS ) or ( $VALID_HISTORIC_KEYS ) or ( $VALID_OFFICE_KEYS ) or ( $VALID_PLACE_KEYS ) or ( $VALID_SHOP_KEYS ) or ( $VALID_TOURISM_KEYS ) or ( $VALID_LEISURE_KEYS ) or ( $VALID_LANDUSE_KEYS ) )"

# Address data set for use in parser, language detection
echo "Filtering for records with address tags: `date`"
PLANET_ADDRESSES_O5M="planet-addresses.o5m"
JAPAN_ADDRESSES_O5M="japan-addresses.o5m"
VALID_ADDRESSES="( ( ( name= or addr:housename= ) and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS ) ) ) or ( ( addr:street= or addr:place= ) and ( name= or building= or building:levels= or addr:housename= or addr:housenumber= ) )"
VALID_ADDRESSES_JAPAN="( addr:housenumber= or addr:street= ) or ( ( name= or name:ja= or addr:housename= ) and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS ) )"
osmfilter $PLANET_O5M --keep="$VALID_ADDRESSES" --drop-author --drop-version -o=$PLANET_ADDRESSES_O5M &
osmfilter $JAPAN_O5M --keep="$VALID_ADDRESSES_JAPAN" --drop-author --drop-version -o=$JAPAN_ADDRESSES_O5M &

wait

PLANET_ADDRESSES_LATLONS="planet-addresses-latlons.o5m"
JAPAN_ADDRESSES_LATLONS="japan-addresses-latlons.o5m"
osmconvert $PLANET_ADDRESSES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_ADDRESSES_LATLONS &
osmconvert $JAPAN_ADDRESSES_O5M --max-objects=1000000000 --all-to-nodes -o=$JAPAN_ADDRESSES_LATLONS &

wait
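
# Editorial note on the recurring pattern below (not in the original commit):
# osmfilter selects matching elements, osmconvert --all-to-nodes materializes
# each way/relation as a synthetic node at its center (ids offset by 10^15 for
# ways and 2*10^15 for relations), and a second osmfilter pass keeps only the
# records that still match, yielding one lat/lon point per element.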

rm $PLANET_ADDRESSES_O5M
rm $JAPAN_ADDRESSES_O5M
PLANET_ADDRESSES="planet-addresses.osm"
osmfilter $PLANET_ADDRESSES_LATLONS --keep="$VALID_ADDRESSES" -o=$PLANET_ADDRESSES_O5M &
osmfilter $JAPAN_ADDRESSES_LATLONS --keep="$VALID_ADDRESSES_JAPAN" -o=$JAPAN_ADDRESSES_O5M &

wait

osmconvert $PLANET_ADDRESSES_O5M $JAPAN_ADDRESSES_O5M -o=$PLANET_ADDRESSES

rm $PLANET_ADDRESSES_O5M
rm $JAPAN_ADDRESSES_O5M

rm $PLANET_ADDRESSES_LATLONS
rm $JAPAN_ADDRESSES_LATLONS

# Border data set for use in R-tree index/reverse geocoding, parsing, language detection
echo "Filtering for borders: `date`"
PLANET_COUNTRIES="planet-countries.osm"
PLANET_BORDERS_O5M="planet-borders.o5m"
PLANET_BORDERS="planet-borders.osm"
PLANET_ADMIN_BORDERS_OSM="planet-admin-borders.osm"

VALID_COUNTRY_KEYS="ISO3166-1:alpha2="
VALID_ADMIN1_KEYS="ISO3166-2="
ADMIN1_LANGUAGE_EXCEPTION_IDS=$(grep "osm" $ADMIN1_FILE | sed 's/^.*relation:\([0-9][0-9]*\).*$/@id=\1/' | xargs echo | sed 's/\s/ or /g')

VALID_ADMIN_BORDER_KEYS="boundary=administrative or boundary=town or boundary=city_limit or boundary=civil_parish or boundary=civil or boundary=ceremonial or boundary=postal_district or place=island or place=city or place=town or place=village or place=hamlet or place=municipality or place=settlement"

VALID_POPULATED_PLACE_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=locality or place=settlement or place=census-designated or place:ph=village"
VALID_NEIGHBORHOOD_KEYS="place=neighbourhood or place=neighborhood or place:ph=barangay"
VALID_EXTENDED_NEIGHBORHOOD_KEYS="place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place:ph=barangay"

VALID_LOCALITY_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place=locality or place=settlement or place=census-designated or place:ph=barangay or place:ph=village"

VALID_ADMIN_NODE_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place=island or place=islet or place=county or place=region or place=state or place=subdistrict or place=township or place=archipelago or place=department or place=country or place=district or place=census-designated or place=ward or place=subward or place=province or place=peninsula or place=settlement or place=subregion"

osmfilter $PLANET_O5M --keep="$VALID_ADMIN_BORDER_KEYS" --drop-author --drop-version -o=$PLANET_ADMIN_BORDERS_OSM &
osmfilter $PLANET_O5M --keep="$VALID_ADMIN_BORDER_KEYS or $VALID_LOCALITY_KEYS" --drop-author --drop-version -o=$PLANET_BORDERS_O5M &

wait

PLANET_ADMIN_NODES="planet-admin-nodes.osm"
osmfilter $PLANET_O5M --keep="$VALID_ADMIN_NODE_KEYS" --drop-ways --drop-relations --ignore-dependencies --drop-author --drop-version -o=$PLANET_ADMIN_NODES
PLANET_BORDERS_LATLONS="planet-borders-latlons.o5m"
osmconvert $PLANET_BORDERS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_BORDERS_LATLONS
rm $PLANET_BORDERS_O5M
osmfilter $PLANET_BORDERS_LATLONS --keep="$VALID_ADMIN_BORDER_KEYS or $VALID_LOCALITY_KEYS" -o=$PLANET_BORDERS
rm $PLANET_BORDERS_LATLONS
osmfilter $PLANET_O5M --keep="$VALID_COUNTRY_KEYS or $VALID_ADMIN1_KEYS or $ADMIN1_LANGUAGE_EXCEPTION_IDS" --drop-author --drop-version -o=$PLANET_COUNTRIES

echo "Filtering for neighborhoods"
PLANET_LOCALITIES="planet-localities.osm"
PLANET_NEIGHBORHOOD_BORDERS="planet-neighborhood-borders.osm"

osmfilter $PLANET_O5M --keep="$VALID_NEIGHBORHOOD_KEYS" --drop-author --drop-version -o=$PLANET_NEIGHBORHOOD_BORDERS
osmfilter $PLANET_O5M --keep="name= and ( $VALID_LOCALITY_KEYS )" --drop-relations --drop-ways --ignore-dependencies --drop-author --drop-version -o=$PLANET_LOCALITIES

echo "Filtering for rail stations"
VALID_RAIL_STATION_KEYS="railway=station"
PLANET_RAILWAYS_O5M="planet-rail-stations.o5m"
PLANET_RAILWAYS="planet-rail-stations.osm"

osmfilter $PLANET_O5M --keep="$VALID_RAIL_STATION_KEYS" --drop-author --drop-version -o=$PLANET_RAILWAYS_O5M
PLANET_RAILWAYS_LATLONS="planet-rail-stations-latlons.o5m"
osmconvert $PLANET_RAILWAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_RAILWAYS_LATLONS
rm $PLANET_RAILWAYS_O5M
osmfilter $PLANET_RAILWAYS_LATLONS --keep="$VALID_RAIL_STATION_KEYS" -o=$PLANET_RAILWAYS
rm $PLANET_RAILWAYS_LATLONS

echo "Filtering for airports and terminals"
VALID_AIRPORT_KEYS="aeroway=aerodrome or aeroway=terminal"
PLANET_AIRPORTS_O5M="planet-airports.o5m"
PLANET_AIRPORTS="planet-airports.osm"

osmfilter $PLANET_O5M --keep="$VALID_AIRPORT_KEYS" --drop-author --drop-version -o=$PLANET_AIRPORTS_O5M
PLANET_AIRPORTS_LATLONS="planet-airports-latlons.o5m"
osmconvert $PLANET_AIRPORTS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_AIRPORTS_LATLONS
PLANET_AIRPORT_POLYGONS="planet-airport-polygons.osm"
osmconvert $PLANET_AIRPORTS_O5M -o=$PLANET_AIRPORT_POLYGONS
rm $PLANET_AIRPORTS_O5M
osmfilter $PLANET_AIRPORTS_LATLONS --keep="$VALID_AIRPORT_KEYS" -o=$PLANET_AIRPORTS
rm $PLANET_AIRPORTS_LATLONS

echo "Filtering for subdivision polygons"
PLANET_SUBDIVISIONS="planet-subdivisions.osm"
SUBDIVISION_AMENITY_TYPES="amenity=university or amenity=college or amenity=school or amenity=hospital"
SUBDIVISION_LANDUSE_TYPES="landuse=residential or landuse=commercial or landuse=industrial or landuse=retail or landuse=military"
SUBDIVISION_PLACE_TYPES="place=allotments or place=city_block or place=block or place=plot or place=subdivision"
osmfilter $PLANET_O5M --keep="( $SUBDIVISION_AMENITY_TYPES or $SUBDIVISION_PLACE_TYPES or $SUBDIVISION_LANDUSE_TYPES )" --drop="( place= and not ( $SUBDIVISION_PLACE_TYPES ) ) or boundary=" --drop-author --drop-version -o=$PLANET_SUBDIVISIONS

echo "Filtering for postal_code polygons"
PLANET_POSTAL_CODES="planet-postcodes.osm"
osmfilter $PLANET_O5M --keep="boundary=postal_code" --drop-author --drop-version -o=$PLANET_POSTAL_CODES


# Venue data set for use in venue classification
echo "Filtering for venue records: `date`"
PLANET_VENUES_O5M="planet-venues.o5m"
osmfilter $PLANET_O5M --keep="( name= and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS or ( $VALID_RAIL_STATION_KEYS and addr:street= and ( wikipedia= or wikipedia:*= ) ) ) )" --drop-author --drop-version -o=$PLANET_VENUES_O5M
PLANET_VENUES_LATLONS="planet-venues-latlons.o5m"
osmconvert $PLANET_VENUES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_VENUES_LATLONS
rm $PLANET_VENUES_O5M
PLANET_VENUES="planet-venues.osm"
osmfilter $PLANET_VENUES_LATLONS --keep="name= and ( ( building= and building!=yes ) or ( $VALID_VENUE_KEYS or ( $VALID_RAIL_STATION_KEYS and addr:street= and ( wikipedia= or wikipedia:*= ) ) ) )" -o=$PLANET_VENUES
rm $PLANET_VENUES_LATLONS

# Categories for building generic queries like "restaurants in Brooklyn"
echo "Filtering for buildings: `date`"
PLANET_BUILDINGS_O5M="planet-buildings.o5m"
VALID_BUILDING_KEYS="building= or building:part="
VALID_BUILDINGS="( ( $VALID_BUILDING_KEYS ) and ( building!=yes or name= or addr:housename= or addr:street= or addr:housenumber= or addr:postcode= ) )"
osmfilter $PLANET_O5M --keep="$VALID_BUILDINGS" --drop-author --drop-version -o=$PLANET_BUILDINGS_O5M
PLANET_BUILDINGS_LATLONS="planet-buildings-latlons.o5m"
osmconvert $PLANET_BUILDINGS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_BUILDINGS_LATLONS
rm $PLANET_BUILDINGS_O5M
PLANET_BUILDINGS="planet-buildings.osm"
osmfilter $PLANET_BUILDINGS_LATLONS --keep="$VALID_BUILDINGS" -o=$PLANET_BUILDINGS
rm $PLANET_BUILDINGS_LATLONS

echo "Filtering for building polygons: `date`"
PLANET_BUILDING_POLYGONS="planet-building-polygons.osm"
osmfilter $PLANET_O5M --keep="( ( building= or building:part= or type=building ) and ( building:levels= or name= or addr:street= or addr:place= or addr:housename= or addr:housenumber= ) )" --drop-author --drop-version -o=$PLANET_BUILDING_POLYGONS


echo "Filtering for amenities: `date`"
PLANET_AMENITIES_O5M="planet-amenities.o5m"
ALL_AMENITIES="aeroway= or amenity= or emergency= or historic= or internet_access= or landuse= or leisure= or man_made= or mountain_pass= or office= or place= or railway= or shop= or tourism="

osmfilter $PLANET_O5M --keep="$ALL_AMENITIES" --drop-author --drop-version -o=$PLANET_AMENITIES_O5M
PLANET_AMENITIES_LATLONS="planet-amenities-latlons.o5m"
osmconvert $PLANET_AMENITIES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_AMENITIES_LATLONS
rm $PLANET_AMENITIES_O5M
PLANET_AMENITIES="planet-amenities.osm"
osmfilter $PLANET_AMENITIES_LATLONS --keep="$ALL_AMENITIES" -o=$PLANET_AMENITIES
rm $PLANET_AMENITIES_LATLONS

echo "Filtering for natural: `date`"
PLANET_NATURAL_O5M="planet-natural.o5m"
VALID_NATURAL_KEYS="natural="
osmfilter $PLANET_O5M --keep="$VALID_NATURAL_KEYS" --drop-author --drop-version -o=$PLANET_NATURAL_O5M
PLANET_NATURAL_LATLONS="planet-natural-latlons.o5m"
osmconvert $PLANET_NATURAL_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_NATURAL_LATLONS
rm $PLANET_NATURAL_O5M
PLANET_NATURAL="planet-natural.osm"
osmfilter $PLANET_NATURAL_LATLONS --keep="$VALID_NATURAL_KEYS" -o=$PLANET_NATURAL
rm $PLANET_NATURAL_LATLONS

echo "Filtering for waterways: `date`"
PLANET_WATERWAYS_O5M="planet-waterways.o5m"
VALID_WATERWAY_KEYS="waterway="
osmfilter $PLANET_O5M --keep="$VALID_WATERWAY_KEYS" --drop-author --drop-version -o=$PLANET_WATERWAYS_O5M
PLANET_WATERWAYS_LATLONS="planet-waterways-latlons.o5m"
osmconvert $PLANET_WATERWAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_WATERWAYS_LATLONS
rm $PLANET_WATERWAYS_O5M
PLANET_WATERWAYS="planet-waterways.osm"
osmfilter $PLANET_WATERWAYS_LATLONS --keep="$VALID_WATERWAY_KEYS" -o=$PLANET_WATERWAYS
rm $PLANET_WATERWAYS_LATLONS


# Streets data set for use in language classification
echo "Filtering ways: `date`"
PLANET_WAYS_O5M="planet-ways.o5m"
VALID_ROAD_TYPES="( highway=motorway or highway=motorway_link or highway=motorway_junction or highway=trunk or highway=trunk_link or highway=primary or highway=primary_link or highway=secondary or highway=secondary_link or highway=tertiary or highway=tertiary_link or highway=unclassified or highway=unclassified_link or highway=residential or highway=residential_link or highway=service or highway=service_link or highway=living_street or highway=pedestrian or highway=steps or highway=cycleway or highway=bridleway or highway=track or highway=road or ( highway=path and ( motorvehicle=yes or motorcar=yes ) ) )"
osmfilter $PLANET_O5M --keep="name= and $VALID_ROAD_TYPES" --drop-relations --drop-author --drop-version -o=$PLANET_WAYS_O5M
PLANET_WAYS_NODES_LATLON="planet-ways-nodes-latlons.o5m"
osmconvert $PLANET_WAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_WAYS_NODES_LATLON
# 10^15 is the offset used for ways and relations with --all-to-nodes; extracts just the ways
PLANET_WAYS_LATLONS="planet-ways-latlons.osm"
PLANET_WAYS="planet-ways.osm"

osmfilter $PLANET_WAYS_NODES_LATLON --keep="name= and ( $VALID_ROAD_TYPES )" -o=$PLANET_WAYS
osmfilter $PLANET_WAYS_O5M --keep="name= and ( $VALID_ROAD_TYPES )" -o=$PLANET_WAYS_LATLONS
rm $PLANET_WAYS_NODES_LATLON
rm $PLANET_WAYS_O5M

rm $PLANET_O5M
rm $JAPAN_O5M

echo "Completed: `date`"

cd $PREV_DIR

1726 scripts/geodata/osm/formatter.py Normal file
File diff suppressed because it is too large

189 scripts/geodata/osm/intersections.py Normal file
@@ -0,0 +1,189 @@
import argparse
import array
import logging
import numpy
import os
import six
import sys
import ujson as json

from bisect import bisect_left
from leveldb import LevelDB
from itertools import izip, groupby

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

from geodata.coordinates.conversion import latlon_to_decimal
from geodata.file_utils import ensure_dir
from geodata.osm.extract import *
from geodata.encoding import safe_decode, safe_encode

DEFAULT_INTERSECTIONS_FILENAME = 'intersections.json'


class OSMIntersectionReader(object):
    def __init__(self, filename, db_dir):
        self.filename = filename

        self.node_ids = array.array('l')

        self.logger = logging.getLogger('osm.intersections')

        # Store way/node properties in a LevelDB to keep memory usage bounded
        ensure_dir(db_dir)
        ways_dir = os.path.join(db_dir, 'ways')
        ensure_dir(ways_dir)
        nodes_dir = os.path.join(db_dir, 'nodes')
        ensure_dir(nodes_dir)
        self.way_props = LevelDB(ways_dir)
        self.node_props = LevelDB(nodes_dir)

        # These form a graph and should always have the same length
        self.intersection_edges_nodes = array.array('l')
        self.intersection_edges_ways = array.array('l')

    def binary_search(self, a, x):
        '''Locate the leftmost value exactly equal to x, or None if not found'''
        i = bisect_left(a, x)
        if i != len(a) and a[i] == x:
            return i
        return None

    def intersections(self):
        '''
        Generator which yields tuples like:

        (node_id, node_props, ways)

        where ways is a list of property dicts (one per distinctly-named way)
        for the ways that meet at the given node.
        '''
|
||||
i = 0
|
||||
|
||||
node_ids = array.array('l')
|
||||
node_counts = array.array('i')
|
||||
|
||||
for element_id, props, deps in parse_osm(self.filename, dependencies=True):
|
||||
props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
|
||||
if element_id.startswith('node'):
|
||||
node_id = long(element_id.split(':')[-1])
|
||||
node_ids.append(node_id)
|
||||
node_counts.append(0)
|
||||
self.node_props.Put(safe_encode(node_id), json.dumps(props))
|
||||
elif element_id.startswith('way'):
|
||||
# Don't care about the ordering of the nodes, and want uniques e.g. for circular roads
|
||||
deps = set(deps)
|
||||
|
||||
# Get node indices by binary search
|
||||
for node_id in deps:
|
||||
try:
|
||||
node_index = self.binary_search(node_ids, node_id)
|
||||
except ValueError:
|
||||
continue
|
||||
if node_index is None:
|
||||
continue
|
||||
node_counts[node_index] += 1
|
||||
|
||||
if i % 1000 == 0 and i > 0:
|
||||
self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
|
||||
i += 1
|
||||
|
||||
for i, count in enumerate(node_counts):
|
||||
if count > 1:
|
||||
self.node_ids.append(node_ids[i])
|
||||
|
||||
del node_ids
|
||||
del node_counts
|
||||
|
||||
i = 0
|
||||
|
||||
for element_id, props, deps in parse_osm(self.filename, dependencies=True):
|
||||
if element_id.startswith('node'):
|
||||
node_id = long(element_id.split(':')[-1])
|
||||
node_index = self.binary_search(self.node_ids, node_id)
|
||||
elif element_id.startswith('way'):
|
||||
props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
|
||||
way_id = long(element_id.split(':')[-1])
|
||||
props['id'] = way_id
|
||||
for node_id in deps:
|
||||
node_index = self.binary_search(self.node_ids, node_id)
|
||||
if node_index is not None:
|
||||
self.intersection_edges_nodes.append(node_id)
|
||||
self.intersection_edges_ways.append(way_id)
|
||||
self.way_props.Put(safe_encode(way_id), json.dumps(props))
|
||||
|
||||
if i % 1000 == 0 and i > 0:
|
||||
self.logger.info('second pass, doing {}s, at {}'.format(element_id.split(':')[0], i))
|
||||
i += 1
|
||||
|
||||
        i = 0

        # Sort the two parallel edge arrays by node id so that all of a
        # node's (node, way) edges are contiguous and can be grouped below
        indices = numpy.argsort(self.intersection_edges_nodes)
        self.intersection_edges_nodes = numpy.fromiter((self.intersection_edges_nodes[i] for i in indices), dtype=numpy.uint64)
        self.intersection_edges_ways = numpy.fromiter((self.intersection_edges_ways[i] for i in indices), dtype=numpy.uint64)
        del indices

        idx = 0

        # The edge arrays were just sorted by node id, so groupby yields
        # each intersection node's edges as one contiguous run
        for node_id, g in groupby(self.intersection_edges_nodes):
            group_len = sum(1 for _ in g)

            node_props = json.loads(self.node_props.Get(safe_encode(node_id)))

            way_ids = self.intersection_edges_ways[idx:idx + group_len]
            all_ways = [json.loads(self.way_props.Get(safe_encode(w))) for w in way_ids]
            way_names = set()
            ways = []
            for way in all_ways:
                # Unnamed ways can't contribute a street name (and indexing
                # 'name' directly would raise KeyError), so skip them
                name = way.get('name')
                if not name or name in way_names:
                    continue
                ways.append(way)
                way_names.add(name)

            idx += group_len

            if i % 1000 == 0 and i > 0:
                self.logger.info('checking intersections, did {}'.format(i))
            i += 1

            if len(ways) > 1:
                node_index = self.binary_search(self.node_ids, node_id)
                yield self.node_ids[node_index], node_props, ways

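    # Worked example of the grouping above on hypothetical sorted edges:
    #   nodes = [5, 5, 9, 9, 9], ways = [100, 101, 200, 201, 202]
    # groupby yields (5, a run of 2) then (9, a run of 3) while idx walks the
    # parallel ways array in lockstep: ways[0:2] for node 5, ways[2:5] for 9.
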
    def create_intersections(self, outfile):
        with open(outfile, 'w') as out:
            for node_id, node_props, ways in self.intersections():
                d = {'id': safe_encode(node_id),
                     'node': node_props,
                     'ways': ways}
                out.write(json.dumps(d) + six.u('\n'))

    @classmethod
    def read_intersections(cls, infile):
        with open(infile) as f:
            for line in f:
                data = json.loads(line.rstrip())
                yield data['id'], data['node'], data['ways']


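# A minimal round-trip sketch (paths are hypothetical): build the JSON file
# once, then stream it back without re-parsing the planet file.
#
#   reader = OSMIntersectionReader('planet-ways-latlons.osm', '/tmp/intersections_db')
#   reader.create_intersections('intersections.json')
#   for node_id, node_props, ways in OSMIntersectionReader.read_intersections('intersections.json'):
#       print(node_id, [way['name'] for way in ways])
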
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('-i', '--input',
                        required=True,
                        help='Path to planet-ways-latlons.osm')

    parser.add_argument('--db-dir',
                        required=True,
                        help='Path to temporary db')

    parser.add_argument('-o', '--out-dir',
                        default=os.getcwd(),
                        help='Output directory')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    reader = OSMIntersectionReader(args.input, args.db_dir)
    reader.create_intersections(os.path.join(args.out_dir, DEFAULT_INTERSECTIONS_FILENAME))
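    # Example invocation (paths hypothetical):
    #   python intersections.py -i planet-ways-latlons.osm \
    #       --db-dir /tmp/intersections_db -o $(OUT_DIR)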
563
scripts/geodata/osm/osm_address_training_data.py
Normal file
563
scripts/geodata/osm/osm_address_training_data.py
Normal file
@@ -0,0 +1,563 @@
# -*- coding: utf-8 -*-
'''
osm_address_training_data.py
----------------------------

This script generates several training sets from OpenStreetMap addresses,
streets, venues and toponyms.

Note: the combined size of all the files created by this script exceeds 100GB,
so if training these models, it is wise to use a server-grade machine with
plenty of disk space. The following commands can be used in parallel to create
all the training sets:

Ways:
python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)

Venues:
python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)

Limited formatted addresses:
python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) -o $(OUT_DIR)

Formatted addresses (tagged):
python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)

Formatted addresses (untagged):
python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f -u --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)

Intersections (after running intersections.py to create the JSON file):
python osm_address_training_data.py -x $(OSM_DIR)/intersections.json -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)

Toponyms:
python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) -o $(OUT_DIR)
'''

import argparse
import csv
import logging
import os
import operator
import random
import re
import sys
import tempfile
import urllib
import ujson as json
import HTMLParser

from collections import defaultdict, OrderedDict
from lxml import etree
from itertools import ifilter, chain, combinations

from shapely.geos import LOG as shapely_geos_logger
shapely_geos_logger.setLevel(logging.CRITICAL)

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.gazetteers import *
from geodata.addresses.components import AddressComponents
from geodata.coordinates.conversion import *
from geodata.language_id.disambiguation import *
from geodata.language_id.sample import sample_random_language
from geodata.i18n.languages import *
from geodata.metro_stations.reverse_geocode import MetroStationReverseGeocoder
from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder
from geodata.osm.extract import *
from geodata.osm.formatter import OSMAddressFormatter
from geodata.places.reverse_geocode import PlaceReverseGeocoder
from geodata.polygons.language_polys import *
from geodata.polygons.reverse_geocode import *
from geodata.i18n.unicode_paths import DATA_DIR

from geodata.csv_utils import *
from geodata.file_utils import *

# Input files
PLANET_ADDRESSES_INPUT_FILE = 'planet-addresses.osm'
PLANET_WAYS_INPUT_FILE = 'planet-ways.osm'
PLANET_VENUES_INPUT_FILE = 'planet-venues.osm'
PLANET_BORDERS_INPUT_FILE = 'planet-borders.osm'

# Output files
WAYS_LANGUAGE_DATA_FILENAME = 'streets_by_language.tsv'
ADDRESS_LANGUAGE_DATA_FILENAME = 'address_streets_by_language.tsv'
TOPONYM_LANGUAGE_DATA_FILENAME = 'toponyms_by_language.tsv'


def normalize_osm_name_tag(tag, script=False):
    norm = tag.rsplit(':', 1)[-1]
    if not script:
        return norm
    return norm.split('_', 1)[0]


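# For example: normalize_osm_name_tag('name:en') returns 'en', and with
# script=True a tag like 'name:zh_pinyin' normalizes to 'zh', stripping the
# script/transliteration suffix so the base language can be matched.
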
def get_language_names(country_rtree, key, value, tag_prefix='name'):
    '''
    Reverse-geocode an OSM element's lat/lon to a country, then assign
    each of its name tags to a language, using explicit name:xx suffixes
    where possible and statistical language disambiguation otherwise.
    Returns (country, {language: [names]}), or (None, None) if no
    confident assignment can be made.
    '''
    if not ('lat' in value and 'lon' in value):
        return None, None

    has_colon = ':' in tag_prefix
    tag_first_component = tag_prefix.split(':')[0]
    tag_last_component = tag_prefix.split(':')[-1]

    try:
        latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
    except Exception:
        return None, None

    osm_country_components = country_rtree.point_in_poly(latitude, longitude, return_all=True)
    country, candidate_languages = country_rtree.country_and_languages_from_components(osm_country_components)
    if not (country and candidate_languages):
        return None, None

    num_langs = len(candidate_languages)
    default_langs = set([l for l, d in candidate_languages if d])
    num_defaults = len(default_langs)
    name_language = defaultdict(list)

    alternate_langs = []

    equivalent_alternatives = defaultdict(list)
    for k, v in value.iteritems():
        if k.startswith(tag_prefix + ':') and normalize_osm_name_tag(k, script=True) in languages:
            lang = k.rsplit(':', 1)[-1]
            alternate_langs.append((lang, v))
            equivalent_alternatives[v].append(lang)

    has_alternate_names = len(alternate_langs)
    # Some countries like Lebanon list things like name:en == name:fr == "Rue Abdel Hamid Karame"
    # Those names should be disambiguated rather than taken at face value
    ambiguous_alternatives = set([k for k, v in equivalent_alternatives.iteritems() if len(v) > 1])

    regional_defaults = 0
    country_defaults = 0
    regional_langs = set()
    country_langs = set()
    for c in osm_country_components:
        _, langs = country_rtree.country_and_languages_from_components([c])
        if 'ISO3166-1:alpha2' not in c:
            regional_defaults += sum((1 for l, d in langs if d))
            regional_langs |= set([l for l, d in langs])
        else:
            country_defaults += sum((1 for l, d in langs if d))
            country_langs |= set([l for l, d in langs])

    ambiguous_already_seen = set()

    for k, v in value.iteritems():
        if k.startswith(tag_prefix + ':'):
            if v not in ambiguous_alternatives:
                norm = normalize_osm_name_tag(k)
                norm_sans_script = normalize_osm_name_tag(k, script=True)
                if norm in languages or norm_sans_script in languages:
                    name_language[norm].append(v)
            elif v not in ambiguous_already_seen:
                langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
                lang = disambiguate_language(v, langs)

                if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
                    name_language[lang].append(v)

                ambiguous_already_seen.add(v)
        elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component:
            if num_langs == 1:
                name_language[candidate_languages[0][0]].append(v)
            else:
                lang = disambiguate_language(v, candidate_languages)
                default_lang = candidate_languages[0][0]

                if lang == AMBIGUOUS_LANGUAGE:
                    return None, None
                elif lang == UNKNOWN_LANGUAGE and num_defaults == 1:
                    name_language[default_lang].append(v)
                elif lang != UNKNOWN_LANGUAGE:
                    if lang != default_lang and lang in country_langs and country_defaults > 1 and regional_defaults > 0 and lang in WELL_REPRESENTED_LANGUAGES:
                        return None, None
                    name_language[lang].append(v)
                else:
                    return None, None

    return country, name_language


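# A minimal sketch of the contract (the tag dict below is hypothetical):
#
#   value = {'lat': '48.8566', 'lon': '2.3522', 'name': 'Rue de Rivoli'}
#   country, name_language = get_language_names(country_rtree, key, value)
#   # => ('fr', {'fr': ['Rue de Rivoli']}) when French is the only candidate
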
def build_ways_training_data(country_rtree, infile, out_dir, abbreviate_streets=True):
    '''
    Creates a training set for language classification using most OSM ways
    (streets) under a fairly lengthy osmfilter definition which attempts to
    identify all roads/ways designated for motor vehicle traffic, which
    is more-or-less what we'd expect to see in addresses.

    The fields are {language, country, street name}. Example:

    ar      ma      شارع فال ولد عمير
    '''
    i = 0
    f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value, deps in parse_osm(infile, allowed_types=WAYS_RELATIONS):
        country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
        if not name_language:
            continue

        for lang, val in name_language.iteritems():
            for v in val:
                for s in v.split(';'):
                    if lang in languages:
                        writer.writerow((lang, country, tsv_string(s)))
                        if not abbreviate_streets:
                            continue
                        abbrev = abbreviate(street_and_synonyms_gazetteer, s, lang)
                        if abbrev != s:
                            writer.writerow((lang, country, tsv_string(abbrev)))
        if i % 1000 == 0 and i > 0:
            print('did {} ways'.format(i))
        i += 1
    f.close()


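# For instance (hypothetical expansion; actual output depends on the
# gazetteer data): abbreviate(street_and_synonyms_gazetteer, u'Main Street', 'en')
# might return u'Main St', in which case both forms are written above.
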
NAME_KEYS = (
    'name',
    'addr:housename',
)

HOUSE_NUMBER_KEYS = (
    'addr:house_number',
    'addr:housenumber',
    'house_number',
)

COUNTRY_KEYS = (
    'country',
    'country_name',
    'addr:country',
    'is_in:country',
    'addr:country_code',
    'country_code',
    'is_in:country_code',
)

POSTAL_KEYS = (
    'postcode',
    'postal_code',
    'addr:postcode',
    'addr:postal_code',
)


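# A minimal sketch of how these key tuples are typically consumed (the
# helper below is hypothetical, not part of the original script): OSM data
# is inconsistent about which tag variant it uses, so take the first match.
def _first_tag_value(value, keys):
    for k in keys:
        v = value.get(k, u'').strip()
        if v:
            return v
    return None

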
def build_toponym_training_data(country_rtree, infile, out_dir):
    '''
    Data set of toponyms by language and country which should assist
    in language classification. OSM tends to use the native language
    by default (e.g. Москва instead of Moscow). Toponyms get messy
    due to factors like colonialism, historical names, name borrowing
    and the shortness of the names generally. In these cases
    we're more strict about what constitutes a valid language for a
    given country.

    Example:
    ja      jp      東京都
    '''
    i = 0
    f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value, deps in parse_osm(infile):
        if not any((k.startswith('name') for k, v in value.iteritems())):
            continue

        try:
            latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
        except Exception:
            continue

        osm_country_components = country_rtree.point_in_poly(latitude, longitude, return_all=True)
        country, candidate_languages = country_rtree.country_and_languages_from_components(osm_country_components)
        if not (country and candidate_languages):
            continue

        name_language = defaultdict(list)

        official = official_languages[country]

        default_langs = set([l for l, default in official.iteritems() if default])

        _, regional_langs = country_rtree.country_and_languages_from_components([c for c in osm_country_components if 'ISO3166-1:alpha2' not in c])

        top_lang = None
        if len(official) > 0:
            top_lang = official.iterkeys().next()

        # E.g. Hindi in India, Urdu in Pakistan
        if top_lang is not None and top_lang not in WELL_REPRESENTED_LANGUAGES and len(default_langs) > 1:
            default_langs -= WELL_REPRESENTED_LANGUAGES

        valid_languages = set([l for l, d in candidate_languages])

        '''
        WELL_REPRESENTED_LANGUAGES are languages like English, French, etc. for which we have a lot of data.
        WELL_REPRESENTED_LANGUAGE_COUNTRIES are more-or-less the "origin" countries for said languages, where
        we can take the place names as examples of the language itself (e.g. place names in France are examples
        of French, whereas place names in much of Francophone Africa tend to get their names from languages
        other than French, even though French is the official language).
        '''
        valid_languages -= set([lang for lang in valid_languages if lang in WELL_REPRESENTED_LANGUAGES and country not in WELL_REPRESENTED_LANGUAGE_COUNTRIES[lang]])

        valid_languages |= default_langs

        if not valid_languages:
            continue

        have_qualified_names = False

        for k, v in value.iteritems():
            if not k.startswith('name:'):
                continue

            norm = normalize_osm_name_tag(k)
            norm_sans_script = normalize_osm_name_tag(k, script=True)

            if norm in languages:
                lang = norm
            elif norm_sans_script in languages:
                lang = norm_sans_script
            else:
                continue

            if lang in valid_languages:
                have_qualified_names = True
                name_language[lang].append(v)

        if not have_qualified_names and top_lang is not None and len(regional_langs) <= 1 and 'name' in value and len(valid_languages) == 1:
            name_language[top_lang].append(value['name'])

        for k, v in name_language.iteritems():
            for s in v:
                s = s.strip()
                if not s:
                    continue
                writer.writerow((k, country, tsv_string(s)))
        if i % 1000 == 0 and i > 0:
            print('did {} toponyms'.format(i))
        i += 1

    f.close()


def build_address_training_data(country_rtree, infile, out_dir, format=False):
    '''
    Creates a training set similar to the ways data but using addr:street tags instead.
    These may be slightly closer to what we'd see in real-life addresses, containing
    variations, some abbreviations (although this is discouraged in OSM), etc.

    Example record:
    eu      es      Errebal kalea
    '''
    i = 0
    f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value, deps in parse_osm(infile):
        country, street_language = get_language_names(country_rtree, key, value, tag_prefix='addr:street')
        if not street_language:
            continue

        for k, v in street_language.iteritems():
            for s in v:
                s = s.strip()
                if not s:
                    continue
                if k in languages:
                    writer.writerow((k, country, tsv_string(s)))
        if i % 1000 == 0 and i > 0:
            print('did {} streets'.format(i))
        i += 1

    f.close()

VENUE_LANGUAGE_DATA_FILENAME = 'names_by_language.tsv'


def build_venue_training_data(country_rtree, infile, out_dir):
    '''
    Data set of venue names by language, country and venue type. Fields:
    {language, country, venue type (e.g. amenity:school), name}
    '''
    i = 0

    f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w')
    writer = csv.writer(f, 'tsv_no_quote')

    for key, value, deps in parse_osm(infile):
        country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
        if not name_language:
            continue

        venue_type = None
        # The function parameter is also named key, so use a different
        # name for the venue tag being checked
        for tag_key in (u'amenity', u'building'):
            amenity = value.get(tag_key, u'').strip()
            if amenity in ('yes', 'y'):
                continue

            if amenity:
                venue_type = u':'.join([tag_key, amenity])
                break

        if venue_type is None:
            continue

        for k, v in name_language.iteritems():
            for s in v:
                s = s.strip()
                if not s:
                    continue
                if k in languages:
                    writer.writerow((k, country, safe_encode(venue_type), tsv_string(s)))
        if i % 1000 == 0 and i > 0:
            print('did {} venues'.format(i))
        i += 1

    f.close()

if __name__ == '__main__':
    # Handle argument parsing here
    parser = argparse.ArgumentParser()

    parser.add_argument('-s', '--streets-file',
                        help='Path to planet-ways.osm')

    parser.add_argument('--unabbreviated',
                        action='store_true',
                        default=False,
                        help='Use unabbreviated street names for token counts')

    parser.add_argument('-a', '--address-file',
                        help='Path to planet-addresses.osm')

    parser.add_argument('-v', '--venues-file',
                        help='Path to planet-venues.osm')

    parser.add_argument('-b', '--borders-file',
                        help='Path to planet-borders.osm')

    parser.add_argument('-f', '--format',
                        action='store_true',
                        default=False,
                        help='Save formatted addresses (slow)')

    parser.add_argument('-u', '--untagged',
                        action='store_true',
                        default=False,
                        help='Save untagged formatted addresses (slow)')

    parser.add_argument('-l', '--limited-addresses',
                        action='store_true',
                        default=False,
                        help='Save formatted addresses without house names or country (slow)')

    parser.add_argument('-p', '--place-nodes-file',
                        help='Path to planet-admin-nodes.osm')

    parser.add_argument('-t', '--temp-dir',
                        default=tempfile.gettempdir(),
                        help='Temp directory to use')

    parser.add_argument('-x', '--intersections-file',
                        help='Path to intersections.json produced by intersections.py')

    parser.add_argument('--country-rtree-dir',
                        required=True,
                        help='Country RTree directory')

    parser.add_argument('--rtree-dir',
                        default=None,
                        help='OSM reverse geocoder RTree directory')

    parser.add_argument('--places-index-dir',
                        default=None,
                        help='Places index directory')

    parser.add_argument('--metro-stations-index-dir',
                        default=None,
                        help='Metro stations reverse geocoder directory')

    parser.add_argument('--subdivisions-rtree-dir',
                        default=None,
                        help='Subdivisions reverse geocoder RTree directory')

    parser.add_argument('--buildings-rtree-dir',
                        default=None,
                        help='Buildings reverse geocoder RTree directory')

    parser.add_argument('--neighborhoods-rtree-dir',
                        default=None,
                        help='Neighborhoods reverse geocoder RTree directory')

    parser.add_argument('-o', '--out-dir',
                        default=os.getcwd(),
                        help='Output directory')

    args = parser.parse_args()

    country_rtree = OSMCountryReverseGeocoder.load(args.country_rtree_dir)

    osm_rtree = None
    if args.rtree_dir:
        osm_rtree = OSMReverseGeocoder.load(args.rtree_dir)

    neighborhoods_rtree = None
    if args.neighborhoods_rtree_dir:
        neighborhoods_rtree = NeighborhoodReverseGeocoder.load(args.neighborhoods_rtree_dir)

    places_index = None
    if args.places_index_dir:
        places_index = PlaceReverseGeocoder.load(args.places_index_dir)

    metro_stations_index = None
    if args.metro_stations_index_dir:
        metro_stations_index = MetroStationReverseGeocoder.load(args.metro_stations_index_dir)

    subdivisions_rtree = None
    if args.subdivisions_rtree_dir:
        subdivisions_rtree = OSMSubdivisionReverseGeocoder.load(args.subdivisions_rtree_dir)

    buildings_rtree = None
    if args.buildings_rtree_dir:
        buildings_rtree = OSMBuildingReverseGeocoder.load(args.buildings_rtree_dir)

    # Can parallelize
    if args.streets_file and not args.format:
        build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
    if args.borders_file:
        build_toponym_training_data(country_rtree, args.borders_file, args.out_dir)
    if args.venues_file:
        build_venue_training_data(country_rtree, args.venues_file, args.out_dir)

    if args.address_file or args.intersections_file:
        if osm_rtree is None:
            parser.error('--rtree-dir required for formatted addresses')
        elif neighborhoods_rtree is None:
            parser.error('--neighborhoods-rtree-dir required for formatted addresses')
        elif places_index is None:
            parser.error('--places-index-dir required for formatted addresses')

    if args.address_file and args.format:
        components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
        osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
    if args.address_file and args.limited_addresses:
        components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
        osm_formatter.build_limited_training_data(args.address_file, args.out_dir)

    if args.place_nodes_file and args.format:
        components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
        osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged)

    if args.intersections_file and args.format:
        components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
        osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged)

    if args.streets_file and args.format:
        components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
        osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
        osm_formatter.build_ways_training_data(args.streets_file, args.out_dir, tag_components=not args.untagged)