Initial fork commit

commit 2d238cd339
2025-09-06 22:03:29 -04:00
1748 changed files with 932506 additions and 0 deletions


@@ -0,0 +1,333 @@
'''
admin_boundaries.py
-------------------
Generates polygons from OpenStreetMap relations
'''
import array
import logging
import six
from bisect import bisect_left
from collections import defaultdict, OrderedDict
from itertools import izip, combinations
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir
from geodata.graph.scc import strongly_connected_components
from geodata.i18n.languages import osm_admin1_ids
from geodata.math.floats import isclose
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import *
class OSMPolygonReader(object):
'''
OSM relations are stored with pointers to their bounding ways,
which in turn store pointers to their constituent nodes, and the
planet XML file is far too large to be parsed in memory.
For the purposes of constructing (multi)polygons, we need lists
of lat/lon coordinates for the edges of each outer and inner polygon
that form the overall boundary (this allows for holes, e.g.
Lesotho/South Africa, and multiple disjoint polygons such as islands).
This class creates a compact representation of the intermediate
lookup tables and coordinates using Python's typed array module,
which stores C-sized ints, doubles, etc. in a dynamic array. It's like
a list but smaller and faster for arrays of numbers, and doesn't require
pulling in numpy as a dependency when all we want is the space savings.
One nice property of the .osm files generated by osmfilter is that
nodes/ways/relations are stored in sorted order, so we don't have to
pre-sort the lookup arrays before performing binary search.
'''
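# A minimal sketch of the compressed layout used below (hypothetical values):
# way_indptr = [0, 3, 5] means the node ids of way 0 live at way_deps[0:3]
# and those of way 1 at way_deps[3:5]; way_coords holds the matching
# interleaved lon/lat doubles, so the coordinates of way i occupy
# way_coords[2 * way_indptr[i]:2 * way_indptr[i + 1]].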
def __init__(self, filename):
self.filename = filename
self.node_ids = array.array('l')
self.way_ids = array.array('l')
self.coords = array.array('d')
self.nodes = {}
self.way_deps = array.array('l')
self.way_coords = array.array('d')
self.way_indptr = array.array('i', [0])
self.logger = logging.getLogger('osm_admin_polys')
def binary_search(self, a, x):
'''Locate the leftmost value exactly equal to x'''
i = bisect_left(a, x)
if i != len(a) and a[i] == x:
return i
raise ValueError
def node_coordinates(self, coords, indptr, idx):
start_index = indptr[idx] * 2
end_index = indptr[idx + 1] * 2
node_coords = coords[start_index:end_index]
return zip(node_coords[::2], node_coords[1::2])
def sparse_deps(self, data, indptr, idx):
return [data[i] for i in xrange(indptr[idx], indptr[idx + 1])]
def create_polygons(self, ways):
'''
Polygons (relations) are effectively stored as lists of
line segments (ways) and there may be more than one polygon
(island chains, overseas territories).
If we view the line segments as a graph (any two ways which
share a terminal node are connected), then the process of
constructing polygons reduces to finding strongly connected
components in a graph.
https://en.wikipedia.org/wiki/Strongly_connected_component
Note that even though there may be hundreds of thousands of
points in a complex polygon like a country boundary, we only
need to build a graph of connected ways, which will be many
times smaller and take much less time to traverse.
'''
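# A sketch of the graph construction with hypothetical ids: ways
# A = [n1, n2], B = [n2, n3] and C = [n3, n1] share terminal nodes, so the
# adjacency becomes {A: [B, C], B: [A, C], C: [A, B]}, one strongly
# connected component whose ways stitch together into a single closed ring.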
end_nodes = defaultdict(list)
polys = []
way_indices = {}
start_end_nodes = {}
for way_id in ways:
# Find the way position via binary search
try:
way_index = self.binary_search(self.way_ids, way_id)
except ValueError:
continue
# Cache the way index
way_indices[way_id] = way_index
# way_indptr is a compressed index into way_deps/way_coords
# way_index i is stored at indices way_indptr[i]:way_indptr[i+1]
# in way_deps
start_node_id = self.way_deps[self.way_indptr[way_index]]
end_node_id = self.way_deps[self.way_indptr[way_index + 1] - 1]
start_end_nodes[way_id] = (start_node_id, end_node_id)
if start_node_id == end_node_id:
way_node_points = self.node_coordinates(self.way_coords, self.way_indptr, way_index)
polys.append(way_node_points)
continue
end_nodes[start_node_id].append(way_id)
end_nodes[end_node_id].append(way_id)
# Way graph for a single polygon; no need to be as concerned about storage here
way_graph = defaultdict(OrderedDict)
for node_id, ways in end_nodes.iteritems():
for w1, w2 in combinations(ways, 2):
way_graph[w1][w2] = None
way_graph[w2][w1] = None
way_graph = {v: w.keys() for v, w in way_graph.iteritems()}
for component in strongly_connected_components(way_graph):
poly_nodes = []
seen = set()
if not component:
continue
q = [(c, False) for c in component[:1]]
while q:
way_id, reverse = q.pop()
way_index = way_indices[way_id]
node_coords = self.node_coordinates(self.way_coords, self.way_indptr, way_index)
head, tail = start_end_nodes[way_id]
if reverse:
node_coords = node_coords[::-1]
head, tail = tail, head
for neighbor in way_graph[way_id]:
if neighbor in seen:
continue
neighbor_head, neighbor_tail = start_end_nodes[neighbor]
neighbor_reverse = neighbor_head == head or neighbor_tail == tail
q.append((neighbor, neighbor_reverse))
way_start = 0 if q else 1
poly_nodes.extend(node_coords[way_start:-1])
seen.add(way_id)
polys.append(poly_nodes)
return polys
def include_polygon(self, props):
raise NotImplementedError('Children must implement')
def polygons(self, properties_only=False):
'''
Generator which yields tuples like:
(relation_id, properties, admin_center, outer_polygons, inner_polygons)
admin_center is the properties of the relation's admin_centre node
when exactly one is present (an empty dict otherwise). At this point
a polygon is a list of coordinate tuples, suitable for passing to
shapely's Polygon constructor but which may be used for other purposes.
outer_polygons is a list of the exterior polygons for this
boundary. inner_polygons is a list of "holes" in the exterior
polygons, although donuts and donut-holes need to be matched
by the caller using something like shapely's contains.
'''
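# A usage sketch, assuming shapely is available (OSMAdminPolygonReader is
# defined near the bottom of this module):
# >>> from shapely.geometry import Polygon
# >>> reader = OSMAdminPolygonReader('planet-admin-borders.osm')
# >>> for elem_id, props, admin_center, outers, inners in reader.polygons():
# ...     exteriors = [Polygon(outer) for outer in outers]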
i = 0
for element_id, props, deps in parse_osm(self.filename, dependencies=True):
props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
if element_id.startswith('node'):
node_id = long(element_id.split(':')[-1])
lat = props.get('lat')
lon = props.get('lon')
if lat is None or lon is None:
continue
lat, lon = latlon_to_decimal(lat, lon)
if lat is None or lon is None:
continue
if isclose(lat, 90.0):
lat = 89.999
if isclose(lon, 180.0):
lon = 179.999
if 'name' in props and 'place' in props:
self.nodes[node_id] = props
# Nodes are stored in a sorted array, coordinate indices are simply
# [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
# Note that the pairs are lon, lat instead of lat, lon for geometry purposes
self.coords.append(lon)
self.coords.append(lat)
self.node_ids.append(node_id)
elif element_id.startswith('way'):
way_id = long(element_id.split(':')[-1])
# Get node indices by binary search
try:
node_indices = [self.binary_search(self.node_ids, node_id) for node_id in deps]
except ValueError:
continue
# Way ids stored in a sorted array
self.way_ids.append(way_id)
# way_deps is the list of dependent node ids
# way_coords is a copy of coords indexed by way ids
for node_id, node_index in izip(deps, node_indices):
self.way_deps.append(node_id)
self.way_coords.append(self.coords[node_index * 2])
self.way_coords.append(self.coords[node_index * 2 + 1])
self.way_indptr.append(len(self.way_deps))
if deps[0] == deps[-1] and self.include_polygon(props):
way_id_offset = WAY_OFFSET + way_id
if not properties_only:
outer_polys = self.create_polygons([way_id])
inner_polys = []
yield way_id_offset, props, {}, outer_polys, inner_polys
else:
yield way_id_offset, props, {}
elif element_id.startswith('relation'):
if self.node_ids is not None:
self.node_ids = None
if self.coords is not None:
self.coords = None
relation_id = long(element_id.split(':')[-1])
if len(deps) == 0 or not self.include_polygon(props) or props.get('type', '').lower() == 'multilinestring':
continue
outer_ways = []
inner_ways = []
admin_centers = []
for elem_id, elem_type, role in deps:
if role in ('outer', '') and elem_type == 'way':
outer_ways.append(elem_id)
elif role == 'inner' and elem_type == 'way':
inner_ways.append(elem_id)
elif role == 'admin_centre' and elem_type == 'node':
val = self.nodes.get(long(elem_id))
if val is not None:
val['type'] = 'node'
val['id'] = long(elem_id)
admin_centers.append(val)
elif role == 'label' and elem_type == 'node':
val = self.nodes.get(long(elem_id))
if val is not None and val.get('name', six.u('')).lower() == props.get('name', six.u('')).lower():
props.update({k: v for k, v in six.iteritems(val)
if k not in props})
admin_center = {}
if len(admin_centers) == 1:
admin_center = admin_centers[0]
relation_id_offset = RELATION_OFFSET + relation_id
if not properties_only:
outer_polys = self.create_polygons(outer_ways)
inner_polys = self.create_polygons(inner_ways)
yield relation_id_offset, props, admin_center, outer_polys, inner_polys
else:
yield relation_id_offset, props, admin_center
if i % 1000 == 0 and i > 0:
self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
i += 1
class OSMAdminPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'boundary' in props or 'place' in props
class OSMSubdivisionPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'landuse' in props or 'place' in props or 'amenity' in props
class OSMBuildingPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'building' in props or 'building:part' in props or props.get('type', None) == 'building'
class OSMCountryPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'ISO3166-1:alpha2' in props or 'ISO3166-2' in props or (props.get('type', 'relation'), safe_encode(props.get('id', ''))) in osm_admin1_ids
class OSMNeighborhoodPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return osm_definitions.meets_definition(props, osm_definitions.NEIGHBORHOOD)
class OSMPostalCodesPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return props.get('boundary') == 'postal_code'
class OSMAirportsPolygonReader(OSMPolygonReader):
def include_polygon(self, props):
return 'aerodrome' in props


@@ -0,0 +1,184 @@
import collections
import os
import six
import yaml
from copy import deepcopy
from geodata.address_formatting.formatter import AddressFormatter
from geodata.configs.utils import recursive_merge, DoesNotExist
from geodata.encoding import safe_encode
this_dir = os.path.realpath(os.path.dirname(__file__))
OSM_BOUNDARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'boundaries', 'osm')
class OSMAddressComponents(object):
'''
Keeps a map of OSM keys and values to the standard components
of an address like city, state, etc. used for address formatting.
When we reverse geocode a point, it will fall into a number of
polygons, and we simply need to assign the names of said polygons
to an address field.
'''
ADMIN_LEVEL = 'admin_level'
# These keys override country-level mappings
global_keys_override = {
'place': {
'island': AddressFormatter.ISLAND,
'islet': AddressFormatter.ISLAND,
'municipality': AddressFormatter.CITY,
'city': AddressFormatter.CITY,
'town': AddressFormatter.CITY,
'township': AddressFormatter.CITY,
'village': AddressFormatter.CITY,
'hamlet': AddressFormatter.CITY,
'suburb': AddressFormatter.SUBURB,
'quarter': AddressFormatter.SUBURB,
'neighbourhood': AddressFormatter.SUBURB
},
'border_type': {
'city': AddressFormatter.CITY
}
}
# These keys are fallbacks in case we haven't added a country config or there is no admin_level tag
global_keys = {
'place': {
'country': AddressFormatter.COUNTRY,
'state': AddressFormatter.STATE,
'region': AddressFormatter.STATE,
'province': AddressFormatter.STATE,
'county': AddressFormatter.STATE_DISTRICT,
},
'gnis:class': {
'populated place': AddressFormatter.CITY,
}
}
def __init__(self, boundaries_dir=OSM_BOUNDARIES_DIR):
self.config = {}
self.use_admin_center = {}
for filename in os.listdir(boundaries_dir):
if not filename.endswith('.yaml'):
continue
country_code = filename.rsplit('.yaml', 1)[0]
data = yaml.load(open(os.path.join(boundaries_dir, filename)))
for prop, values in six.iteritems(data):
if not hasattr(values, 'items'):
# non-dict key
continue
for k, v in values.iteritems():
if isinstance(v, six.string_types) and v not in AddressFormatter.address_formatter_fields:
raise ValueError(u'Invalid value in {} for prop={}, key={}: {}'.format(filename, prop, k, v))
if prop == 'overrides':
self.use_admin_center.update({(r['type'], safe_encode(r['id'])): r.get('probability', 1.0) for r in values.get('use_admin_center', [])})
containing_overrides = values.get('contained_by', {})
if not containing_overrides:
continue
for id_type, vals in six.iteritems(containing_overrides):
for element_id in vals:
override_config = vals[element_id]
config = deepcopy(data)
config.pop('overrides')
recursive_merge(config, override_config)
vals[element_id] = config
self.config[country_code] = data
def component(self, country, prop, value):
component = self.global_keys_override.get(prop, {}).get(value, None)
if component is not None:
return component
component = self.config.get(country, {}).get(prop, {}).get(value, None)
if component is not None:
return component
return self.global_keys.get(prop, {}).get(value, None)
def component_from_properties(self, country, properties, containing=(), global_keys=True):
country_config = self.config.get(country, {})
config = country_config
overrides = country_config.get('overrides')
if overrides:
id_overrides = overrides.get('id', {})
element_type = properties.get('type')
element_id = properties.get('id')
override_value = id_overrides.get(element_type, {})
element_id = six.binary_type(element_id or '')
if element_id in override_value:
return override_value[element_id]
contained_by_overrides = overrides.get('contained_by')
if contained_by_overrides and containing:
# Note, containing should be passed in from smallest to largest
for containing_type, containing_id in containing:
override_config = contained_by_overrides.get(containing_type, {}).get(six.binary_type(containing_id or ''), None)
if override_config:
config = override_config
break
values = [(k.lower(), v.lower()) for k, v in six.iteritems(properties) if isinstance(v, six.string_types)]
global_overrides_last = config.get('global_overrides_last', False)
# place=city, place=suburb, etc. override per-country boundaries
if not global_overrides_last:
for k, v in values:
containing_component = self.global_keys_override.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
if k != self.ADMIN_LEVEL and k in config:
containing_component = config.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
# admin_level tags are mapped per country
for k, v in values:
containing_component = config.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
# other place keys like place=state, etc. serve as a backup
# when no admin_level tags are available
for k, v in values:
containing_component = self.global_keys.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
if global_overrides_last:
for k, v in values:
containing_component = self.global_keys_override.get(k, {}).get(v, DoesNotExist)
if containing_component is not DoesNotExist:
return containing_component
return None
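# A precedence sketch with hypothetical input: for country='fr' and
# properties={'admin_level': '8', 'boundary': 'administrative'}, the global
# place/border_type overrides are consulted first, then the per-country
# mappings loaded from the country's YAML config, and finally the
# global_keys fallbacks; the first match wins (unless the config sets
# global_overrides_last, which defers the global overrides to the end).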
osm_address_components = OSMAddressComponents()


@@ -0,0 +1,89 @@
import os
import re
import six
from collections import defaultdict
from geodata.graph.topsort import topsort
this_dir = os.path.realpath(os.path.dirname(__file__))
DEFAULT_SCRIPT_PATH = os.path.join(this_dir, 'fetch_osm_address_data.sh')
valid_key_regex = re.compile('VALID_(.*?)_KEYS="(.*)"')
variable_regex = re.compile(r'\$VALID_(.*?)_KEYS(?=\b)')
kv_regex = re.compile('([^\s]*)=([^\s]*)')
class OSMDefinitions(object):
ALL = '*'
ADMIN_BORDER = 'admin_border'
ADMIN_NODE = 'admin_node'
AEROWAY = 'aeroway'
AMENITY = 'amenity'
BUILDING = 'building'
HISTORIC = 'historic'
LANDUSE = 'landuse'
NATURAL = 'natural'
LOCALITY = 'locality'
NEIGHBORHOOD = 'neighborhood'
EXTENDED_NEIGHBORHOOD = 'extended_neighborhood'
OFFICE = 'office'
PLACE = 'place'
POPULATED_PLACE = 'populated_place'
SHOP = 'shop'
TOURISM = 'tourism'
VENUE = 'venue'
WATERWAY = 'waterway'
def __init__(self, filename=DEFAULT_SCRIPT_PATH):
script = open(filename).read()
dependencies = defaultdict(list)
definitions = {}
matches = valid_key_regex.findall(script)
match_text = {d.lower(): t for d, t in matches}
for definition, text in matches:
variables = variable_regex.findall(text)
if not variables:
dependencies[definition.lower()] = []
for v in variables:
dependencies[definition.lower()].append(v.lower())
for definition in topsort(dependencies):
definition = definition.lower()
text = match_text[definition]
variables = variable_regex.findall(text)
for v in variables:
v = v.lower()
text = text.replace('$VALID_{}_KEYS'.format(v.upper()), match_text[v])
kvs = defaultdict(set)
for k, v in kv_regex.findall(text):
if v != '':
kvs[k].add(v.lower())
else:
kvs[k].add(self.ALL)
definitions[definition] = kvs
self.definitions = definitions
def meets_definition(self, props, category):
defs = self.definitions.get(category, {})
if not defs:
return False
elif self.ALL in defs:
return True
for k, v in six.iteritems(props):
if v.lower() in defs.get(k.lower(), set()):
return True
return False
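# Example with a hypothetical parsed definition: if definitions contains
# {'neighborhood': {'place': {'neighbourhood', 'neighborhood'}}}, then
# meets_definition({'place': 'Neighbourhood'}, 'neighborhood') is True,
# since keys and values are compared case-insensitively.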
osm_definitions = OSMDefinitions()


@@ -0,0 +1,207 @@
'''
geodata.osm.extract
-------------------
Extracts nodes/ways/relations, their metadata and dependencies
from .osm XML files.
'''
import re
import six
import urllib
import HTMLParser
from collections import OrderedDict
from lxml import etree
from geodata.csv_utils import unicode_csv_reader
from geodata.text.normalize import normalize_string, NORMALIZE_STRING_DECOMPOSE, NORMALIZE_STRING_LATIN_ASCII
from geodata.encoding import safe_decode, safe_encode
WAY_OFFSET = 10 ** 15
RELATION_OFFSET = 2 * 10 ** 15
NODE = 'node'
WAY = 'way'
RELATION = 'relation'
ALL_OSM_TAGS = set([NODE, WAY, RELATION])
WAYS_RELATIONS = set([WAY, RELATION])
OSM_NAME_TAGS = (
'name',
'alt_name',
'int_name',
'nat_name',
'reg_name',
'loc_name',
'official_name',
'commonname',
'common_name',
'place_name',
'short_name',
)
OSM_BASE_NAME_TAGS = (
'tiger:name_base',
)
def parse_osm(filename, allowed_types=ALL_OSM_TAGS, dependencies=False):
'''
Parse a file in .osm format iteratively, generating tuples like:
('node:1', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), []),
('node:2', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), []),
('node:3', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), []),
('node:4', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), []),
('way:4444', OrderedDict([('name', 'Main Street')]), [1, 2, 3, 4])
The third element holds the node/member dependencies when
dependencies=True and is None otherwise.
'''
f = open(filename)
parser = etree.iterparse(f)
single_type = len(allowed_types) == 1
for (_, elem) in parser:
elem_id = long(elem.attrib.pop('id', 0))
item_type = elem.tag
if elem_id >= WAY_OFFSET and elem_id < RELATION_OFFSET:
elem_id -= WAY_OFFSET
item_type = 'way'
elif elem_id >= RELATION_OFFSET:
elem_id -= RELATION_OFFSET
item_type = 'relation'
if item_type in allowed_types:
attrs = OrderedDict(elem.attrib)
attrs['type'] = item_type
attrs['id'] = safe_encode(elem_id)
top_level_attrs = set(attrs)
deps = [] if dependencies else None
for e in elem.getchildren():
if e.tag == 'tag':
# Prevent user-defined lat/lon keys from overriding the lat/lon on the node
key = e.attrib['k']
if key not in top_level_attrs:
attrs[key] = e.attrib['v']
elif dependencies and item_type == 'way' and e.tag == 'nd':
deps.append(long(e.attrib['ref']))
elif dependencies and item_type == 'relation' and e.tag == 'member' and 'role' in e.attrib:
deps.append((long(e.attrib['ref']), e.attrib.get('type'), e.attrib['role']))
key = elem_id if single_type else '{}:{}'.format(item_type, elem_id)
yield key, attrs, deps
if elem.tag in ALL_OSM_TAGS:
elem.clear()
while elem.getprevious() is not None:
del elem.getparent()[0]
def osm_type_and_id(element_id):
element_id = long(element_id)
if element_id >= RELATION_OFFSET:
id_type = RELATION
element_id -= RELATION_OFFSET
elif element_id >= WAY_OFFSET:
id_type = WAY
element_id -= WAY_OFFSET
else:
id_type = NODE
return id_type, element_id
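# Example: ids produced by osmconvert --all-to-nodes are offset by element
# type, so osm_type_and_id(10 ** 15 + 123) == ('way', 123) and
# osm_type_and_id(2 * 10 ** 15 + 123) == ('relation', 123).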
apposition_regex = re.compile('(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)
html_parser = HTMLParser.HTMLParser()
def normalize_wikipedia_title(title):
match = apposition_regex.match(title)
if match:
title = match.group(1)
title = safe_decode(title)
title = html_parser.unescape(title)
title = urllib.unquote_plus(title)
return title.replace(u'_', u' ').strip()
def osm_wikipedia_title_and_language(key, value):
language = None
if u':' in key:
key, language = key.rsplit(u':', 1)
if u':' in value:
possible_language = value.split(u':', 1)[0]
if len(possible_language) == 2 and language is None:
language = possible_language
value = value.rsplit(u':', 1)[-1]
return normalize_wikipedia_title(value), language
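# Example: osm_wikipedia_title_and_language('wikipedia', u'fr:Paris (ville)')
# returns (u'Paris', u'fr'); the language prefix is split off the value and
# normalize_wikipedia_title strips the parenthesized apposition.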
non_breaking_dash = six.u('[-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]')
simple_number = six.u('(?:{})?[0-9]+(?:\.[0-9]+)?').format(non_breaking_dash)
simple_number_regex = re.compile(simple_number, re.UNICODE)
non_breaking_dash_regex = re.compile(non_breaking_dash, re.UNICODE)
number_range_regex = re.compile(six.u('({}){}({})').format(simple_number, non_breaking_dash, simple_number), re.UNICODE)
letter_range_regex = re.compile(r'([^\W\d_]){}([^\W\d_])'.format(non_breaking_dash.encode('unicode-escape')), re.UNICODE)
number_split_regex = re.compile('[,;]')
def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
value = normalize_string(value, string_options=NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE)
numbers = []
values = number_split_regex.split(value)
for val in values:
val = val.strip()
match = number_range_regex.match(val)
if match:
start_num, end_num = match.groups()
start_num_len = len(start_num)
zfill = 0
if start_num.startswith('0'):
zfill = start_num_len
try:
start_num = int(start_num)
end_num = int(end_num)
if end_num > start_num:
if end_num - start_num > max_range:
end_num = start_num + max_range
for i in xrange(start_num, end_num + 1):
numbers.append(safe_decode(i).zfill(zfill))
else:
numbers.append(val.strip().zfill(zfill))
continue
except (TypeError, ValueError):
numbers.append(safe_decode(val).strip().zfill(zfill))
continue
else:
letter_match = letter_range_regex.match(val)
if letter_match and parse_letter_range:
start_num, end_num = letter_match.groups()
start_num = ord(start_num)
end_num = ord(end_num)
if end_num > start_num:
if end_num - start_num > max_range:
end_num = start_num + max_range
for i in xrange(start_num, end_num + 1):
numbers.append(six.unichr(i))
else:
numbers.extend([six.unichr(start_num), six.unichr(end_num)])
continue
else:
numbers.append(safe_decode(val.strip()))
return numbers
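# Examples: parse_osm_number_range(u'2-8') expands to [u'2', u'3', ..., u'8'],
# u'10;12' splits into [u'10', u'12'], and u'A-C' yields [u'A', u'B', u'C']
# when parse_letter_range=True (the default).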


@@ -0,0 +1,282 @@
#!/usr/bin/env bash
: '
fetch_osm_address_data.sh
-------------------------
Shell script to download OSM planet and derive inputs
for language detection and address parser training set
construction.
Usage: ./fetch_osm_address_data.sh out_dir
'
if [ "$#" -ge 1 ]; then
OUT_DIR=$1
else
OUT_DIR=`pwd`
fi
set -e
THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
RESOURCES_DIR=$THIS_DIR/../../../resources
ADMIN1_FILE=$RESOURCES_DIR/language/regional/adm1.tsv
# Check for osmfilter and osmconvert
if ! type -P osmfilter osmconvert > /dev/null; then
cat << EOF
ERROR: osmfilter and osmconvert are required
On Debian/Ubuntu:
sudo apt-get install osmctools
Or to compile:
wget -O - http://m.m.i24.cc/osmfilter.c |cc -x c - -O3 -o osmfilter
wget -O - http://m.m.i24.cc/osmconvert.c | cc -x c - -lz -O3 -o osmconvert
EOF
exit 127
fi
PREV_DIR=`pwd`
cd $OUT_DIR
# Download planet as PBF
# TODO: currently uses single mirror, randomly choose one instead
echo "Started OSM download: `date`"
PLANET_PBF="planet-latest.osm.pbf"
JAPAN_PBF="japan-latest.osm.pbf"
wget --quiet http://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-latest.osm.pbf -O $OUT_DIR/$PLANET_PBF &
wget --quiet http://download.geofabrik.de/asia/japan-latest.osm.pbf -O $OUT_DIR/$JAPAN_PBF &
wait
echo "Converting to o5m: `date`"
PLANET_O5M="planet-latest.o5m"
JAPAN_O5M="japan-latest.o5m"
# Needs to be in O5M format for some of the subsequent steps to work, whereas PBF is smaller for the download
osmconvert $PLANET_PBF -o=$PLANET_O5M &
osmconvert $JAPAN_PBF -o=$JAPAN_O5M &
wait
rm $PLANET_PBF
rm $JAPAN_PBF
VALID_AEROWAY_KEYS="aeroway=aerodrome"
VALID_AMENITY_KEYS="amenity=ambulance_station or amenity=animal_boarding or amenity=animal_shelter or amenity=arts_centre or amenity=auditorium or amenity=baby_hatch or amenity=bank or amenity=bar or amenity=bbq or amenity=biergarten or amenity=boathouse or amenity=boat_rental or amenity=boat_sharing or amenity=boat_storage or amenity=brothel or amenity=bureau_de_change or amenity=bus_station or amenity=cafe or amenity=car_rental or amenity=car_sharing or amenity=car_wash or amenity=casino or amenity=cemetery or amenity=charging_station or amenity=cinema or amenity=childcare or amenity=clinic or amenity=club or amenity=clock or amenity=college or amenity=community_center or amenity=community_centre or amenity=community_hall or amenity=concert_hall or amenity=conference_centre or amenity=courthouse or amenity=coworking_space or amenity=crematorium or amenity=crypt or amenity=culture_center or amenity=dancing_school or amenity=dentist or amenity=dive_centre or amenity=doctors or amenity=dojo or amenity=dormitory or amenity=driving_school or amenity=embassy or amenity=emergency_service or amenity=events_venue or amenity=exhibition_centre or amenity=fast_food or amenity=ferry_terminal or amenity=festival_grounds or amenity=fire_station or amenity=food_count or amenity=fountain or amenity=gambling or amenity=game_feeding or amenity=grave_yard or amenity=greenhouse or amenity=gym or amenity=hall or amenity=health_centre or amenity=hospice or amenity=hospital or amenity=hotel or amenity=hunting_stand or amenity=ice_cream or amenity=internet_cafe or amenity=kindergarten or amenity=kiosk or amenity=kneipp_water_cure or amenity=language_school or amenity=lavoir or amenity=library or amenity=love_hotel or amenity=market or amenity=marketplace or amenity=medical_centre or amenity=mobile_money_agent or amenity=monastery or amenity=money_transfer or amenity=mortuary or amenity=mountain_rescue or amenity=music_school or amenity=music_venue or amenity=nightclub or amenity=nursery or amenity=nursing_home or amenity=office or amenity=parish_hall or amenity=park or amenity=pharmacy or amenity=planetarium or amenity=place_of_worship or amenity=police or amenity=post_office or amenity=preschool or amenity=prison or amenity=pub or amenity=public_bath or amenity=public_bookcase or amenity=public_building or amenity=public_facility or amenity=public_hall or amenity=public_market or amenity=ranger_station or amenity=refugee_housing or amenity=register_office or amenity=research_institute or amenity=rescue_station or amenity=residential or amenity=Residential or amenity=restaurant or amenity=retirement_home or amenity=sacco or amenity=sanitary_dump_station or amenity=sanitorium or amenity=sauna or amenity=school or amenity=shelter or amenity=shop or amenity=shopping or amenity=shower or amenity=ski_rental or amenity=ski_school or amenity=social_centre or amenity=social_club or amenity=social_facility or amenity=spa or amenity=stables or amenity=stripclub or amenity=studio or amenity=supermarket or amenity=swimming_pool or amenity=swingerclub or amenity=townhall or amenity=theatre or amenity=training or amenity=trolley_bay or amenity=university or amenity=vehicle_inspection or amenity=veterinary or amenity=village_hall or amenity=vivarium or amenity=waste_transfer_station or amenity=whirlpool or amenity=winery or amenity=youth_centre"
GENERIC_AMENITIES="amenity=atm or amenity=bench or amenity=bicycle_parking or amenity=bicycle_rental or amenity=bicycle_repair_station or amenity=compressed_air or amenity=drinking_water or amenity=emergency_phone or amenity=fire_hydrant or amenity=fuel or amenity=grit_bin or amenity=motorcycle_parking or amenity=parking or amenity=parking_space or amenity=post_box or amenity=reception_area or amenity=recycling or amenity=taxi or amenity=telephone or amenity=ticket_validator or amenity=toilets or amenity=vending_machine or amenity=waste_basket or amenity=waste_disposal or amenity=water_point or amenity=watering_place or amenity=wifi"
VALID_OFFICE_KEYS="office=accountant or office=administrative or office=administration or office=advertising_agency or office=architect or office=association or office=camping or office=charity or office=company or office=consulting or office=educational_institution or office=employment_agency or office=estate_agent or office=financial or office=forestry or office=foundation or office=government or office=insurance or office=it or office=lawyer or office=newspaper or office=ngo or office=notary or office=parish or office=physician or office=political_party or office=publisher or office=quango or office=real_estate_agent or office=realtor or office=register or office=religion or office=research or office=tax or office=tax_advisor or office=telecommunication or office=therapist or office=travel_agent or office=water_utility"
VALID_SHOP_KEYS="shop="
VALID_HISTORIC_KEYS="historic=archaeological_site or historic=castle or historic=fort or historic=memorial or historic=monument or historic=ruins or historic=tomb"
VALID_PLACE_KEYS="place=farm or place=isolated_dwelling or place=square"
VALID_TOURISM_KEYS="tourism=hotel or tourism=attraction or tourism=guest_house or tourism=museum or tourism=chalet or tourism=motel or tourism=hostel or tourism=alpine_hut or tourism=theme_park or tourism=zoo or tourism=apartment or tourism=wilderness_hut or tourism=gallery or tourism=bed_and_breakfast or tourism=hanami or tourism=wine_cellar or tourism=resort or tourism=aquarium or tourism=apartments or tourism=cabin or tourism=winery or tourism=hut"
VALID_LEISURE_KEYS="leisure=adult_gaming_centre or leisure=amusement_arcade or leisure=arena or leisure=bandstand or leisure=beach_resort or leisure=bbq or leisure=bird_hide or leisure=bowling_alley or leisure=casino or leisure=common or leisure=club or leisure=dance or leisure=dancing or leisure=disc_golf_course or leisure=dog_park or leisure=fishing or leisure=fitness_centre or leisure=gambling or leisure=garden or leisure=golf_course or leisure=hackerspace or leisure=horse_riding or leisure=hospital or leisure=hot_spring or leisure=ice_rink or leisure=landscape_reserve or leisure=marina or leisure=maze or leisure=miniature_golf or leisure=nature_reserve or leisure=paddling_pool or leisure=park or leisure=pitch or leisure=playground or leisure=recreation_ground or leisure=resort or leisure=sailing_club or leisure=sauna or leisure=social_club or leisure=sports_centre or leisure=stadium or leisure=summer_camp or leisure=swimming_pool or leisure=tanning_salon or leisure=track or leisure=trampoline_park or leisure=turkish_bath or leisure=video_arcade or leisure=water_park or leisure=wildlife_hide"
VALID_LANDUSE_KEYS="landuse=allotments or landuse=basin or landuse=cemetery or landuse=commercial or landuse=construction or landuse=farmland or landuse=forest or landuse=grass or landuse=greenhouse_horticulture or landuse=industrial or landuse=landfill or landuse=meadow or landuse=military or landuse=orchard or landuse=plant_nursery or landuse=port or landuse=quarry or landuse=recreation_ground or landuse=reservoir or landuse=residential or landuse=retail or landuse=village_green or landuse=vineyard"
VALID_VENUE_KEYS="( ( $VALID_AEROWAY_KEYS ) or ( $VALID_AMENITY_KEYS ) or ( $VALID_HISTORIC_KEYS ) or ( $VALID_OFFICE_KEYS ) or ( $VALID_PLACE_KEYS ) or ( $VALID_SHOP_KEYS ) or ( $VALID_TOURISM_KEYS ) or ( $VALID_LEISURE_KEYS ) or ( $VALID_LANDUSE_KEYS ) )"
# Address data set for use in parser, language detection
echo "Filtering for records with address tags: `date`"
PLANET_ADDRESSES_O5M="planet-addresses.o5m"
JAPAN_ADDRESSES_O5M="japan-addresses.o5m"
VALID_ADDRESSES="( ( ( name= or addr:housename= ) and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS ) ) ) or ( ( addr:street= or addr:place= ) and ( name= or building= or building:levels= or addr:housename= or addr:housenumber= ) )"
VALID_ADDRESSES_JAPAN="( addr:housenumber= or addr:street= ) or ( ( name= or name:ja= or addr:housename= ) and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS ) )"
osmfilter $PLANET_O5M --keep="$VALID_ADDRESSES" --drop-author --drop-version -o=$PLANET_ADDRESSES_O5M &
osmfilter $JAPAN_O5M --keep="$VALID_ADDRESSES_JAPAN" --drop-author --drop-version -o=$JAPAN_ADDRESSES_O5M &
wait
PLANET_ADDRESSES_LATLONS="planet-addresses-latlons.o5m"
JAPAN_ADDRESSES_LATLONS="japan-addresses-latlons.o5m"
osmconvert $PLANET_ADDRESSES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_ADDRESSES_LATLONS &
osmconvert $JAPAN_ADDRESSES_O5M --max-objects=1000000000 --all-to-nodes -o=$JAPAN_ADDRESSES_LATLONS &
wait
rm $PLANET_ADDRESSES_O5M
rm $JAPAN_ADDRESSES_O5M
PLANET_ADDRESSES="planet-addresses.osm"
osmfilter $PLANET_ADDRESSES_LATLONS --keep="$VALID_ADDRESSES" -o=$PLANET_ADDRESSES_O5M &
osmfilter $JAPAN_ADDRESSES_LATLONS --keep="$VALID_ADDRESSES_JAPAN" -o=$JAPAN_ADDRESSES_O5M &
wait
osmconvert $PLANET_ADDRESSES_O5M $JAPAN_ADDRESSES_O5M -o=$PLANET_ADDRESSES
rm $PLANET_ADDRESSES_O5M
rm $JAPAN_ADDRESSES_O5M
rm $PLANET_ADDRESSES_LATLONS
rm $JAPAN_ADDRESSES_LATLONS
# Border data set for use in R-tree index/reverse geocoding, parsing, language detection
echo "Filtering for borders: `date`"
PLANET_COUNTRIES="planet-countries.osm"
PLANET_BORDERS_O5M="planet-borders.o5m"
PLANET_BORDERS="planet-borders.osm"
PLANET_ADMIN_BORDERS_OSM="planet-admin-borders.osm"
VALID_COUNTRY_KEYS="ISO3166-1:alpha2="
VALID_ADMIN1_KEYS="ISO3166-2="
ADMIN1_LANGUAGE_EXCEPTION_IDS=$(grep "osm" $ADMIN1_FILE | sed 's/^.*relation:\([0-9][0-9]*\).*$/@id=\1/' | xargs echo | sed 's/\s/ or /g')
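# e.g. adm1.tsv rows mentioning relation:51477 and relation:62781
# (hypothetical ids) become the osmfilter expression "@id=51477 or @id=62781"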
VALID_ADMIN_BORDER_KEYS="boundary=administrative or boundary=town or boundary=city_limit or boundary=civil_parish or boundary=civil or boundary=ceremonial or boundary=postal_district or place=island or place=city or place=town or place=village or place=hamlet or place=municipality or place=settlement"
VALID_POPULATED_PLACE_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=locality or place=settlement or place=census-designated or place:ph=village"
VALID_NEIGHBORHOOD_KEYS="place=neighbourhood or place=neighborhood or place:ph=barangay"
VALID_EXTENDED_NEIGHBORHOOD_KEYS="place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place:ph=barangay"
VALID_LOCALITY_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place=locality or place=settlement or place=census-designated or place:ph=barangay or place:ph=village"
VALID_ADMIN_NODE_KEYS="place=city or place=town or place=village or place=hamlet or place=municipality or place=neighbourhood or place=neighborhood or place=suburb or place=quarter or place=borough or place=island or place=islet or place=county or place=region or place=state or place=subdistrict or place=township or place=archipelago or place=department or place=country or place=district or place=census-designated or place=ward or place=subward or place=province or place=peninsula or place=settlement or place=subregion"
osmfilter $PLANET_O5M --keep="$VALID_ADMIN_BORDER_KEYS" --drop-author --drop-version -o=$PLANET_ADMIN_BORDERS_OSM &
osmfilter $PLANET_O5M --keep="$VALID_ADMIN_BORDER_KEYS or $VALID_LOCALITY_KEYS" --drop-author --drop-version -o=$PLANET_BORDERS_O5M &
wait
PLANET_ADMIN_NODES="planet-admin-nodes.osm"
osmfilter $PLANET_O5M --keep="$VALID_ADMIN_NODE_KEYS" --drop-ways --drop-relations --ignore-dependencies --drop-author --drop-version -o=$PLANET_ADMIN_NODES
PLANET_BORDERS_LATLONS="planet-borders-latlons.o5m"
osmconvert $PLANET_BORDERS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_BORDERS_LATLONS
rm $PLANET_BORDERS_O5M
osmfilter $PLANET_BORDERS_LATLONS --keep="$VALID_ADMIN_BORDER_KEYS or $VALID_LOCALITY_KEYS" -o=$PLANET_BORDERS
rm $PLANET_BORDERS_LATLONS
osmfilter $PLANET_O5M --keep="$VALID_COUNTRY_KEYS or $VALID_ADMIN1_KEYS or $ADMIN1_LANGUAGE_EXCEPTION_IDS" --drop-author --drop-version -o=$PLANET_COUNTRIES
echo "Filtering for neighborhoods"
PLANET_LOCALITIES="planet-localities.osm"
PLANET_NEIGHBORHOOD_BORDERS="planet-neighborhood-borders.osm"
osmfilter $PLANET_O5M --keep="$VALID_NEIGHBORHOOD_KEYS" --drop-author --drop-version -o=$PLANET_NEIGHBORHOOD_BORDERS
osmfilter $PLANET_O5M --keep="name= and ( $VALID_LOCALITY_KEYS )" --drop-relations --drop-ways --ignore-dependencies --drop-author --drop-version -o=$PLANET_LOCALITIES
echo "Filtering for rail stations"
VALID_RAIL_STATION_KEYS="railway=station"
PLANET_RAILWAYS_O5M="planet-rail-stations.o5m"
PLANET_RAILWAYS="planet-rail-stations.osm"
osmfilter $PLANET_O5M --keep="$VALID_RAIL_STATION_KEYS" --drop-author --drop-version -o=$PLANET_RAILWAYS_O5M
PLANET_RAILWAYS_LATLONS="planet-rail-stations-latlons.o5m"
osmconvert $PLANET_RAILWAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_RAILWAYS_LATLONS
rm $PLANET_RAILWAYS_O5M
osmfilter $PLANET_RAILWAYS_LATLONS --keep="$VALID_RAIL_STATION_KEYS" -o=$PLANET_RAILWAYS
rm $PLANET_RAILWAYS_LATLONS
echo "Filtering for airports and terminals"
VALID_AIRPORT_KEYS="aeroway=aerodrome or aeroway=terminal"
PLANET_AIRPORTS_O5M="planet-airports.o5m"
PLANET_AIRPORTS="planet-airports.osm"
osmfilter $PLANET_O5M --keep="$VALID_AIRPORT_KEYS" --drop-author --drop-version -o=$PLANET_AIRPORTS_O5M
PLANET_AIRPORTS_LATLONS="planet-airports-latlons.o5m"
osmconvert $PLANET_AIRPORTS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_AIRPORTS_LATLONS
PLANET_AIRPORT_POLYGONS="planet-airport-polygons.osm"
osmconvert $PLANET_AIRPORTS_O5M -o=$PLANET_AIRPORT_POLYGONS
rm $PLANET_AIRPORTS_O5M
osmfilter $PLANET_AIRPORTS_LATLONS --keep="$VALID_AIRPORT_KEYS" -o=$PLANET_AIRPORTS
rm $PLANET_AIRPORTS_LATLONS
echo "Filtering for subdivision polygons"
PLANET_SUBDIVISIONS="planet-subdivisions.osm"
SUBDIVISION_AMENITY_TYPES="amenity=university or amenity=college or amenity=school or amenity=hospital"
SUBDIVISION_LANDUSE_TYPES="landuse=residential or landuse=commercial or landuse=industrial or landuse=retail or landuse=military"
SUBDIVISION_PLACE_TYPES="place=allotments or place=city_block or place=block or place=plot or place=subdivision"
osmfilter $PLANET_O5M --keep="( $SUBDIVISION_AMENITY_TYPES or $SUBDIVISION_PLACE_TYPES or $SUBDIVISION_LANDUSE_TYPES )" --drop="( place= and not ( $SUBDIVISION_PLACE_TYPES ) ) or boundary=" --drop-author --drop-version -o=$PLANET_SUBDIVISIONS
echo "Filtering for postal_code polygons"
PLANET_POSTAL_CODES="planet-postcodes.osm"
osmfilter $PLANET_O5M --keep="boundary=postal_code" --drop-author --drop-version -o=$PLANET_POSTAL_CODES
# Venue data set for use in venue classification
echo "Filtering for venue records: `date`"
PLANET_VENUES_O5M="planet-venues.o5m"
osmfilter $PLANET_O5M --keep="( name= and ( ( building= and building!=yes ) or $VALID_VENUE_KEYS or ( $VALID_RAIL_STATION_KEYS and addr:street= and ( wikipedia= or wikipedia:*= ) ) ) )" --drop-author --drop-version -o=$PLANET_VENUES_O5M
PLANET_VENUES_LATLONS="planet-venues-latlons.o5m"
osmconvert $PLANET_VENUES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_VENUES_LATLONS
rm $PLANET_VENUES_O5M
PLANET_VENUES="planet-venues.osm"
osmfilter $PLANET_VENUES_LATLONS --keep="name= and ( ( building= and building!=yes ) or ( $VALID_VENUE_KEYS or ( $VALID_RAIL_STATION_KEYS and addr:street= and ( wikipedia= or wikipedia:*= ) ) ) )" -o=$PLANET_VENUES
rm $PLANET_VENUES_LATLONS
# Categories for building generic queries like "restaurants in Brooklyn"
echo "Filtering for buildings: `date`"
PLANET_BUILDINGS_O5M="planet-buildings.o5m"
VALID_BUILDING_KEYS="building= or building:part="
VALID_BUILDINGS="( ( $VALID_BUILDING_KEYS ) and ( building!=yes or name= or addr:housename= or addr:street= or addr:housenumber= or addr:postcode= ) )"
osmfilter $PLANET_O5M --keep="$VALID_BUILDINGS" --drop-author --drop-version -o=$PLANET_BUILDINGS_O5M
PLANET_BUILDINGS_LATLONS="planet-buildings-latlons.o5m"
osmconvert $PLANET_BUILDINGS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_BUILDINGS_LATLONS
rm $PLANET_BUILDINGS_O5M
PLANET_BUILDINGS="planet-buildings.osm"
osmfilter $PLANET_BUILDINGS_LATLONS --keep="$VALID_BUILDINGS" -o=$PLANET_BUILDINGS
rm $PLANET_BUILDINGS_LATLONS
echo "Filtering for building polygons: `date`"
PLANET_BUILDING_POLYGONS="planet-building-polygons.osm"
osmfilter $PLANET_O5M --keep="( ( building= or building:part= or type=building ) and ( building:levels= or name= or addr:street= or addr:place= or addr:housename= or addr:housenumber= ) )" --drop-author --drop-version -o=$PLANET_BUILDING_POLYGONS
echo "Filtering for amenities: `date`"
PLANET_AMENITIES_O5M="planet-amenities.o5m"
ALL_AMENITIES="aeroway= or amenity= or emergency= or historic= or internet_access= or landuse= or leisure= or man_made= or mountain_pass= or office= or place= or railway= or shop= or tourism="
osmfilter $PLANET_O5M --keep="$ALL_AMENITIES" --drop-author --drop-version -o=$PLANET_AMENITIES_O5M
PLANET_AMENITIES_LATLONS="planet-amenities-latlons.o5m"
osmconvert $PLANET_AMENITIES_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_AMENITIES_LATLONS
rm $PLANET_AMENITIES_O5M
PLANET_AMENITIES="planet-amenities.osm"
osmfilter $PLANET_AMENITIES_LATLONS --keep="$ALL_AMENITIES" -o=$PLANET_AMENITIES
rm $PLANET_AMENITIES_LATLONS
echo "Filtering for natural: `date`"
PLANET_NATURAL_O5M="planet-natural.o5m"
VALID_NATURAL_KEYS="natural="
osmfilter $PLANET_O5M --keep="$VALID_NATURAL_KEYS" --drop-author --drop-version -o=$PLANET_NATURAL_O5M
PLANET_NATURAL_LATLONS="planet-natural-latlons.o5m"
osmconvert $PLANET_NATURAL_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_NATURAL_LATLONS
rm $PLANET_NATURAL_O5M
PLANET_NATURAL="planet-natural.osm"
osmfilter $PLANET_NATURAL_LATLONS --keep="$VALID_NATURAL_KEYS" -o=$PLANET_NATURAL
rm $PLANET_NATURAL_LATLONS
echo "Filtering for waterways: `date`"
PLANET_WATERWAYS_O5M="planet-waterways.o5m"
VALID_WATERWAY_KEYS="waterway="
osmfilter $PLANET_O5M --keep="$VALID_WATERWAY_KEYS" --drop-author --drop-version -o=$PLANET_WATERWAYS_O5M
PLANET_WATERWAYS_LATLONS="planet-waterways-latlons.o5m"
osmconvert $PLANET_WATERWAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_WATERWAYS_LATLONS
rm $PLANET_WATERWAYS_O5M
PLANET_WATERWAYS="planet-waterways.osm"
osmfilter $PLANET_WATERWAYS_LATLONS --keep="$VALID_WATERWAY_KEYS" -o=$PLANET_WATERWAYS
rm $PLANET_WATERWAYS_LATLONS
# Streets data set for use in language classification
echo "Filtering ways: `date`"
PLANET_WAYS_O5M="planet-ways.o5m"
VALID_ROAD_TYPES="( highway=motorway or highway=motorway_link or highway=motorway_junction or highway=trunk or highway=trunk_link or highway=primary or highway=primary_link or highway=secondary or highway=secondary_link or highway=tertiary or highway=tertiary_link or highway=unclassified or highway=unclassified_link or highway=residential or highway=residential_link or highway=service or highway=service_link or highway=living_street or highway=pedestrian or highway=steps or highway=cycleway or highway=bridleway or highway=track or highway=road or ( highway=path and ( motorvehicle=yes or motorcar=yes ) ) )"
osmfilter planet-latest.o5m --keep="name= and $VALID_ROAD_TYPES" --drop-relations --drop-author --drop-version -o=$PLANET_WAYS_O5M
PLANET_WAYS_NODES_LATLON="planet-ways-nodes-latlons.o5m"
osmconvert $PLANET_WAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_WAYS_NODES_LATLON
# 10^15 is the offset used for ways and relations with --all-to-nodes; filtering below extracts just the ways
PLANET_WAYS_LATLONS="planet-ways-latlons.osm"
PLANET_WAYS="planet-ways.osm"
osmfilter $PLANET_WAYS_NODES_LATLON --keep="name= and ( $VALID_ROAD_TYPES )" -o=$PLANET_WAYS
osmfilter $PLANET_WAYS_O5M --keep="name= and ( $VALID_ROAD_TYPES )" -o=$PLANET_WAYS_LATLONS
rm $PLANET_WAYS_NODES_LATLON
rm $PLANET_WAYS_O5M
rm $PLANET_O5M
rm $JAPAN_O5M
echo "Completed : `date`"
cd $PREV_DIR

File diff suppressed because it is too large


@@ -0,0 +1,189 @@
import argparse
import array
import logging
import numpy
import os
import six
import sys
import ujson as json
from bisect import bisect_left
from leveldb import LevelDB
from itertools import izip, groupby
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.file_utils import ensure_dir
from geodata.osm.extract import *
from geodata.encoding import safe_decode, safe_encode
DEFAULT_INTERSECTIONS_FILENAME = 'intersections.json'
class OSMIntersectionReader(object):
def __init__(self, filename, db_dir):
self.filename = filename
self.node_ids = array.array('l')
self.logger = logging.getLogger('osm.intersections')
# Store these in a LevelDB
ensure_dir(db_dir)
ways_dir = os.path.join(db_dir, 'ways')
ensure_dir(ways_dir)
nodes_dir = os.path.join(db_dir, 'nodes')
ensure_dir(nodes_dir)
self.way_props = LevelDB(ways_dir)
self.node_props = LevelDB(nodes_dir)
# These form a graph and should always have the same length
self.intersection_edges_nodes = array.array('l')
self.intersection_edges_ways = array.array('l')
def binary_search(self, a, x):
'''Locate the leftmost value exactly equal to x'''
i = bisect_left(a, x)
if i != len(a) and a[i] == x:
return i
return None
def intersections(self):
'''
Generator which yields tuples like:
(node_id, node_props, ways)
where node_props is the tag dict for the intersection node and ways
is a list of property dicts for the distinctly-named ways meeting there
'''
i = 0
node_ids = array.array('l')
node_counts = array.array('i')
for element_id, props, deps in parse_osm(self.filename, dependencies=True):
props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
if element_id.startswith('node'):
node_id = long(element_id.split(':')[-1])
node_ids.append(node_id)
node_counts.append(0)
self.node_props.Put(safe_encode(node_id), json.dumps(props))
elif element_id.startswith('way'):
# Don't care about the ordering of the nodes, and want uniques e.g. for circular roads
deps = set(deps)
# Get node indices by binary search
for node_id in deps:
node_index = self.binary_search(node_ids, node_id)
if node_index is None:
continue
node_counts[node_index] += 1
if i % 1000 == 0 and i > 0:
self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
i += 1
for i, count in enumerate(node_counts):
if count > 1:
self.node_ids.append(node_ids[i])
del node_ids
del node_counts
i = 0
for element_id, props, deps in parse_osm(self.filename, dependencies=True):
if element_id.startswith('node'):
node_id = long(element_id.split(':')[-1])
node_index = self.binary_search(self.node_ids, node_id)
elif element_id.startswith('way'):
props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
way_id = long(element_id.split(':')[-1])
props['id'] = way_id
for node_id in deps:
node_index = self.binary_search(self.node_ids, node_id)
if node_index is not None:
self.intersection_edges_nodes.append(node_id)
self.intersection_edges_ways.append(way_id)
self.way_props.Put(safe_encode(way_id), json.dumps(props))
if i % 1000 == 0 and i > 0:
self.logger.info('second pass, doing {}s, at {}'.format(element_id.split(':')[0], i))
i += 1
i = 0
indices = numpy.argsort(self.intersection_edges_nodes)
self.intersection_edges_nodes = numpy.fromiter((self.intersection_edges_nodes[i] for i in indices), dtype=numpy.uint64)
self.intersection_edges_ways = numpy.fromiter((self.intersection_edges_ways[i] for i in indices), dtype=numpy.uint64)
del indices
idx = 0
# Walk the sorted edge arrays in order, grouping edges by node id
for node_id, g in groupby(self.intersection_edges_nodes):
group_len = sum((1 for j in g))
node_props = json.loads(self.node_props.Get(safe_encode(node_id)))
way_indices = self.intersection_edges_ways[idx:idx + group_len]
all_ways = [json.loads(self.way_props.Get(safe_encode(w))) for w in way_indices]
way_names = set()
ways = []
for way in all_ways:
if way['name'] in way_names:
continue
ways.append(way)
way_names.add(way['name'])
idx += group_len
if i % 1000 == 0 and i > 0:
self.logger.info('checking intersections, did {}'.format(i))
i += 1
if len(ways) > 1:
node_index = self.binary_search(self.node_ids, node_id)
yield self.node_ids[node_index], node_props, ways
def create_intersections(self, outfile):
out = open(outfile, 'w')
for node_id, node_props, ways in self.intersections():
d = {'id': safe_encode(node_id),
'node': node_props,
'ways': ways}
out.write(json.dumps(d) + six.u('\n'))
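# Each line written is a JSON object shaped like (hypothetical values):
# {"id": "123456", "node": {"name": "..."},
#  "ways": [{"id": 987, "name": "Main Street"}, ...]}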
@classmethod
def read_intersections(cls, infile):
f = open(infile)
for line in f:
data = json.loads(line.rstrip())
yield data['id'], data['node'], data['ways']
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input',
required=True,
help='Path to planet-ways-latlons.osm')
parser.add_argument('--db-dir',
required=True,
help='Path to temporary db')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
required=True,
help='Output directory')
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
reader = OSMIntersectionReader(args.input, args.db_dir)
reader.create_intersections(os.path.join(args.out_dir, DEFAULT_INTERSECTIONS_FILENAME))


@@ -0,0 +1,563 @@
# -*- coding: utf-8 -*-
'''
osm_address_training_data.py
----------------------------
This script generates several training sets from OpenStreetMap addresses,
streets, venues and toponyms.
Note: the combined size of all the files created by this script exceeds 100GB,
so if training these models, it is wise to use a server-grade machine with
plenty of disk space. The following commands can be used in parallel to create
all the training sets:
Ways:
python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Venues:
python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Limited formatted addresses:
python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) -o $(OUT_DIR)
Formatted addresses (tagged):
python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Formatted addresses (untagged):
python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f -u --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Intersections (after running intersections.py to create the JSON file):
python osm_address_training_data.py -x $(OSM_DIR)/intersections.json -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Toponyms:
python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) -o $(OUT_DIR)
'''
import argparse
import csv
import logging
import os
import operator
import random
import re
import sys
import tempfile
import urllib
import ujson as json
import HTMLParser
from collections import defaultdict, OrderedDict
from lxml import etree
from itertools import ifilter, chain, combinations
from shapely.geos import LOG as shapely_geos_logger
shapely_geos_logger.setLevel(logging.CRITICAL)
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.gazetteers import *
from geodata.addresses.components import AddressComponents
from geodata.coordinates.conversion import *
from geodata.language_id.disambiguation import *
from geodata.language_id.sample import sample_random_language
from geodata.i18n.languages import *
from geodata.metro_stations.reverse_geocode import MetroStationReverseGeocoder
from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder
from geodata.osm.extract import *
from geodata.osm.formatter import OSMAddressFormatter
from geodata.places.reverse_geocode import PlaceReverseGeocoder
from geodata.polygons.language_polys import *
from geodata.polygons.reverse_geocode import *
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.csv_utils import *
from geodata.file_utils import *
# Input files
PLANET_ADDRESSES_INPUT_FILE = 'planet-addresses.osm'
PLANET_WAYS_INPUT_FILE = 'planet-ways.osm'
PLANET_VENUES_INPUT_FILE = 'planet-venues.osm'
PLANET_BORDERS_INPUT_FILE = 'planet-borders.osm'
# Output files
WAYS_LANGUAGE_DATA_FILENAME = 'streets_by_language.tsv'
ADDRESS_LANGUAGE_DATA_FILENAME = 'address_streets_by_language.tsv'
TOPONYM_LANGUAGE_DATA_FILENAME = 'toponyms_by_language.tsv'
def normalize_osm_name_tag(tag, script=False):
norm = tag.rsplit(':', 1)[-1]
if not script:
return norm
return norm.split('_', 1)[0]
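# Examples: normalize_osm_name_tag('name:en') == 'en', and with script=True a
# tag like 'name:zh_pinyin' normalizes to 'zh' (the suffix after '_' is dropped).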
def get_language_names(country_rtree, key, value, tag_prefix='name'):
if not ('lat' in value and 'lon' in value):
return None, None
has_colon = ':' in tag_prefix
tag_first_component = tag_prefix.split(':')[0]
tag_last_component = tag_prefix.split(':')[-1]
try:
latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
except Exception:
return None, None
osm_country_components = country_rtree.point_in_poly(latitude, longitude, return_all=True)
country, candidate_languages = country_rtree.country_and_languages_from_components(osm_country_components)
if not (country and candidate_languages):
return None, None
num_langs = len(candidate_languages)
default_langs = set([l for l, d in candidate_languages if d])
num_defaults = len(default_langs)
name_language = defaultdict(list)
alternate_langs = []
equivalent_alternatives = defaultdict(list)
for k, v in value.iteritems():
if k.startswith(tag_prefix + ':') and normalize_osm_name_tag(k, script=True) in languages:
lang = k.rsplit(':', 1)[-1]
alternate_langs.append((lang, v))
equivalent_alternatives[v].append(lang)
has_alternate_names = len(alternate_langs)
# Some countries like Lebanon list things like name:en == name:fr == "Rue Abdel Hamid Karame"
# Those addresses should be disambiguated rather than taken for granted
ambiguous_alternatives = set([k for k, v in equivalent_alternatives.iteritems() if len(v) > 1])
regional_defaults = 0
country_defaults = 0
regional_langs = set()
country_langs = set()
for c in osm_country_components:
_, langs = country_rtree.country_and_languages_from_components([c])
if 'ISO3166-1:alpha2' not in c:
regional_defaults += sum((1 for l, d in langs if d))
regional_langs |= set([l for l, d in langs])
else:
country_defaults += sum((1 for l, d in langs if d))
country_langs |= set([l for l, d in langs])
ambiguous_already_seen = set()
for k, v in value.iteritems():
if k.startswith(tag_prefix + ':'):
if v not in ambiguous_alternatives:
norm = normalize_osm_name_tag(k)
norm_sans_script = normalize_osm_name_tag(k, script=True)
if norm in languages or norm_sans_script in languages:
name_language[norm].append(v)
elif v not in ambiguous_already_seen:
langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
lang = disambiguate_language(v, langs)
if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
name_language[lang].append(v)
ambiguous_already_seen.add(v)
elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component:
if num_langs == 1:
name_language[candidate_languages[0][0]].append(v)
else:
lang = disambiguate_language(v, candidate_languages)
default_lang = candidate_languages[0][0]
if lang == AMBIGUOUS_LANGUAGE:
return None, None
elif lang == UNKNOWN_LANGUAGE and num_defaults == 1:
name_language[default_lang].append(v)
elif lang != UNKNOWN_LANGUAGE:
if lang != default_lang and lang in country_langs and country_defaults > 1 and regional_defaults > 0 and lang in WELL_REPRESENTED_LANGUAGES:
return None, None
name_language[lang].append(v)
else:
return None, None
return country, name_language
def build_ways_training_data(country_rtree, infile, out_dir, abbreviate_streets=True):
    '''
    Creates a training set for language classification from most OSM ways
    (streets), selected by a fairly lengthy osmfilter definition that tries
    to capture all roads/ways designated for motor vehicle traffic, which is
    more or less what we'd expect to see in addresses.

    The fields are {language, country, street name}. Example:

    ar ma شارع فال ولد عمير
    '''
i = 0
f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w')
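    # 'tsv_no_quote' is a csv dialect assumed to be registered elsewhere in
    # this module; a minimal sketch of such a registration:
    #   csv.register_dialect('tsv_no_quote', delimiter='\t', quoting=csv.QUOTE_NONE)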
writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile, allowed_types=WAYS_RELATIONS):
country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
if not name_language:
continue
for lang, val in name_language.iteritems():
for v in val:
for s in v.split(';'):
if lang in languages:
writer.writerow((lang, country, tsv_string(s)))
if not abbreviate_streets:
continue
abbrev = abbreviate(street_and_synonyms_gazetteer, s, lang)
if abbrev != s:
writer.writerow((lang, country, tsv_string(abbrev)))
if i % 1000 == 0 and i > 0:
print('did {} ways'.format(i))
i += 1
f.close()
NAME_KEYS = (
'name',
'addr:housename',
)
HOUSE_NUMBER_KEYS = (
'addr:house_number',
'addr:housenumber',
'house_number'
)
COUNTRY_KEYS = (
'country',
'country_name',
'addr:country',
'is_in:country',
'addr:country_code',
'country_code',
'is_in:country_code'
)
POSTAL_KEYS = (
'postcode',
'postal_code',
'addr:postcode',
'addr:postal_code',
)
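# Each tuple lists the OSM tagging variants treated as interchangeable for a
# single address field when extracting components.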
def build_toponym_training_data(country_rtree, infile, out_dir):
    '''
    Data set of toponyms by language and country, which should assist in
    language classification. OSM tends to use the native language by default
    (e.g. Москва instead of Moscow). Toponyms get messy due to factors like
    colonialism, historical names, name borrowing and the general shortness
    of the names, so here we're stricter about what constitutes a valid
    language for a given country.

    Example:

    ja jp 東京都
    '''
i = 0
f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile):
if not any((k.startswith('name') for k, v in value.iteritems())):
continue
try:
latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
except Exception:
continue
osm_country_components = country_rtree.point_in_poly(latitude, longitude, return_all=True)
country, candidate_languages = country_rtree.country_and_languages_from_components(osm_country_components)
if not (country and candidate_languages):
continue
name_language = defaultdict(list)
official = official_languages[country]
default_langs = set([l for l, default in official.iteritems() if default])
        _, regional_langs = country_rtree.country_and_languages_from_components(
            [c for c in osm_country_components if 'ISO3166-1:alpha2' not in c])
top_lang = None
if len(official) > 0:
top_lang = official.iterkeys().next()
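            # Assumes official_languages preserves insertion order (e.g. an
            # OrderedDict) with the most prominent language first.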
# E.g. Hindi in India, Urdu in Pakistan
if top_lang is not None and top_lang not in WELL_REPRESENTED_LANGUAGES and len(default_langs) > 1:
default_langs -= WELL_REPRESENTED_LANGUAGES
valid_languages = set([l for l, d in candidate_languages])
        '''
        WELL_REPRESENTED_LANGUAGES are languages like English, French, etc.
        for which we have a lot of data. WELL_REPRESENTED_LANGUAGE_COUNTRIES
        are more-or-less the "origin" countries for those languages, where we
        can take place names as examples of the language itself (e.g. place
        names in France are examples of French, whereas place names in much of
        Francophone Africa tend to get their names from languages other than
        French, even though French is the official language).
        '''
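        # e.g. if 'in' (India) is not in WELL_REPRESENTED_LANGUAGE_COUNTRIES['en'],
        # English is dropped as a candidate there unless it is among the
        # default languages added back just below.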
        valid_languages -= set([lang for lang in valid_languages
                                if lang in WELL_REPRESENTED_LANGUAGES and
                                country not in WELL_REPRESENTED_LANGUAGE_COUNTRIES[lang]])
valid_languages |= default_langs
if not valid_languages:
continue
have_qualified_names = False
for k, v in value.iteritems():
if not k.startswith('name:'):
continue
norm = normalize_osm_name_tag(k)
norm_sans_script = normalize_osm_name_tag(k, script=True)
if norm in languages:
lang = norm
elif norm_sans_script in languages:
lang = norm_sans_script
else:
continue
if lang in valid_languages:
have_qualified_names = True
name_language[lang].append(v)
        if (not have_qualified_names and top_lang is not None and
                len(regional_langs) <= 1 and 'name' in value and
                len(valid_languages) == 1):
            name_language[top_lang].append(value['name'])
for k, v in name_language.iteritems():
for s in v:
s = s.strip()
if not s:
continue
writer.writerow((k, country, tsv_string(s)))
if i % 1000 == 0 and i > 0:
print('did {} toponyms'.format(i))
i += 1
f.close()
def build_address_training_data(country_rtree, infile, out_dir, format=False):
    '''
    Creates a training set similar to the ways data but using addr:street
    tags instead. These may be slightly closer to what we'd see in real-world
    addresses, containing variations, some abbreviations (although this is
    discouraged in OSM), etc.

    Example record:

    eu es Errebal kalea
    '''
i = 0
f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile):
country, street_language = get_language_names(country_rtree, key, value, tag_prefix='addr:street')
if not street_language:
continue
for k, v in street_language.iteritems():
for s in v:
s = s.strip()
if not s:
continue
if k in languages:
writer.writerow((k, country, tsv_string(s)))
if i % 1000 == 0 and i > 0:
print('did {} streets'.format(i))
i += 1
f.close()
VENUE_LANGUAGE_DATA_FILENAME = 'names_by_language.tsv'
def build_venue_training_data(country_rtree, infile, out_dir):
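    '''
    Training set of venue names by language, country and venue type
    (e.g. amenity:restaurant), drawn from OSM elements carrying both a
    name and an amenity or building tag.
    '''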
i = 0
f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile):
country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
if not name_language:
continue
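        # Derive a coarse venue type from the amenity/building tags, skipping
        # bare 'yes'/'y' values which carry no type information; a distinct
        # loop variable avoids shadowing the outer `key` from parse_osm.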
        venue_type = None
        for venue_key in (u'amenity', u'building'):
            amenity = value.get(venue_key, u'').strip()
            if amenity in ('yes', 'y'):
                continue
            if amenity:
                venue_type = u':'.join([venue_key, amenity])
                break
if venue_type is None:
continue
for k, v in name_language.iteritems():
for s in v:
s = s.strip()
if k in languages:
writer.writerow((k, country, safe_encode(venue_type), tsv_string(s)))
if i % 1000 == 0 and i > 0:
            print('did {} venues'.format(i))
i += 1
f.close()
if __name__ == '__main__':
# Handle argument parsing here
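    # Example invocation (script and directory names are illustrative):
    #   python osm_training_data.py --country-rtree-dir /data/country_rtree \
    #       -s planet-ways.osm -o /data/training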
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--streets-file',
help='Path to planet-ways.osm')
parser.add_argument('--unabbreviated',
action='store_true',
default=False,
help='Use unabbreviated street names for token counts')
parser.add_argument('-a', '--address-file',
help='Path to planet-addresses.osm')
parser.add_argument('-v', '--venues-file',
help='Path to planet-venues.osm')
parser.add_argument('-b', '--borders-file',
help='Path to planet-borders.osm')
parser.add_argument('-f', '--format',
action='store_true',
default=False,
help='Save formatted addresses (slow)')
parser.add_argument('-u', '--untagged',
action='store_true',
default=False,
help='Save untagged formatted addresses (slow)')
parser.add_argument('-l', '--limited-addresses',
action='store_true',
default=False,
help='Save formatted addresses without house names or country (slow)')
parser.add_argument('-p', '--place-nodes-file',
help='Path to planet-admin-nodes.osm')
parser.add_argument('-t', '--temp-dir',
default=tempfile.gettempdir(),
help='Temp directory to use')
parser.add_argument('-x', '--intersections-file',
help='Path to planet-ways-latlons.osm')
parser.add_argument('--country-rtree-dir',
required=True,
help='Country RTree directory')
parser.add_argument('--rtree-dir',
default=None,
help='OSM reverse geocoder RTree directory')
parser.add_argument('--places-index-dir',
default=None,
help='Places index directory')
parser.add_argument('--metro-stations-index-dir',
default=None,
help='Metro stations reverse geocoder directory')
parser.add_argument('--subdivisions-rtree-dir',
default=None,
help='Subdivisions reverse geocoder RTree directory')
parser.add_argument('--buildings-rtree-dir',
default=None,
help='Buildings reverse geocoder RTree directory')
parser.add_argument('--neighborhoods-rtree-dir',
default=None,
help='Neighborhoods reverse geocoder RTree directory')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
help='Output directory')
args = parser.parse_args()
country_rtree = OSMCountryReverseGeocoder.load(args.country_rtree_dir)
osm_rtree = None
if args.rtree_dir:
osm_rtree = OSMReverseGeocoder.load(args.rtree_dir)
neighborhoods_rtree = None
if args.neighborhoods_rtree_dir:
neighborhoods_rtree = NeighborhoodReverseGeocoder.load(args.neighborhoods_rtree_dir)
places_index = None
if args.places_index_dir:
places_index = PlaceReverseGeocoder.load(args.places_index_dir)
metro_stations_index = None
if args.metro_stations_index_dir:
metro_stations_index = MetroStationReverseGeocoder.load(args.metro_stations_index_dir)
subdivisions_rtree = None
if args.subdivisions_rtree_dir:
subdivisions_rtree = OSMSubdivisionReverseGeocoder.load(args.subdivisions_rtree_dir)
buildings_rtree = None
if args.buildings_rtree_dir:
buildings_rtree = OSMBuildingReverseGeocoder.load(args.buildings_rtree_dir)
    # These build steps are independent of one another and could be
    # parallelized across processes.
if args.streets_file and not args.format:
build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
if args.borders_file:
build_toponym_training_data(country_rtree, args.borders_file, args.out_dir)
if args.venues_file:
build_venue_training_data(country_rtree, args.venues_file, args.out_dir)
if args.address_file or args.intersections_file:
if osm_rtree is None:
parser.error('--rtree-dir required for formatted addresses')
elif neighborhoods_rtree is None:
parser.error('--neighborhoods-rtree-dir required for formatted addresses')
elif places_index is None:
parser.error('--places-index-dir required for formatted addresses')
if args.address_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
if args.address_file and args.limited_addresses:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
if args.place_nodes_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged)
if args.intersections_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged)
if args.streets_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_ways_training_data(args.streets_file, args.out_dir, tag_components=not args.untagged)