[polygons/osm] Adding a unified neighborhood reverse geocoder incorporating Zetashapes, OSM and Quattroshapes. Uses the new Soft TFIDF implementation to approximately match OSM names to Quattroshapes/Zetashapes names and geohash indices for more coarse point-in-polygon tests (OSM neighborhoods are stored as points not polygons, so need to match with a geometry from the other sources)
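Background on the matching step described above: the commit imports soft_tfidf from geodata.names.similarity and uses it to approximately match OSM neighborhood names against Quattroshapes/Zetashapes polygon names. The general Soft TF-IDF scheme weights tokens by inverse document frequency and pairs each token with its closest counterpart under a secondary string similarity, so abbreviations and small spelling differences still score highly. The snippet below is an illustrative sketch only, not part of this commit: it uses difflib as a stand-in for the secondary similarity, and the function and parameter names are hypothetical.

# Illustrative sketch only -- not part of this commit.
import math
from collections import Counter
from difflib import SequenceMatcher

def soft_tfidf_similarity(tokens1, tokens2, doc_freq, num_docs, sim_threshold=0.9):
    # doc_freq: token -> number of documents (names) containing the token
    def idf(t):
        return math.log(float(num_docs) / (1.0 + doc_freq.get(t, 0.0))) + 1.0

    def normalized_weights(tokens):
        weights = {t: c * idf(t) for t, c in Counter(tokens).items()}
        length = math.sqrt(sum(w * w for w in weights.values())) or 1.0
        return {t: w / length for t, w in weights.items()}

    w1, w2 = normalized_weights(tokens1), normalized_weights(tokens2)
    total = 0.0
    for t1, weight1 in w1.items():
        # greedily pair each token with its most similar counterpart
        best_sim, best_tok = 0.0, None
        for t2 in w2:
            sim = SequenceMatcher(None, t1, t2).ratio()
            if sim > best_sim:
                best_sim, best_tok = sim, t2
        if best_tok is not None and best_sim >= sim_threshold:
            total += weight1 * w2[best_tok] * best_sim
    return total

# e.g. tokenized, lowercased names like "st johns wood" (OSM) and
# "saint john's wood" (Quattroshapes) score close to 1.0 even though
# the token strings differ slightly.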
@@ -2,6 +2,13 @@ VISIT, VISIT_EDGE, POST_VISIT = range(3)
 
 
 def strongly_connected_components(graph):
+    '''
+    Find strongly connected components in a graph using iterative
+    depth-first search.
+
+    Based on:
+    http://code.activestate.com/recipes/578507-strongly-connected-components-of-a-directed-graph/
+    '''
     identified = set()
     stack = []
     index = {}
@@ -70,7 +70,7 @@ class NameDeduper(object):
         tokens2 = cls.content_tokens(s2)
 
         if not cls.possible_match(tokens1, tokens2):
-            return max(cls.dupe_threshold - 0.1, 0.0)
+            return 0.0
 
         tokens1_only = [t for t, c in tokens1]
         tokens2_only = [t for t, c in tokens2]
@@ -87,7 +87,7 @@ class NameDeduper(object):
         tokens2 = cls.content_tokens(s2)
 
         if not cls.possible_match(tokens1, tokens2):
-            return max(cls.dupe_threshold - 0.1, 0.0)
+            return 0.0
 
         tokens1_only = [t for t, c in tokens1]
         tokens2_only = [t for t, c in tokens2]
@@ -12,18 +12,33 @@ Usage:
 '''
 import argparse
 import logging
+import operator
 import os
+import re
+import requests
+import shutil
+import subprocess
 import sys
+import tempfile
 
 from functools import partial
 
 this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
+from geodata.coordinates.conversion import latlon_to_decimal
 from geodata.encoding import safe_decode
+from geodata.file_utils import ensure_dir
+from geodata.i18n.unicode_properties import get_chars_by_script
+from geodata.i18n.word_breaks import ideographic_scripts
+from geodata.names.similarity import soft_tfidf
+from geodata.osm.extract import parse_osm, OSM_NAME_TAGS
 from geodata.osm.osm_admin_boundaries import OSMAdminPolygonReader
 from geodata.polygons.index import *
+from geodata.statistics.tf_idf import IDFIndex
 
+from postal.text.tokenize import tokenize, token_types
+from postal.text.normalize import *
 
 decode_latin1 = partial(safe_decode, encoding='latin1')
 
@@ -35,7 +50,345 @@ def str_id(v):
     return str(v)
 
 
-class QuattroshapesReverseGeocoder(RTreePolygonIndex):
+class NeighborhoodDeduper(NameDeduper):
+    # Lossless conversions only
+    replacements = {
+        u'saint': u'st',
+        u'and': u'&',
+    }
+
+    discriminative_words = set([
+        # Han numbers
+        u'〇', u'一',
+        u'二', u'三',
+        u'四', u'五',
+        u'六', u'七',
+        u'八', u'九',
+        u'十', u'百',
+        u'千', u'万',
+        u'億', u'兆',
+        u'京', u'第',
+
+        # Roman numerals
+        u'i', u'ii',
+        u'iii', u'iv',
+        u'v', u'vi',
+        u'vii', u'viii',
+        u'ix', u'x',
+        u'xi', u'xii',
+        u'xiii', u'xiv',
+        u'xv', u'xvi',
+        u'xvii', u'xviii',
+        u'xix', u'xx',
+
+        # English directionals
+        u'north', u'south',
+        u'east', u'west',
+        u'northeast', u'northwest',
+        u'southeast', u'southwest',
+
+        # Spanish, Portuguese and Italian directionals
+        u'norte', u'nord', u'sur', u'sul', u'sud',
+        u'est', u'este', u'leste', u'oeste', u'ovest',
+
+        # New in various languages
+        u'new',
+        u'nova',
+        u'novo',
+        u'nuevo',
+        u'nueva',
+        u'nuovo',
+        u'nuova',
+
+        # Qualifiers
+        u'heights',
+        u'hills',
+
+        u'upper', u'lower',
+        u'little', u'great',
+
+        u'park',
+        u'parque',
+
+        u'village',
+
+    ])
+
+    stopwords = set([
+        u'cp',
+        u'de',
+        u'la',
+        u'urbanizacion',
+        u'do',
+        u'da',
+        u'dos',
+        u'del',
+        u'community',
+        u'bairro',
+        u'barrio',
+        u'le',
+        u'el',
+        u'mah',
+        u'раион',
+        u'vila',
+        u'villa',
+        u'kampung',
+        u'ahupua`a',
+
+    ])
+
+
+class NeighborhoodReverseGeocoder(RTreePolygonIndex):
+    '''
+    Neighborhoods are very important in cities like NYC, SF, Chicago, London
+    and many others. We want the address parser to be trained with addresses
+    that sufficiently capture variations in address patterns, including
+    neighborhoods. Quattroshapes neighborhood data (in the US at least)
+    is not great in terms of names, mostly because GeoPlanet has so many
+    incorrect names. The neighborhoods project, also known as Zetashapes,
+    has very accurate polygons with correct names, but only for a handful
+    of cities. OSM usually lists neighborhoods and some other local admin
+    areas like boroughs as points rather than polygons.
+
+    This index merges all of the above data sets in prioritized order
+    (Zetashapes > OSM > Quattroshapes) to provide unified point-in-polygon
+    tests for neighborhoods. The properties vary by source but each
+    source has at least a "name" key which in practice is what we care about.
+    '''
+    NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods'
+
+    SCRATCH_DIR = '/tmp'
+
+    DUPE_THRESHOLD = 0.9
+
+    source_priorities = {
+        'zetashapes': 0,     # Best names/polygons
+        'osm': 1,            # OSM names with Quattroshapes/Zetashapes polygon
+        'quattroshapes': 2,  # Good results in some countries/areas
+    }
+
+    level_priorities = {
+        'neighborhood': 0,
+        'local_admin': 1,
+    }
+
+    @classmethod
+    def clone_repo(cls, path):
+        subprocess.check_call(['rm', '-rf', path])
+        subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
+
+    @classmethod
+    def create_zetashapes_neighborhoods_index(cls):
+        scratch_dir = cls.SCRATCH_DIR
+        repo_path = os.path.join(scratch_dir, 'neighborhoods')
+        cls.clone_repo(repo_path)
+
+        neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
+        ensure_dir(neighborhoods_dir)
+
+        index = GeohashPolygonIndex()
+
+        have_geonames = set()
+        is_neighborhood = set()
+
+        for filename in os.listdir(repo_path):
+            path = os.path.join(repo_path, filename)
+            base_name = filename.split('.')[0].split('gn-')[-1]
+            if filename.endswith('.geojson') and filename.startswith('gn-'):
+                have_geonames.add(base_name)
+            elif filename.endswith('metadata.json'):
+                data = json.load(open(os.path.join(repo_path, filename)))
+                if data.get('neighborhoodNoun', [None])[0] in (None, 'rione'):
+                    is_neighborhood.add(base_name)
+
+        for filename in os.listdir(repo_path):
+            if not filename.endswith('.geojson'):
+                continue
+            base_name = filename.rsplit('.geojson')[0]
+            if base_name in have_geonames:
+                f = open(os.path.join(repo_path, 'gn-{}'.format(filename)))
+            elif base_name in is_neighborhood:
+                f = open(os.path.join(repo_path, filename))
+            else:
+                continue
+            index.add_geojson_like_file(json.load(f)['features'])
+
+        return index
+
+    @classmethod
+    def count_words(cls, s):
+        doc = defaultdict(int)
+        for t, c in NeighborhoodDeduper.content_tokens(s):
+            doc[t] += 1
+        return doc
+
+    @classmethod
+    def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
+        '''
+        Given an OSM file (planet or some other bounds) containing neighborhoods
+        as points (some suburbs have boundaries)
+
+        and their dependencies, create an R-tree index for coarse-grained
+        reverse geocoding.
+
+        Note: the input file is expected to have been created using
+        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
+        admin borders commands if using other geometries.
+        '''
+        index = cls(save_dir=output_dir)
+
+        ensure_dir(scratch_dir)
+
+        logger = logging.getLogger('neighborhoods')
+
+        qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
+        ensure_dir(qs_scratch_dir)
+        logger.info('Creating Quattroshapes neighborhoods')
+
+        qs = QuattroshapesReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
+        logger.info('Creating Zetashapes neighborhoods')
+        zs = cls.create_zetashapes_neighborhoods_index()
+
+        logger.info('Creating IDF index')
+        idf = IDFIndex()
+
+        char_scripts = get_chars_by_script()
+
+        for idx in (zs, qs):
+            for i, (props, poly) in enumerate(idx.polygons):
+                name = props.get('name')
+                if name is not None:
+                    doc = cls.count_words(name)
+                    idf.update(doc)
+
+        for key, attrs, deps in parse_osm(filename):
+            for k, v in attrs.iteritems():
+                if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
+                    doc = cls.count_words(v)
+                    idf.update(doc)
+
+        qs.matched = [False] * qs.i
+        zs.matched = [False] * zs.i
+
+        logger.info('Matching OSM points to neighborhood polygons')
+        # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
+        num_polys = 0
+        for node_id, attrs, deps in parse_osm(filename):
+            try:
+                lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
+            except ValueError:
+                continue
+
+            osm_name = attrs.get('name')
+            if not osm_name:
+                continue
+
+            is_neighborhood = attrs.get('place') == 'neighbourhood'
+
+            ranks = []
+            osm_names = []
+
+            for key in OSM_NAME_TAGS:
+                name = attrs.get(key)
+                if name:
+                    osm_names.append(name)
+
+            for name_key in OSM_NAME_TAGS:
+                osm_names.extend([v for k, v in attrs.iteritems() if k.startswith('{}:'.format(name_key))])
+
+            for idx in (zs, qs):
+                candidates = idx.get_candidate_polygons(lat, lon, all_levels=True)
+
+                if candidates:
+                    max_sim = 0.0
+                    arg_max = None
+
+                    for osm_name in osm_names:
+
+                        contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
+                                                   for c in safe_decode(osm_name)))
+
+                        for i in candidates:
+                            props, poly = idx.polygons[i]
+                            name = props.get('name')
+                            if not name:
+                                continue
+
+                            level = props.get(QuattroshapesReverseGeocoder.LEVEL)
+                            if is_neighborhood and level != 'neighborhood':
+                                continue
+
+                            if not contains_ideographs:
+                                sim = NeighborhoodDeduper.compare(osm_name, name, idf)
+                            else:
+                                # Many Han/Hangul characters are common, shouldn't use IDF
+                                sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
+
+                            if sim > max_sim:
+                                max_sim = sim
+                                arg_max = (max_sim, props, poly.context, idx, i)
+
+                    if arg_max:
+                        ranks.append(arg_max)
+
+            ranks.sort(key=operator.itemgetter(0), reverse=True)
+            if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
+                score, props, poly, idx, i = ranks[0]
+                matches.append((score, [safe_decode(attrs[k]) for k in OSM_NAME_TAGS if k in attrs], safe_decode(props['name'])))
+
+                if idx is zs:
+                    attrs['polygon_type'] = 'neighborhood'
+                else:
+                    level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
+                    if level == 'neighborhood':
+                        attrs['polygon_type'] = 'neighborhood'
+                    else:
+                        attrs['polygon_type'] = 'local_admin'
+
+                attrs['source'] = 'osm'
+                index.index_polygon(poly)
+                index.add_polygon(poly, attrs)
+                idx.matched[i] = True
+            else:
+                if ranks:
+                    score, props, poly, idx, i = ranks[0]
+                    top_matches.append((ranks[0][0], [safe_decode(attrs[k]) for k in OSM_NAME_TAGS if k in attrs], safe_decode(props['name'])))
+
+                non_match.append((node_id, attrs))
+
+            num_polys += 1
+            if num_polys % 1000 == 0 and num_polys > 0:
+                logger.info('did {} neighborhoods'.format(num_polys))
+
+        for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
+            for i, (props, poly) in enumerate(idx.polygons):
+                if idx.matched[i]:
+                    continue
+                props['source'] = source
+                if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
+                    props['polygon_type'] = 'neighborhood'
+                else:
+                    props['polygon_type'] = 'local_admin'
+                index.index_polygon(poly.context)
+                index.add_polygon(poly.context, props)
+
+        return index
+
+    def priority(self, i):
+        props, p = self.polygons[i]
+        return (self.level_priorities[props['polygon_type']], self.source_priorities[props['source']])
+
+    def get_candidate_polygons(self, lat, lon):
+        candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon)
+        return sorted(candidates, key=self.priority)
+
+
+class QuattroshapesReverseGeocoder(GeohashPolygonIndex):
+    '''
+    Quattroshapes polygons, for levels up to localities, are relatively
+    accurate and provide concordance with GeoPlanet and in some cases
+    GeoNames (which is used in other parts of this project).
+    '''
     COUNTRIES_FILENAME = 'qs_adm0.shp'
     ADMIN1_FILENAME = 'qs_adm1.shp'
     ADMIN1_REGION_FILENAME = 'qs_adm1_region.shp'
@@ -134,18 +487,23 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex):
     def create_from_shapefiles(cls,
                                input_files,
                                output_dir,
-                               index_filename=DEFAULT_INDEX_FILENAME,
-                               polys_filename=DEFAULT_POLYS_FILENAME):
+                               index_filename=None,
+                               polys_filename=DEFAULT_POLYS_FILENAME,
+                               use_all_props=False):
 
         index = cls(save_dir=output_dir, index_filename=index_filename)
 
-        i = 0
 
         for input_file in input_files:
             f = fiona.open(input_file)
 
             filename = os.path.split(input_file)[-1]
-            include_props = cls.polygon_properties.get(filename)
+            aliases = cls.polygon_properties.get(filename)
 
+            if not use_all_props:
+                include_props = aliases
+            else:
+                include_props = None
+
             for rec in f:
                 if not rec or not rec.get('geometry') or 'type' not in rec['geometry']:
@@ -156,9 +514,8 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex):
                 if filename == cls.NEIGHBORHOODS_FILENAME:
                     properties['qs_level'] = 'neighborhood'
 
-                if include_props:
                 have_all_props = False
-                for k, (prop, func) in include_props.iteritems():
+                for k, (prop, func) in aliases.iteritems():
                     v = properties.get(prop, None)
                     if v is not None:
                         try:
@@ -173,7 +530,7 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex):
                 poly_type = rec['geometry']['type']
                 if poly_type == 'Polygon':
                     poly = Polygon(rec['geometry']['coordinates'][0])
-                    index.index_polygon(i, poly)
+                    index.index_polygon(poly)
                     poly = index.simplify_polygon(poly)
                     index.add_polygon(poly, dict(rec['properties']), include_only_properties=include_props)
                 elif poly_type == 'MultiPolygon':
@@ -181,20 +538,19 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex):
                     for coords in rec['geometry']['coordinates']:
                         poly = Polygon(coords[0])
                         polys.append(poly)
-                        index.index_polygon(i, poly)
+                        index.index_polygon(poly)
 
                     multi_poly = index.simplify_polygon(MultiPolygon(polys))
                     index.add_polygon(multi_poly, dict(rec['properties']), include_only_properties=include_props)
                 else:
                     continue
 
-                i += 1
         return index
 
     @classmethod
     def create_with_quattroshapes(cls, quattroshapes_dir,
                                   output_dir,
-                                  index_filename=DEFAULT_INDEX_FILENAME,
+                                  index_filename=None,
                                   polys_filename=DEFAULT_POLYS_FILENAME):
 
         admin0_filename = os.path.join(quattroshapes_dir, cls.COUNTRIES_FILENAME)
@@ -212,16 +568,41 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex):
                                           output_dir, index_filename=index_filename,
                                           polys_filename=polys_filename)
 
+    @classmethod
+    def create_neighborhoods_index(cls, quattroshapes_dir,
+                                   output_dir,
+                                   index_filename=None,
+                                   polys_filename=DEFAULT_POLYS_FILENAME):
+        local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME)
+        neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME)
+        return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename],
+                                          output_dir, index_filename=index_filename,
+                                          polys_filename=polys_filename)
+
     def sort_level(self, i):
         props, p = self.polygons[i]
         return self.sort_levels.get(props[self.LEVEL], 0)
 
-    def get_candidate_polygons(self, lat, lon):
-        candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys()
+    def get_candidate_polygons(self, lat, lon, all_levels=False):
+        candidates = super(QuattroshapesReverseGeocoder, self).get_candidate_polygons(lat, lon, all_levels=all_levels)
         return sorted(candidates, key=self.sort_level, reverse=True)
 
 
 class OSMReverseGeocoder(RTreePolygonIndex):
+    '''
+    OSM has among the best, most carefully-crafted, accurate administrative
+    polygons in the business in addition to using local language naming
+    conventions which is desirable for creating a truly multilingual address
+    parser.
+
+    The storage of these polygons is byzantine. See geodata.osm.osm_admin_boundaries
+    for more details.
+
+    Suffice to say, this reverse geocoder builds an R-tree index on OSM planet
+    in a reasonable amount of memory using arrays of C integers and binary search
+    for the dependency lookups and Tarjan's algorithm for finding strongly connected
+    components to stitch together the polygons.
+    '''
     include_property_patterns = set([
         'name',
         'name:*',
@@ -237,27 +618,6 @@ class OSMReverseGeocoder(RTreePolygonIndex):
         'wikipedia:*',
     ])
 
-    @classmethod
-    def fix_polygon(cls, poly):
-        '''
-        Coerce to valid polygon
-        '''
-        if not poly.is_valid:
-            poly = poly.buffer(0)
-            if not poly.is_valid:
-                return None
-        return poly
-
-    @classmethod
-    def to_polygon(cls, coords):
-        '''
-        Create shapely polygon from list of coordinate tuples if valid
-        '''
-        if not coords or len(coords) < 3:
-            return None
-        poly = Polygon(coords)
-        return cls.fix_polygon(poly)
-
     @classmethod
     def create_from_osm_file(cls, filename, output_dir,
                              index_filename=DEFAULT_INDEX_FILENAME,
@@ -269,7 +629,7 @@ class OSMReverseGeocoder(RTreePolygonIndex):
 
         Note: the input file is expected to have been created using
         osmfilter. Use fetch_osm_address_data.sh for planet or copy the
-        admin borders commands if using other geometries.
+        admin borders commands if using other bounds.
         '''
         index = cls(save_dir=output_dir, index_filename=index_filename)
 
@@ -282,8 +642,6 @@ class OSMReverseGeocoder(RTreePolygonIndex):
 
         logger = logging.getLogger('osm.reverse_geocode')
 
-        polygon_index = 0
-
         for relation_id, props, outer_polys, inner_polys in polygons:
             props = {k: v for k, v in props.iteritems() if k in cls.include_property_patterns
                      or (':' in k and '{}:*'.format(k.split(':', 1)[0]) in cls.include_property_patterns)}
@@ -298,10 +656,10 @@ class OSMReverseGeocoder(RTreePolygonIndex):
                 if poly is None or not poly.bounds or len(poly.bounds) != 4:
                     continue
                 if poly.type != 'MultiPolygon':
-                    index.index_polygon(polygon_index, poly)
+                    index.index_polygon(poly)
                 else:
                     for p in poly:
-                        index.index_polygon(polygon_index, p)
+                        index.index_polygon(p)
             else:
                 multi = []
                 inner = []
@@ -331,11 +689,11 @@ class OSMReverseGeocoder(RTreePolygonIndex):
                         continue
                     # R-tree only stores the bounding box, so add the whole polygon
                     if poly.type != 'MultiPolygon':
-                        index.index_polygon(polygon_index, poly)
+                        index.index_polygon(poly)
                         multi.append(poly)
                     else:
                         for p in poly:
-                            index.index_polygon(polygon_index, p)
+                            index.index_polygon(p)
                         multi.extend(poly)
 
                 if len(multi) > 1:
@@ -351,6 +709,7 @@ class OSMReverseGeocoder(RTreePolygonIndex):
 
         return index
 
+
 if __name__ == '__main__':
     # Handle argument parsing here
     parser = argparse.ArgumentParser()
@@ -361,15 +720,25 @@ if __name__ == '__main__':
     parser.add_argument('-a', '--osm-admin-file',
                         help='Path to OSM borders file (with dependencies, .osm format)')
 
+    parser.add_argument('-n', '--osm-neighborhoods-file',
+                        help='Path to OSM neighborhoods file (no dependencies, .osm format)')
+
     parser.add_argument('-o', '--out-dir',
                         default=os.getcwd(),
                         help='Output directory')
 
     args = parser.parse_args()
-    if args.quattroshapes_dir:
+    if args.osm_admin_file:
+        index = OSMReverseGeocoder.create_from_osm_file(args.osm_admin_file, args.out_dir,
+                                                        quattroshapes_dir=args.quattroshapes_dir)
+    elif args.osm_neighborhoods_file and args.quattroshapes_dir:
+        index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
+            args.osm_neighborhoods_file,
+            args.quattroshapes_dir,
+            args.out_dir
+        )
+    elif args.quattroshapes_dir:
         index = QuattroshapesReverseGeocoder.create_with_quattroshapes(args.quattroshapes_dir, args.out_dir)
-    elif args.osm_admin_file:
-        index = OSMReverseGeocoder.create_from_osm_file(args.osm_admin_file, args.out_dir)
     else:
         parser.error('Must specify quattroshapes dir or osm admin borders file')
 
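For the "geohash indices for more coarse point-in-polygon tests" mentioned in the commit message (OSM neighborhoods are stored as points, so each point is first narrowed down to a handful of candidate polygons before exact containment and name matching), a geohash-keyed lookup table is one way to do the coarse step. The sketch below is illustrative only and not part of this commit: it assumes the python-geohash and shapely packages, the class and method names are hypothetical, and a production index would cover each polygon's bounding box with geohash cells exhaustively rather than only its corners and center.

# Illustrative sketch only -- not part of this commit.
import geohash
from collections import defaultdict
from shapely.geometry import Point
from shapely.prepared import prep

class GeohashCandidateIndex(object):
    def __init__(self, precision=6):
        self.precision = precision
        self.cells = defaultdict(list)   # geohash cell -> list of polygon ids
        self.polygons = []               # polygon id -> (properties, prepared polygon)

    def add_polygon(self, poly, properties):
        poly_id = len(self.polygons)
        self.polygons.append((properties, prep(poly)))
        lon_min, lat_min, lon_max, lat_max = poly.bounds
        # Register the polygon under the cells of its bounding-box corners and
        # center; a real index would cover the bounding box exhaustively.
        for lat, lon in ((lat_min, lon_min), (lat_min, lon_max),
                         (lat_max, lon_min), (lat_max, lon_max),
                         ((lat_min + lat_max) / 2.0, (lon_min + lon_max) / 2.0)):
            self.cells[geohash.encode(lat, lon, self.precision)].append(poly_id)

    def get_candidate_polygons(self, lat, lon):
        # Coarse test: only polygons registered in the query point's cell or
        # one of its eight neighbors are considered.
        cell = geohash.encode(lat, lon, self.precision)
        candidate_ids = set()
        for gh in [cell] + geohash.neighbors(cell):
            candidate_ids.update(self.cells.get(gh, []))
        return candidate_ids

    def point_in_polygons(self, lat, lon):
        # Exact test over the coarse candidates only.
        point = Point(lon, lat)
        for poly_id in self.get_candidate_polygons(lat, lon):
            properties, prepared = self.polygons[poly_id]
            if prepared.contains(point):
                yield properties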