Files
libpostal/scripts/geodata/polygons/reverse_geocode.py
2016-07-21 17:04:57 -04:00

488 lines
18 KiB
Python

'''
reverse_geocode.py
-------------------
In-memory reverse geocoder using polygons from Quattroshapes or OSM.
This should be useful for filling in the blanks both in constructing
training data from OSM addresses and for OpenVenues.
Usage:
python reverse_geocode.py -o /data/quattroshapes/rtree/reverse -q /data/quattroshapes/
python reverse_geocode.py -o /data/quattroshapes/rtree/reverse -a /data/osm/planet-admin-borders.osm
'''
import argparse
import logging
import operator
import os
import re
import requests
import shutil
import six
import subprocess
import sys
import tempfile
from functools import partial
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.encoding import safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.i18n.unicode_properties import get_chars_by_script
from geodata.i18n.word_breaks import ideographic_scripts
from geodata.names.deduping import NameDeduper
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.osm.admin_boundaries import OSMAdminPolygonReader, OSMSubdivisionPolygonReader, OSMBuildingPolygonReader
from geodata.polygons.index import *
from geodata.statistics.tf_idf import IDFIndex
from geodata.text.tokenize import tokenize, token_types
from geodata.text.normalize import *
from shapely.topology import TopologicalError
decode_latin1 = partial(safe_decode, encoding='latin1')
def str_id(v):
v = int(v)
if v <= 0:
return None
return str(v)
class QuattroshapesReverseGeocoder(RTreePolygonIndex):
'''
Quattroshapes polygons, for levels up to localities, are relatively
accurate and provide concordance with GeoPlanet and in some cases
GeoNames (which is used in other parts of this project).
'''
COUNTRIES_FILENAME = 'qs_adm0.shp'
ADMIN1_FILENAME = 'qs_adm1.shp'
ADMIN1_REGION_FILENAME = 'qs_adm1_region.shp'
ADMIN2_FILENAME = 'qs_adm2.shp'
ADMIN2_REGION_FILENAME = 'qs_adm2_region.shp'
LOCAL_ADMIN_FILENAME = 'qs_localadmin.shp'
LOCALITIES_FILENAME = 'qs_localities.shp'
NEIGHBORHOODS_FILENAME = 'qs_neighborhoods.shp'
COUNTRY = 'adm0'
ADMIN1 = 'adm1'
ADMIN1_REGION = 'adm1_region'
ADMIN2 = 'adm2'
ADMIN2_REGION = 'adm2_region'
LOCAL_ADMIN = 'localadmin'
LOCALITY = 'locality'
NEIGHBORHOOD = 'neighborhood'
PRIORITIES_FILENAME = 'priorities.json'
persistent_polygons = True
cache_size = 100000
sorted_levels = (COUNTRY,
ADMIN1_REGION,
ADMIN1,
ADMIN2_REGION,
ADMIN2,
LOCAL_ADMIN,
LOCALITY,
NEIGHBORHOOD,
)
sort_levels = {k: i for i, k in enumerate(sorted_levels)}
NAME = 'name'
CODE = 'code'
LEVEL = 'level'
GEONAMES_ID = 'geonames_id'
WOE_ID = 'woe_id'
polygon_properties = {
COUNTRIES_FILENAME: {
NAME: ('qs_a0', safe_decode),
CODE: ('qs_iso_cc', safe_decode),
LEVEL: ('qs_level', safe_decode),
GEONAMES_ID: ('qs_gn_id', str_id),
WOE_ID: ('qs_woe_id', str_id),
},
ADMIN1_FILENAME: {
NAME: ('qs_a1', safe_decode),
CODE: ('qs_a1_lc', safe_decode),
LEVEL: ('qs_level', safe_decode),
GEONAMES_ID: ('qs_gn_id', str_id),
WOE_ID: ('qs_woe_id', str_id),
},
ADMIN1_REGION_FILENAME: {
NAME: ('qs_a1r', safe_decode),
CODE: ('qs_a1r_lc', safe_decode),
LEVEL: ('qs_level', safe_decode),
GEONAMES_ID: ('qs_gn_id', str_id),
WOE_ID: ('qs_woe_id', str_id),
},
ADMIN2_FILENAME: {
NAME: ('qs_a2', decode_latin1),
CODE: ('qs_a2_lc', safe_decode),
LEVEL: ('qs_level', safe_decode),
GEONAMES_ID: ('qs_gn_id', str_id),
WOE_ID: ('qs_woe_id', str_id),
},
ADMIN2_REGION_FILENAME: {
NAME: ('qs_a2r', safe_decode),
CODE: ('qs_a2r_lc', safe_decode),
LEVEL: ('qs_level', safe_decode),
GEONAMES_ID: ('qs_gn_id', str_id),
WOE_ID: ('qs_woe_id', str_id),
},
LOCAL_ADMIN_FILENAME: {
NAME: ('qs_la', safe_decode),
CODE: ('qs_la_lc', safe_decode),
LEVEL: ('qs_level', safe_decode),
GEONAMES_ID: ('qs_gn_id', str_id),
WOE_ID: ('qs_woe_id', str_id),
},
LOCALITIES_FILENAME: {
NAME: ('qs_loc', safe_decode),
LEVEL: ('qs_level', safe_decode),
GEONAMES_ID: ('qs_gn_id', str),
WOE_ID: ('qs_woe_id', str),
},
NEIGHBORHOODS_FILENAME: {
NAME: ('name', safe_decode),
CODE: ('name_en', safe_decode),
LEVEL: ('qs_level', safe_decode),
GEONAMES_ID: ('gn_id', str_id),
WOE_ID: ('woe_id', str_id),
}
}
@classmethod
def create_from_shapefiles(cls,
input_files,
output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME,
use_all_props=False):
index = cls(save_dir=output_dir, index_filename=index_filename)
for input_file in input_files:
f = fiona.open(input_file)
filename = os.path.split(input_file)[-1]
aliases = cls.polygon_properties.get(filename)
if not use_all_props:
include_props = aliases
else:
include_props = None
for rec in f:
if not rec or not rec.get('geometry') or 'type' not in rec['geometry']:
continue
properties = rec['properties']
if filename == cls.NEIGHBORHOODS_FILENAME:
properties['qs_level'] = 'neighborhood'
have_all_props = False
for k, (prop, func) in aliases.iteritems():
v = properties.get(prop, None)
if v is not None:
try:
properties[k] = func(v)
except Exception:
break
else:
have_all_props = True
if not have_all_props or not properties.get(cls.NAME):
continue
poly_type = rec['geometry']['type']
if poly_type == 'Polygon':
poly = cls.to_polygon(rec['geometry']['coordinates'][0])
index.index_polygon(poly)
poly = index.simplify_polygon(poly)
index.add_polygon(poly, dict(rec['properties']), include_only_properties=include_props)
elif poly_type == 'MultiPolygon':
polys = []
for coords in rec['geometry']['coordinates']:
poly = cls.to_polygon(coords[0])
polys.append(poly)
index.index_polygon(poly)
multi_poly = index.simplify_polygon(MultiPolygon(polys))
index.add_polygon(multi_poly, dict(rec['properties']), include_only_properties=include_props)
else:
continue
return index
@classmethod
def create_with_quattroshapes(cls, quattroshapes_dir,
output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME):
admin0_filename = os.path.join(quattroshapes_dir, cls.COUNTRIES_FILENAME)
admin1_filename = os.path.join(quattroshapes_dir, cls.ADMIN1_FILENAME)
admin1r_filename = os.path.join(quattroshapes_dir, cls.ADMIN1_REGION_FILENAME)
admin2_filename = os.path.join(quattroshapes_dir, cls.ADMIN2_FILENAME)
admin2r_filename = os.path.join(quattroshapes_dir, cls.ADMIN2_REGION_FILENAME)
localities_filename = os.path.join(quattroshapes_dir, cls.LOCALITIES_FILENAME)
return cls.create_from_shapefiles([admin0_filename, admin1_filename, admin1r_filename,
admin2_filename, admin2r_filename,
localities_filename],
output_dir, index_filename=index_filename,
polys_filename=polys_filename)
def setup(self):
self.priorities = []
def index_polygon_properties(self, properties):
self.priorities.append(self.sort_levels.get(properties[self.LEVEL], 0))
def load_polygon_properties(self, d):
self.priorities = json.load(open(os.path.join(d, self.PRIORITIES_FILENAME)))
def save_polygon_properties(self, d):
json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w'))
def sort_level(self, i):
return self.priorities[i]
def get_candidate_polygons(self, lat, lon):
candidates = super(QuattroshapesReverseGeocoder, self).get_candidate_polygons(lat, lon)
return sorted(candidates, key=self.sort_level, reverse=True)
class OSMReverseGeocoder(RTreePolygonIndex):
'''
OSM has among the best, most carefully-crafted, accurate administrative
polygons in the business in addition to using local language naming
conventions which is desirable for creating a truly multilingual address
parser.
The storage of these polygons is byzantine. See geodata.osm.osm_admin_boundaries
for more details.
Suffice to say, this reverse geocoder builds an R-tree index on OSM planet
in a reasonable amount of memory using arrays of C integers and binary search
for the dependency lookups and Tarjan's algorithm for finding strongly connected
components to stitch together the polygons.
'''
ADMIN_LEVEL = 'admin_level'
ADMIN_LEVELS_FILENAME = 'admin_levels.json'
polygon_reader = OSMAdminPolygonReader
persistent_polygons = True
# Cache everything
cache_size = 1000000
simplify_polygons = True
include_property_patterns = set([
'name',
'name:*',
'ISO3166-1:alpha2',
'ISO3166-1:alpha3',
'int_name',
'official_name',
'official_name:*',
'alt_name',
'alt_name:*',
'short_name',
'short_name:*',
'admin_level',
'place',
'wikipedia',
'wikipedia:*',
])
@classmethod
def create_from_osm_file(cls, filename, output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME):
'''
Given an OSM file (planet or some other bounds) containing relations
and their dependencies, create an R-tree index for coarse-grained
reverse geocoding.
Note: the input file is expected to have been created using
osmfilter. Use fetch_osm_address_data.sh for planet or copy the
admin borders commands if using other bounds.
'''
index = cls(save_dir=output_dir, index_filename=index_filename)
reader = cls.polygon_reader(filename)
polygons = reader.polygons()
logger = logging.getLogger('osm.reverse_geocode')
for element_id, props, admin_center, outer_polys, inner_polys in polygons:
props = {k: v for k, v in six.iteritems(props)
if k in cls.include_property_patterns or (six.u(':') in k and
six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}
id_type, element_id = osm_type_and_id(element_id)
props['type'] = id_type
props['id'] = element_id
if admin_center:
props['admin_center'] = {k: v for k, v in six.iteritems(admin_center)
if k in ('id', 'type') or k in cls.include_property_patterns or (six.u(':') in k and
six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}
if inner_polys and not outer_polys:
logger.warn('inner polygons with no outer')
continue
if len(outer_polys) == 1 and not inner_polys:
poly = cls.to_polygon(outer_polys[0])
if poly is None or not poly.bounds or len(poly.bounds) != 4:
continue
if poly.type != 'MultiPolygon':
index.index_polygon(poly)
else:
for p in poly:
index.index_polygon(p)
else:
multi = []
inner = []
# Validate inner polygons (holes)
for p in inner_polys:
poly = cls.to_polygon(p)
if poly is None or not poly.bounds or len(poly.bounds) != 4:
continue
if not poly.is_valid:
poly = cls.fix_polygon(poly)
if poly is None or not poly.bounds or len(poly.bounds) != 4:
continue
if poly.type != 'MultiPolygon':
inner.append(poly)
else:
inner.extend(poly)
# Validate outer polygons
for p in outer_polys:
poly = cls.to_polygon(p)
if poly is None or not poly.bounds or len(poly.bounds) != 4:
continue
interior = []
try:
# Figure out which outer polygon contains each inner polygon
interior = [p2 for p2 in inner if poly.contains(p2)]
except TopologicalError:
continue
if interior:
# Polygon with holes constructor
poly = cls.to_polygon(p, [zip(*p2.exterior.coords.xy) for p2 in interior])
if poly is None or not poly.bounds or len(poly.bounds) != 4:
continue
# R-tree only stores the bounding box, so add the whole polygon
if poly.type != 'MultiPolygon':
index.index_polygon(poly)
multi.append(poly)
else:
for p in poly:
index.index_polygon(p)
multi.extend(poly)
if len(multi) > 1:
poly = MultiPolygon(multi)
elif multi:
poly = multi[0]
else:
continue
if index.simplify_polygons:
poly = index.simplify_polygon(poly)
index.add_polygon(poly, props)
return index
def setup(self):
self.admin_levels = []
def index_polygon_properties(self, properties):
admin_level = properties.get(self.ADMIN_LEVEL, 0)
try:
admin_level = int(admin_level)
except ValueError:
admin_level = 0
self.admin_levels.append(admin_level)
def load_polygon_properties(self, d):
self.admin_levels = json.load(open(os.path.join(d, self.ADMIN_LEVELS_FILENAME)))
def save_polygon_properties(self, d):
json.dump(self.admin_levels, open(os.path.join(d, self.ADMIN_LEVELS_FILENAME), 'w'))
def sort_level(self, i):
return self.admin_levels[i]
def get_candidate_polygons(self, lat, lon):
candidates = super(OSMReverseGeocoder, self).get_candidate_polygons(lat, lon)
return sorted(candidates, key=self.sort_level, reverse=True)
class OSMSubdivisionReverseGeocoder(OSMReverseGeocoder):
persistent_polygons = True
cache_size = 10000
simplify_polygons = False
polygon_reader = OSMSubdivisionPolygonReader
include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['landuse', 'place', 'amenity'])
class OSMBuildingReverseGeocoder(OSMReverseGeocoder):
persistent_polygons = True
cache_size = 10000
simplify_polygons = False
polygon_reader = OSMBuildingPolygonReader
include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['building', 'building:levels', 'building:part', 'addr:*'])
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-q', '--quattroshapes-dir',
help='Path to quattroshapes dir')
parser.add_argument('-a', '--osm-admin-file',
help='Path to OSM borders file (with dependencies, .osm format)')
parser.add_argument('-s', '--osm-subdivisions-file',
help='Path to OSM subdivisions file (with dependencies, .osm format)')
parser.add_argument('-b', '--osm-building-polygons-file',
help='Path to OSM building polygons file (with dependencies, .osm format)')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
help='Output directory')
logging.basicConfig(level=logging.INFO)
args = parser.parse_args()
if args.osm_admin_file:
index = OSMReverseGeocoder.create_from_osm_file(args.osm_admin_file, args.out_dir)
elif args.osm_subdivisions_file:
index = OSMSubdivisionReverseGeocoder.create_from_osm_file(args.osm_subdivisions_file, args.out_dir)
elif args.osm_building_polygons_file:
index = OSMBuildingReverseGeocoder.create_from_osm_file(args.osm_building_polygons_file, args.out_dir)
elif args.quattroshapes_dir:
index = QuattroshapesReverseGeocoder.create_with_quattroshapes(args.quattroshapes_dir, args.out_dir)
else:
parser.error('Must specify quattroshapes dir or osm admin borders file')
index.save()