Initial fork commit

This commit is contained in:
2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions

View File

View File

@@ -0,0 +1,26 @@
import pyproj
from functools import partial
from shapely.ops import transform
from shapely.geometry import Polygon
def polygon_area(poly):
    '''
    Area of a (lon, lat) polygon in square meters.

    Reprojects the polygon from WGS84 (EPSG:4326) into an Albers equal-area
    projection whose standard parallels are taken from the polygon's own
    latitude extent, then measures planar area.
    '''
    return transform(
        partial(pyproj.transform,
                pyproj.Proj(init='EPSG:4326'),
                pyproj.Proj(proj='aea',
                            lat1=poly.bounds[1],
                            # bounds is (min lon, min lat, max lon, max lat).
                            # The second standard parallel must be the max
                            # latitude, bounds[3]. The original passed
                            # bounds[2], which is the max *longitude*.
                            lat2=poly.bounds[3],
                            )
                ),
        poly
    ).area
def polygon_bounding_box_area(poly):
    '''Area in square meters of the polygon's bounding box.'''
    min_x, min_y, max_x, max_y = poly.bounds
    bbox_poly = Polygon([
        (min_x, max_y),
        (min_x, min_y),
        (max_x, min_y),
        (max_x, max_y),
    ])
    return polygon_area(bbox_poly)

View File

@@ -0,0 +1,452 @@
import fiona
import gc
import geohash
import os
import rtree
import six
import ujson as json
from collections import OrderedDict, defaultdict
from leveldb import LevelDB
from lru import LRU
from shapely.geometry import Point, Polygon, MultiPolygon
from shapely.prepared import prep
from shapely.geometry.geo import mapping
from geodata.polygons.area import polygon_bounding_box_area
# Default filenames used by PolygonIndex.save()/load() for the in-memory
# polygon dump (line-delimited GeoJSON features) and the index-level
# properties file (currently just the polygon count).
DEFAULT_POLYS_FILENAME = 'polygons.geojson'
DEFAULT_PROPS_FILENAME = 'properties.json'
class PolygonIndex(object):
    '''
    Abstract base class for point-in-polygon indexes.

    Polygon properties are always persisted in a LevelDB store. Geometries
    are either held in memory as prepared geometries (persistent_polygons
    is False), or stored in LevelDB as GeoJSON and fetched on demand,
    optionally through an LRU cache (persistent_polygons is True and
    cache_size > 0).

    Subclasses implement the actual spatial lookup via create_index,
    index_polygon, save_index, load_index and get_candidate_polygons.
    '''
    include_only_properties = None   # optional whitelist of property keys kept by add_polygon
    simplify_tolerance = 0.0001      # degrees; default tolerance for simplify_polygon
    preserve_topology = True         # default for shapely's simplify
    persistent_polygons = False      # if True, geometries live in LevelDB rather than memory
    cache_size = 0                   # LRU cache size used when persistent_polygons is True
    fix_invalid_polygons = False     # if True, to_polygon attempts to repair invalid polygons

    INDEX_FILENAME = None
    POLYGONS_DB_DIR = 'polygons'

    def __init__(self, index=None, polygons=None, polygons_db=None, save_dir=None,
                 index_filename=None,
                 polygons_db_path=None,
                 include_only_properties=None):
        if save_dir:
            self.save_dir = save_dir
        else:
            self.save_dir = None

        if not index_filename:
            index_filename = self.INDEX_FILENAME

        self.index_path = os.path.join(save_dir or '.', index_filename)

        if not index:
            self.create_index(overwrite=True)
        else:
            self.index = index

        if include_only_properties and hasattr(include_only_properties, '__contains__'):
            self.include_only_properties = include_only_properties

        if not polygons and not self.persistent_polygons:
            self.polygons = {}
        elif polygons and not self.persistent_polygons:
            self.polygons = polygons
        elif self.persistent_polygons and self.cache_size > 0:
            # LRU cache in front of the LevelDB-backed polygon store
            self.polygons = LRU(self.cache_size)
            if polygons:
                for key, value in six.iteritems(polygons):
                    self.polygons[key] = value

            self.cache_hits = 0
            self.cache_misses = 0

            # swap in the caching accessor on this instance only
            self.get_polygon = self.get_polygon_cached

        if not polygons_db_path:
            polygons_db_path = os.path.join(save_dir or '.', self.POLYGONS_DB_DIR)

        if not polygons_db:
            self.polygons_db = LevelDB(polygons_db_path)
        else:
            self.polygons_db = polygons_db

        self.setup()

        # number of polygons added so far; doubles as the next polygon id
        self.i = 0

    def create_index(self, overwrite=False):
        raise NotImplementedError('Children must implement')

    def index_polygon(self, polygon):
        raise NotImplementedError('Children must implement')

    def setup(self):
        # hook for subclasses to initialize per-polygon metadata
        pass

    def clear_cache(self, garbage_collect=True):
        '''Drop the in-memory LRU cache (persistent-polygon mode only).'''
        if self.persistent_polygons and self.cache_size > 0:
            self.polygons.clear()
            if garbage_collect:
                gc.collect()

    def simplify_polygon(self, poly, simplify_tolerance=None, preserve_topology=None):
        '''Simplify a polygon, defaulting tolerance/topology to class settings.'''
        if simplify_tolerance is None:
            simplify_tolerance = self.simplify_tolerance
        if preserve_topology is None:
            preserve_topology = self.preserve_topology
        return poly.simplify(simplify_tolerance, preserve_topology=preserve_topology)

    def index_polygon_properties(self, properties):
        # hook for subclasses to record per-polygon metadata (e.g. admin level)
        pass

    def polygon_geojson(self, poly, properties):
        # NOTE(review): properties is accepted but not embedded in the
        # feature; properties are stored separately under properties_key
        return {
            'type': 'Feature',
            'geometry': mapping(poly),
        }

    def add_polygon(self, poly, properties, cache=False, include_only_properties=None):
        '''
        Store a polygon and its properties under the next id (self.i).

        In-memory mode keeps a prepared geometry; persistent mode writes the
        GeoJSON to LevelDB (and also caches it in memory when cache=True).
        '''
        if include_only_properties is not None:
            properties = {k: v for k, v in properties.iteritems() if k in include_only_properties}

        if not self.persistent_polygons or cache:
            self.polygons[self.i] = prep(poly)

        if self.persistent_polygons:
            self.polygons_db.Put(self.polygon_key(self.i), json.dumps(self.polygon_geojson(poly, properties)))

        self.polygons_db.Put(self.properties_key(self.i), json.dumps(properties))
        self.index_polygon_properties(properties)
        self.i += 1

    @classmethod
    def create_from_shapefiles(cls, inputs, output_dir,
                               index_filename=None,
                               include_only_properties=None):
        '''Build an index from a list of shapefile paths.'''
        index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
        for input_file in inputs:
            if include_only_properties is not None:
                include_props = include_only_properties.get(input_file, cls.include_only_properties)
            else:
                include_props = cls.include_only_properties

            f = fiona.open(input_file)

            # NOTE(review): include_props is computed above but never passed
            # to add_geojson_like_file (compare create_from_geojson_files,
            # which does forward it) -- verify whether this is intentional
            index.add_geojson_like_file(f)

        return index

    @classmethod
    def fix_polygon(cls, poly):
        '''
        Coerce to valid polygon
        '''
        if not poly.is_valid:
            # buffer(0) is the standard shapely trick for repairing
            # self-intersecting rings
            poly = poly.buffer(0)
            if not poly.is_valid:
                return None
        return poly

    @classmethod
    def to_polygon(cls, coords, holes=None, test_point=None):
        '''
        Create shapely polygon from list of coordinate tuples if valid
        '''
        if not coords or len(coords) < 3:
            return None

        # Fix for polygons crossing the 180th meridian: shift negative
        # longitudes into the 180..360 range so the ring is contiguous
        lons = [lon for lon, lat in coords]
        if (max(lons) - min(lons) > 180):
            coords = [(lon + 360.0 if lon < 0 else lon, lat) for lon, lat in coords]
            if holes:
                holes = [(lon + 360.0 if lon < 0 else lon, lat) for lon, lat in holes]

        poly = Polygon(coords, holes)
        try:
            if test_point is None:
                test_point = poly.representative_point()
            invalid = cls.fix_invalid_polygons and not poly.is_valid and not poly.contains(test_point)
        except Exception:
            invalid = True

        if invalid:
            try:
                poly_fix = cls.fix_polygon(poly)
                # only accept the repaired polygon if it stayed the same
                # geometry type and still contains the test point
                if poly_fix is not None and poly_fix.bounds and len(poly_fix.bounds) == 4 and poly_fix.is_valid and poly_fix.type == poly.type:
                    if test_point is None:
                        test_point = poly_fix.representative_point()
                    if poly_fix.contains(test_point):
                        poly = poly_fix
            except Exception:
                pass

        return poly

    def add_geojson_like_record(self, rec, include_only_properties=None):
        '''Index and store a single GeoJSON-like feature (Polygon or MultiPolygon).'''
        if not rec or not rec.get('geometry') or 'type' not in rec['geometry']:
            return
        poly_type = rec['geometry']['type']
        if poly_type == 'Polygon':
            # exterior ring only; interior rings (holes) are not used here
            coords = rec['geometry']['coordinates'][0]
            poly = self.to_polygon(coords)
            if poly is None or not poly.bounds or len(poly.bounds) != 4:
                return
            self.index_polygon(poly)
            self.add_polygon(poly, rec['properties'], include_only_properties=include_only_properties)
        elif poly_type == 'MultiPolygon':
            polys = []
            poly_coords = rec['geometry']['coordinates']
            for coords in poly_coords:
                poly = self.to_polygon(coords[0])
                if poly is None or not poly.bounds or len(poly.bounds) != 4:
                    continue
                polys.append(poly)
                # each sub-polygon is indexed individually, but all are
                # stored together as one MultiPolygon
                self.index_polygon(poly)

            self.add_polygon(MultiPolygon(polys), rec['properties'], include_only_properties=include_only_properties)
        else:
            return

    def add_geojson_like_file(self, f, include_only_properties=None):
        '''
        Add either GeoJSON or a shapefile record to the index
        '''
        for rec in f:
            self.add_geojson_like_record(rec, include_only_properties=include_only_properties)

    @classmethod
    def create_from_geojson_files(cls, inputs, output_dir,
                                  index_filename=None,
                                  include_only_properties=None):
        '''Build an index from a list of GeoJSON FeatureCollection files.'''
        index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
        for input_file in inputs:
            if include_only_properties is not None:
                include_props = include_only_properties.get(input_file, cls.include_only_properties)
            else:
                include_props = cls.include_only_properties

            f = json.load(open(input_file))

            index.add_geojson_like_file(f['features'], include_only_properties=include_props)

        return index

    def compact_polygons_db(self):
        # compact the full key range of the LevelDB store
        self.polygons_db.CompactRange('\x00', '\xff')

    def save(self):
        '''Persist the index, the properties file and (if in-memory) the polygons.'''
        self.save_index()
        self.save_properties(os.path.join(self.save_dir, DEFAULT_PROPS_FILENAME))
        if not self.persistent_polygons:
            self.save_polygons(os.path.join(self.save_dir, DEFAULT_POLYS_FILENAME))
        self.compact_polygons_db()
        self.save_polygon_properties(self.save_dir)

    def load_properties(self, filename):
        properties = json.load(open(filename))
        self.i = int(properties.get('num_polygons', self.i))

    def save_properties(self, out_filename):
        # NOTE(review): handle is left for GC to close (CPython refcounting)
        out = open(out_filename, 'w')
        json.dump({'num_polygons': str(self.i)}, out)

    def save_polygons(self, out_filename):
        '''Dump all in-memory polygons as line-delimited GeoJSON features.'''
        out = open(out_filename, 'w')
        for i in xrange(self.i):
            poly = self.polygons[i]
            feature = {
                'type': 'Feature',
                # .context unwraps the prepared geometry
                'geometry': mapping(poly.context),
            }
            out.write(json.dumps(feature) + u'\n')

    def save_index(self):
        raise NotImplementedError('Children must implement')

    def load_polygon_properties(self, d):
        # hook for subclasses to restore per-polygon metadata
        pass

    def save_polygon_properties(self, d):
        # hook for subclasses to persist per-polygon metadata
        pass

    @classmethod
    def polygon_from_geojson(cls, feature):
        '''Rebuild a shapely (Multi)Polygon from a stored GeoJSON feature.'''
        poly_type = feature['geometry']['type']
        if poly_type == 'Polygon':
            coords = feature['geometry']['coordinates']
            poly = cls.to_polygon(coords[0], holes=coords[1:] or None)
            return poly
        elif poly_type == 'MultiPolygon':
            polys = []
            for coords in feature['geometry']['coordinates']:
                poly = cls.to_polygon(coords[0], holes=coords[1:] or None)
                polys.append(poly)

            return MultiPolygon(polys)

    @classmethod
    def load_polygons(cls, filename):
        '''Load the line-delimited GeoJSON dump into a dict of prepared geometries.'''
        f = open(filename)
        polygons = {}
        cls.i = 0
        for line in f:
            feature = json.loads(line.rstrip())
            polygons[cls.i] = prep(cls.polygon_from_geojson(feature))
            cls.i += 1
        return polygons

    @classmethod
    def load_index(cls, d, index_name=None):
        raise NotImplementedError('Children must implement')

    @classmethod
    def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME,
             properties_filename=DEFAULT_PROPS_FILENAME,
             polys_db_dir=POLYGONS_DB_DIR):
        '''Reload a previously saved index from directory *d*.'''
        index = cls.load_index(d, index_name=index_name or cls.INDEX_FILENAME)
        if not cls.persistent_polygons:
            polys = cls.load_polygons(os.path.join(d, polys_filename))
        else:
            polys = None
        polygons_db = LevelDB(os.path.join(d, polys_db_dir))
        polygon_index = cls(index=index, polygons=polys, polygons_db=polygons_db, save_dir=d)
        polygon_index.load_properties(os.path.join(d, properties_filename))
        polygon_index.load_polygon_properties(d)
        return polygon_index

    def get_candidate_polygons(self, lat, lon):
        raise NotImplementedError('Children must implement')

    def get_properties(self, i):
        '''Fetch the stored properties dict for polygon id *i* from LevelDB.'''
        return json.loads(self.polygons_db.Get(self.properties_key(i)))

    def get_polygon(self, i):
        # default (in-memory) accessor; replaced by get_polygon_cached
        # on instances using the persistent LRU-cached mode
        return self.polygons[i]

    def get_polygon_cached(self, i):
        '''LRU-cached accessor reading polygons from LevelDB on cache miss.'''
        poly = self.polygons.get(i, None)
        if poly is None:
            data = json.loads(self.polygons_db.Get(self.polygon_key(i)))
            poly = prep(self.polygon_from_geojson(data))
            self.polygons[i] = poly
            self.cache_misses += 1
        else:
            self.cache_hits += 1
        return poly

    def __iter__(self):
        # yields (properties, polygon) pairs in insertion order
        for i in xrange(self.i):
            yield self.get_properties(i), self.get_polygon(i)

    def __len__(self):
        return self.i

    def polygons_contain(self, candidates, point, return_all=False):
        '''
        Test candidate polygon ids for containment of *point*.

        Returns the first matching properties dict, or a list of all
        matches when return_all=True.
        '''
        containing = None
        if return_all:
            containing = []
        for i in candidates:
            poly = self.get_polygon(i)
            contains = poly.contains(point)
            if contains:
                properties = self.get_properties(i)
                if not return_all:
                    return properties
                else:
                    containing.append(properties)
        return containing

    def polygon_key(self, i):
        # LevelDB key for polygon geometry i
        return 'poly:{}'.format(i)

    def properties_key(self, i):
        # LevelDB key for polygon properties i
        return 'props:{}'.format(i)

    def point_in_poly(self, lat, lon, return_all=False):
        '''Reverse geocode: properties of the polygon(s) containing (lat, lon).'''
        candidates = self.get_candidate_polygons(lat, lon)
        point = Point(lon, lat)
        return self.polygons_contain(candidates, point, return_all=return_all)
class RTreePolygonIndex(PolygonIndex):
    '''Polygon index backed by an R-tree over polygon bounding boxes.'''

    INDEX_FILENAME = 'rtree'

    def create_index(self, overwrite=False):
        self.index = rtree.index.Index(self.index_path, overwrite=overwrite)

    def index_polygon(self, polygon):
        # only the bounding box goes into the R-tree; exact containment
        # is checked later against the full geometry
        self.index.insert(self.i, polygon.bounds)

    def get_candidate_polygons(self, lat, lon):
        query_box = (lon, lat, lon, lat)
        hits = self.index.intersection(query_box)
        # de-duplicate while preserving the R-tree result order
        return OrderedDict.fromkeys(hits).keys()

    def save_index(self):
        # need to close index before loading it
        self.index.close()

    @classmethod
    def load_index(cls, d, index_name=None):
        path = os.path.join(d, index_name or cls.INDEX_FILENAME)
        return rtree.index.Index(path)
class GeohashPolygonIndex(PolygonIndex):
    '''
    Polygon index bucketed by geohash prefixes.

    Each polygon is registered under the geohash of its bounding-box
    centroid (plus the 8 neighboring cells), at a precision chosen so that
    the 3x3 block of cells covers the polygon's bounding box.
    '''

    # Reference: https://www.elastic.co/guide/en/elasticsearch/guide/current/geohashes.html
    # (geohash precision, approximate cell area in square meters),
    # ordered finest to coarsest
    GEOHASH_PRECISIONS = [
        (7, 152.8 ** 2),
        (6, 1200.0 * 610.0),
        (5, 4900.0 * 4900.0),
        (4, 39000.0 * 19500.0),
        (3, 156000.0 * 156000.0),
        (2, 1251000.0 * 625000.0),
    ]

    INDEX_FILENAME = 'index.json'

    def create_index(self, overwrite=False):
        # geohash prefix => list of polygon ids
        self.index = defaultdict(list)

    def index_point(self, lat, lon, geohash_level):
        '''Register the current polygon id under (lat, lon)'s geohash cell and its 8 neighbors.'''
        code = geohash.encode(lat, lon)[:geohash_level]
        for key in [code] + geohash.neighbors(code):
            self.index[key].append(self.i)

    def index_polygon(self, polygon):
        '''Index a polygon under its bounding-box centroid at a suitable precision.'''
        poly_area = polygon_bounding_box_area(polygon)
        # pick the finest precision whose 3x3 cell block covers the bbox;
        # if none does, the loop falls through leaving the coarsest level
        for geohash_level, area in self.GEOHASH_PRECISIONS:
            if area * 9.0 >= poly_area:
                break

        bbox = polygon.bounds
        lat, lon = (bbox[1] + bbox[3]) / 2.0, (bbox[0] + bbox[2]) / 2.0
        self.index_point(lat, lon, geohash_level)

    def get_candidate_polygons(self, lat, lon, return_all=False):
        '''Return polygon ids from the first (finest) non-empty geohash bucket,
        or from all precision levels when return_all=True.'''
        candidates = []
        code = geohash.encode(lat, lon)
        for level, area in self.GEOHASH_PRECISIONS:
            indices = self.index.get(code[:level])
            if not indices:
                continue
            candidates.extend(indices)
            if not return_all:
                break
        return candidates

    def save_index(self):
        if not self.index_path:
            self.index_path = os.path.join(self.save_dir or '.', self.INDEX_FILENAME)
        # context manager guarantees the file is flushed and closed
        # (the original left the handle open for GC to clean up)
        with open(self.index_path, 'w') as f:
            json.dump(self.index, f)

    @classmethod
    def load_index(cls, d, index_name=None):
        with open(os.path.join(d, index_name or cls.INDEX_FILENAME)) as f:
            return json.load(f)

View File

@@ -0,0 +1,243 @@
import argparse
import operator
import os
import sys
import ujson as json
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.polygons.index import *
from geodata.i18n.languages import *
from geodata.language_id.disambiguation import disambiguate_language, AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE, WELL_REPRESENTED_LANGUAGES
# Subdirectories of the languages resource dir: per-country language data
# and regional (sub-country) language overrides
country_language_dir = os.path.join(LANGUAGES_DIR, 'countries')
regional_language_dir = os.path.join(LANGUAGES_DIR, 'regional')
class LanguagePolygonIndex(RTreePolygonIndex):
    '''
    R-tree index over Quattroshapes admin-0 / admin-1 polygons annotated
    with the languages spoken in each polygon, used to infer the language
    of a place name from its coordinates.
    '''

    DEFAULT_POLYS_FILENAME = 'polygons.geojson'
    ADMIN_LEVELS_FILENAME = 'admin_levels.json'

    # Quattroshapes property keys retained for each polygon,
    # plus the derived 'languages' and 'admin_level' keys added below
    include_only_properties = set([
        'qs_a0',
        'qs_iso_cc',
        'qs_a1',
        'qs_a1_lc',
        'qs_a1r',
        'qs_a1r_lc',
        'qs_level',
        'languages',
        'admin_level'
    ])

    @classmethod
    def create_from_shapefiles(cls,
                               admin0_shapefile,
                               admin1_shapefile,
                               admin1_region_file,
                               output_dir,
                               index_filename=None,
                               polys_filename=DEFAULT_POLYS_FILENAME):
        '''
        Build the index from the Quattroshapes admin-0, admin-1 and
        admin-1-region shapefiles, attaching a 'languages' list and an
        'admin_level' integer to every polygon's properties.
        '''
        init_languages()

        index = cls(save_dir=output_dir, index_filename=index_filename)

        i = 0

        '''
        Ordering of the files is important here as we want to match
        the most granular admin polygon first for regional languages. Currently
        most regional languages as they would apply to street signage are regional in
        terms of an admin 1 level (states, provinces, regions)
        '''
        for input_file in (admin0_shapefile, admin1_region_file, admin1_shapefile):
            f = fiona.open(input_file)

            for rec in f:
                if not rec or not rec.get('geometry') or 'type' not in rec['geometry']:
                    continue

                country = rec['properties']['qs_iso_cc'].lower()

                properties = rec['properties']

                admin_level = properties['qs_level']

                level_num = None

                # map Quattroshapes level names to the property keys that
                # identify the region by name and by code
                if admin_level == 'adm1':
                    name_key = 'qs_a1'
                    code_key = 'qs_a1_lc'
                    level_num = 1
                elif admin_level == 'adm1_region':
                    name_key = 'qs_a1r'
                    code_key = 'qs_a1r_lc'
                    level_num = 1
                elif admin_level == 'adm0':
                    level_num = 0
                else:
                    continue

                assert level_num is not None

                if admin_level != 'adm0':
                    admin1 = properties.get(name_key)
                    admin1_code = properties.get(code_key)

                    regional = None
                    # look up regional languages first by region name,
                    # then by region code
                    if name_key:
                        regional = get_regional_languages(country, name_key, admin1)
                    if code_key and not regional:
                        regional = get_regional_languages(country, code_key, admin1_code)

                    if not regional:
                        continue

                    if all((not default for lang, default in regional.iteritems())):
                        # no regional default: merge regional languages on
                        # top of the country's languages
                        languages = get_country_languages(country)
                        languages.update(regional)
                        languages = languages.items()
                    else:
                        languages = regional.items()
                else:
                    languages = get_country_languages(country).items()

                properties['languages'] = [{'lang': lang, 'default': default}
                                           for lang, default in languages]
                properties['admin_level'] = level_num

                poly_type = rec['geometry']['type']
                if poly_type == 'Polygon':
                    poly = cls.to_polygon(rec['geometry']['coordinates'][0])
                    index.index_polygon(poly)
                    poly = index.simplify_polygon(poly)
                    index.add_polygon(poly, dict(rec['properties']))
                elif poly_type == 'MultiPolygon':
                    polys = []
                    for coords in rec['geometry']['coordinates']:
                        poly = cls.to_polygon(coords[0])
                        polys.append(poly)
                        index.index_polygon(poly)

                    multi_poly = index.simplify_polygon(MultiPolygon(polys))
                    index.add_polygon(multi_poly, dict(rec['properties']))
                else:
                    continue

                i += 1

        return index

    @classmethod
    def create_with_quattroshapes(cls, quattroshapes_dir,
                                  output_dir,
                                  index_filename=None,
                                  polys_filename=DEFAULT_POLYS_FILENAME):
        '''Convenience wrapper resolving the standard Quattroshapes filenames.'''
        admin0_filename = os.path.join(quattroshapes_dir, 'qs_adm0.shp')
        admin1_filename = os.path.join(quattroshapes_dir, 'qs_adm1.shp')
        admin1r_filename = os.path.join(quattroshapes_dir, 'qs_adm1_region.shp')

        return cls.create_from_shapefiles(admin0_filename, admin1_filename, admin1r_filename,
                                          output_dir, index_filename=index_filename,
                                          polys_filename=polys_filename)

    def setup(self):
        # admin_levels[i] is the admin level of polygon id i
        self.admin_levels = []

    def index_polygon_properties(self, properties):
        self.admin_levels.append(properties['admin_level'])

    def load_polygon_properties(self, d):
        self.admin_levels = json.load(open(os.path.join(d, self.ADMIN_LEVELS_FILENAME)))

    def save_polygon_properties(self, d):
        json.dump(self.admin_levels, open(os.path.join(d, self.ADMIN_LEVELS_FILENAME), 'w'))

    def admin_level(self, i):
        return self.admin_levels[i]

    def get_candidate_polygons(self, lat, lon):
        # sort candidates most-granular-first (higher admin level first)
        candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys()
        return sorted(candidates, key=self.admin_level, reverse=True)

    def country_and_languages(self, latitude, longitude):
        '''
        Return (country_code, languages, polygon_properties) for a point,
        or (None, None, None) when no polygon contains it.
        '''
        props = self.point_in_poly(latitude, longitude, return_all=True)
        if not props:
            return None, None, None

        country = props[0]['qs_iso_cc'].lower()
        languages = []
        language_set = set()

        have_regional = False

        for p in props:
            for l in p['languages']:
                lang = l['lang']
                if lang not in language_set:
                    language_set.add(lang)
                    if p['admin_level'] > 0 and l['default']:
                        have_regional = True
                    elif have_regional:
                        # a regional default exists, so demote country-level
                        # defaults encountered afterwards
                        l = {'lang': l['lang'], 'default': 0}
                    languages.append(l)

        # Python's builtin sort is stable, so if there are two defaults, the first remains first
        # Since polygons are returned from the index ordered from smallest admin level to largest,
        # it means the default language of the region overrides the country default
        default_languages = sorted(languages, key=operator.itemgetter('default'), reverse=True)

        return country, default_languages, props

    def best_country_and_language(self, latitude, longitude, name):
        '''
        Guess which language *name* is written in at (latitude, longitude),
        returning (country_code, language) or (None, None).
        '''
        country, candidate_languages, language_props = self.country_and_languages(latitude, longitude)
        if not (country and candidate_languages):
            return None, None

        num_langs = len(candidate_languages)
        default_langs = set([l['lang'] for l in candidate_languages if l.get('default')])
        num_defaults = len(default_langs)

        regional_defaults = 0
        country_defaults = 0
        regional_langs = set()
        country_langs = set()
        for p in language_props:
            if p['admin_level'] > 0:
                regional_defaults += sum((1 for lang in p['languages'] if lang.get('default')))
                regional_langs |= set([l['lang'] for l in p['languages']])
            else:
                country_defaults += sum((1 for lang in p['languages'] if lang.get('default')))
                country_langs |= set([l['lang'] for l in p['languages']])

        if num_langs == 1:
            return country, candidate_languages[0]['lang']
        else:
            lang = disambiguate_language(name, [(l['lang'], l['default']) for l in candidate_languages])
            default_lang = candidate_languages[0]['lang']

            if lang == UNKNOWN_LANGUAGE and num_defaults == 1:
                return country, default_lang
            elif lang == AMBIGUOUS_LANGUAGE:
                return country, lang
            elif lang != UNKNOWN_LANGUAGE:
                # when a well-represented country-level language wins in a
                # region that has its own default, treat the guess as unknown
                if lang != default_lang and lang in country_langs and country_defaults > 1 and regional_defaults > 0 and lang in WELL_REPRESENTED_LANGUAGES:
                    return country, UNKNOWN_LANGUAGE
                return country, lang
            else:
                return country, lang
if __name__ == '__main__':
    # Build the language polygon index from a Quattroshapes directory
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--quattroshapes-dir',
                        # required: without it the original crashed later in
                        # os.path.join(None, ...) with an obscure TypeError
                        required=True,
                        help='Path to quattroshapes dir')

    parser.add_argument('-o', '--out-dir',
                        default=os.getcwd(),
                        help='Output directory')

    args = parser.parse_args()

    index = LanguagePolygonIndex.create_with_quattroshapes(args.quattroshapes_dir, args.out_dir)
    index.save()

View File

@@ -0,0 +1,589 @@
'''
reverse_geocode.py
-------------------
In-memory reverse geocoder using polygons from Quattroshapes or OSM.
This should be useful for filling in the blanks both in constructing
training data from OSM addresses and for OpenVenues.
Usage:
python reverse_geocode.py -o /data/quattroshapes/rtree/reverse -q /data/quattroshapes/
python reverse_geocode.py -o /data/quattroshapes/rtree/reverse -a /data/osm/planet-admin-borders.osm
'''
import argparse
import logging
import operator
import os
import re
import requests
import shutil
import six
import subprocess
import sys
import tempfile
from functools import partial
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.countries.constants import Countries
from geodata.encoding import safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.i18n.unicode_properties import get_chars_by_script
from geodata.i18n.languages import *
from geodata.i18n.word_breaks import ideographic_scripts
from geodata.names.deduping import NameDeduper
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.osm.admin_boundaries import *
from geodata.polygons.index import *
from geodata.statistics.tf_idf import IDFIndex
from geodata.text.tokenize import tokenize, token_types
from geodata.text.normalize import *
from shapely.topology import TopologicalError
decode_latin1 = partial(safe_decode, encoding='latin1')
def str_id(v):
    '''Return *v* as a decimal string, or None for non-positive ids.'''
    numeric = int(v)
    return str(numeric) if numeric > 0 else None
class QuattroshapesReverseGeocoder(RTreePolygonIndex):
    '''
    Quattroshapes polygons, for levels up to localities, are relatively
    accurate and provide concordance with GeoPlanet and in some cases
    GeoNames (which is used in other parts of this project).
    '''
    # Quattroshapes shapefile names, one per admin level
    COUNTRIES_FILENAME = 'qs_adm0.shp'
    ADMIN1_FILENAME = 'qs_adm1.shp'
    ADMIN1_REGION_FILENAME = 'qs_adm1_region.shp'
    ADMIN2_FILENAME = 'qs_adm2.shp'
    ADMIN2_REGION_FILENAME = 'qs_adm2_region.shp'
    LOCAL_ADMIN_FILENAME = 'qs_localadmin.shp'
    LOCALITIES_FILENAME = 'qs_localities.shp'
    NEIGHBORHOODS_FILENAME = 'qs_neighborhoods.shp'

    # qs_level values
    COUNTRY = 'adm0'
    ADMIN1 = 'adm1'
    ADMIN1_REGION = 'adm1_region'
    ADMIN2 = 'adm2'
    ADMIN2_REGION = 'adm2_region'
    LOCAL_ADMIN = 'localadmin'
    LOCALITY = 'locality'
    NEIGHBORHOOD = 'neighborhood'

    PRIORITIES_FILENAME = 'priorities.json'

    persistent_polygons = True
    cache_size = 100000

    # coarsest-to-finest ordering used to rank candidate polygons
    sorted_levels = (COUNTRY,
                     ADMIN1_REGION,
                     ADMIN1,
                     ADMIN2_REGION,
                     ADMIN2,
                     LOCAL_ADMIN,
                     LOCALITY,
                     NEIGHBORHOOD,
                     )

    # qs_level => sort priority (position in sorted_levels)
    sort_levels = {k: i for i, k in enumerate(sorted_levels)}

    # normalized property names produced by the alias table below
    NAME = 'name'
    CODE = 'code'
    LEVEL = 'level'
    GEONAMES_ID = 'geonames_id'
    WOE_ID = 'woe_id'

    # per-file alias table: normalized key => (source property, converter).
    # Converters decode to unicode or coerce ids; qs_a2 names are
    # latin1-encoded, hence decode_latin1.
    polygon_properties = {
        COUNTRIES_FILENAME: {
            NAME: ('qs_a0', safe_decode),
            CODE: ('qs_iso_cc', safe_decode),
            LEVEL: ('qs_level', safe_decode),
            GEONAMES_ID: ('qs_gn_id', str_id),
            WOE_ID: ('qs_woe_id', str_id),
        },
        ADMIN1_FILENAME: {
            NAME: ('qs_a1', safe_decode),
            CODE: ('qs_a1_lc', safe_decode),
            LEVEL: ('qs_level', safe_decode),
            GEONAMES_ID: ('qs_gn_id', str_id),
            WOE_ID: ('qs_woe_id', str_id),
        },
        ADMIN1_REGION_FILENAME: {
            NAME: ('qs_a1r', safe_decode),
            CODE: ('qs_a1r_lc', safe_decode),
            LEVEL: ('qs_level', safe_decode),
            GEONAMES_ID: ('qs_gn_id', str_id),
            WOE_ID: ('qs_woe_id', str_id),
        },
        ADMIN2_FILENAME: {
            NAME: ('qs_a2', decode_latin1),
            CODE: ('qs_a2_lc', safe_decode),
            LEVEL: ('qs_level', safe_decode),
            GEONAMES_ID: ('qs_gn_id', str_id),
            WOE_ID: ('qs_woe_id', str_id),
        },
        ADMIN2_REGION_FILENAME: {
            NAME: ('qs_a2r', safe_decode),
            CODE: ('qs_a2r_lc', safe_decode),
            LEVEL: ('qs_level', safe_decode),
            GEONAMES_ID: ('qs_gn_id', str_id),
            WOE_ID: ('qs_woe_id', str_id),
        },
        LOCAL_ADMIN_FILENAME: {
            NAME: ('qs_la', safe_decode),
            CODE: ('qs_la_lc', safe_decode),
            LEVEL: ('qs_level', safe_decode),
            GEONAMES_ID: ('qs_gn_id', str_id),
            WOE_ID: ('qs_woe_id', str_id),
        },
        LOCALITIES_FILENAME: {
            NAME: ('qs_loc', safe_decode),
            LEVEL: ('qs_level', safe_decode),
            # NOTE(review): localities use plain str, not str_id,
            # so non-positive ids are kept -- verify intent
            GEONAMES_ID: ('qs_gn_id', str),
            WOE_ID: ('qs_woe_id', str),
        },
        NEIGHBORHOODS_FILENAME: {
            NAME: ('name', safe_decode),
            CODE: ('name_en', safe_decode),
            LEVEL: ('qs_level', safe_decode),
            GEONAMES_ID: ('gn_id', str_id),
            WOE_ID: ('woe_id', str_id),
        }
    }

    @classmethod
    def create_from_shapefiles(cls,
                               input_files,
                               output_dir,
                               index_filename=None,
                               polys_filename=DEFAULT_POLYS_FILENAME,
                               use_all_props=False):
        '''
        Build the reverse geocoder from a list of Quattroshapes shapefiles,
        normalizing each record's properties through the alias table.
        '''
        index = cls(save_dir=output_dir, index_filename=index_filename)

        for input_file in input_files:
            f = fiona.open(input_file)

            filename = os.path.split(input_file)[-1]

            aliases = cls.polygon_properties.get(filename)

            if not use_all_props:
                include_props = aliases
            else:
                include_props = None

            for rec in f:
                if not rec or not rec.get('geometry') or 'type' not in rec['geometry']:
                    continue

                properties = rec['properties']

                if filename == cls.NEIGHBORHOODS_FILENAME:
                    # the neighborhoods file has no qs_level column
                    properties['qs_level'] = 'neighborhood'

                have_all_props = False
                # for/else: have_all_props is set only if no converter raised
                for k, (prop, func) in aliases.iteritems():
                    v = properties.get(prop, None)
                    if v is not None:
                        try:
                            properties[k] = func(v)
                        except Exception:
                            break
                else:
                    have_all_props = True

                if not have_all_props or not properties.get(cls.NAME):
                    continue

                poly_type = rec['geometry']['type']
                if poly_type == 'Polygon':
                    poly = cls.to_polygon(rec['geometry']['coordinates'][0])
                    index.index_polygon(poly)
                    poly = index.simplify_polygon(poly)
                    index.add_polygon(poly, dict(rec['properties']), include_only_properties=include_props)
                elif poly_type == 'MultiPolygon':
                    polys = []
                    for coords in rec['geometry']['coordinates']:
                        poly = cls.to_polygon(coords[0])
                        polys.append(poly)
                        index.index_polygon(poly)

                    multi_poly = index.simplify_polygon(MultiPolygon(polys))
                    index.add_polygon(multi_poly, dict(rec['properties']), include_only_properties=include_props)
                else:
                    continue

        return index

    @classmethod
    def create_with_quattroshapes(cls, quattroshapes_dir,
                                  output_dir,
                                  index_filename=None,
                                  polys_filename=DEFAULT_POLYS_FILENAME):
        '''Convenience wrapper resolving the standard Quattroshapes filenames
        (neighborhoods and localadmin are not included here).'''
        admin0_filename = os.path.join(quattroshapes_dir, cls.COUNTRIES_FILENAME)
        admin1_filename = os.path.join(quattroshapes_dir, cls.ADMIN1_FILENAME)
        admin1r_filename = os.path.join(quattroshapes_dir, cls.ADMIN1_REGION_FILENAME)
        admin2_filename = os.path.join(quattroshapes_dir, cls.ADMIN2_FILENAME)
        admin2r_filename = os.path.join(quattroshapes_dir, cls.ADMIN2_REGION_FILENAME)
        localities_filename = os.path.join(quattroshapes_dir, cls.LOCALITIES_FILENAME)

        return cls.create_from_shapefiles([admin0_filename, admin1_filename, admin1r_filename,
                                           admin2_filename, admin2r_filename,
                                           localities_filename],
                                          output_dir, index_filename=index_filename,
                                          polys_filename=polys_filename)

    def setup(self):
        # priorities[i] is the sort priority of polygon id i
        self.priorities = []

    def index_polygon_properties(self, properties):
        self.priorities.append(self.sort_levels.get(properties[self.LEVEL], 0))

    def load_polygon_properties(self, d):
        self.priorities = json.load(open(os.path.join(d, self.PRIORITIES_FILENAME)))

    def save_polygon_properties(self, d):
        json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w'))

    def sort_level(self, i):
        return self.priorities[i]

    def get_candidate_polygons(self, lat, lon):
        # rank candidates most-granular-first (neighborhood before country)
        candidates = super(QuattroshapesReverseGeocoder, self).get_candidate_polygons(lat, lon)
        return sorted(candidates, key=self.sort_level, reverse=True)
class OSMReverseGeocoder(RTreePolygonIndex):
    '''
    OSM has among the best, most carefully-crafted, accurate administrative
    polygons in the business in addition to using local language naming
    conventions which is desirable for creating a truly multilingual address
    parser.

    The storage of these polygons is byzantine. See geodata.osm.osm_admin_boundaries
    for more details.

    Suffice to say, this reverse geocoder builds an R-tree index on OSM planet
    in a reasonable amount of memory using arrays of C integers and binary search
    for the dependency lookups and Tarjan's algorithm for finding strongly connected
    components to stitch together the polygons.
    '''
    ADMIN_LEVEL = 'admin_level'

    ADMIN_LEVELS_FILENAME = 'admin_levels.json'

    polygon_reader = OSMAdminPolygonReader

    persistent_polygons = True
    # Cache almost everything
    cache_size = 250000

    simplify_polygons = False

    fix_invalid_polygons = True

    # OSM tag keys to retain; entries ending in ':*' match any suffixed
    # variant (e.g. 'name:*' matches 'name:fr')
    include_property_patterns = set([
        'id',
        'type',
        'name',
        'name:*',
        'ISO3166-1:alpha2',
        'ISO3166-1:alpha3',
        'ISO3166-2',
        'int_name',
        'official_name',
        'official_name:*',
        'alt_name',
        'alt_name:*',
        'short_name',
        'short_name:*',
        'admin_level',
        'place',
        'population',
        'designation',
        'wikipedia',
        'wikipedia:*',
    ])

    @classmethod
    def create_from_osm_file(cls, filename, output_dir,
                             index_filename=None,
                             polys_filename=DEFAULT_POLYS_FILENAME):
        '''
        Given an OSM file (planet or some other bounds) containing relations
        and their dependencies, create an R-tree index for coarse-grained
        reverse geocoding.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other bounds.
        '''
        index = cls(save_dir=output_dir, index_filename=index_filename)

        reader = cls.polygon_reader(filename)
        polygons = reader.polygons()

        logger = logging.getLogger('osm.reverse_geocode')

        for element_id, props, admin_center, outer_polys, inner_polys in polygons:
            # keep only whitelisted tags (exact match or 'prefix:*' pattern)
            props = {k: v for k, v in six.iteritems(props)
                     if k in cls.include_property_patterns or (six.u(':') in k and
                     six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}

            id_type, element_id = osm_type_and_id(element_id)

            test_point = None

            if admin_center:
                admin_center_props = {k: v for k, v in six.iteritems(admin_center)
                                      if k in ('id', 'type', 'lat', 'lon') or k in cls.include_property_patterns or (six.u(':') in k and
                                      six.u('{}:*').format(k.split(six.u(':'), 1)[0]) in cls.include_property_patterns)}
                if cls.fix_invalid_polygons:
                    # the relation's admin_centre node is a good test point
                    # for validating repaired polygons
                    center_lat, center_lon = latlon_to_decimal(admin_center_props['lat'], admin_center_props['lon'])
                    test_point = Point(center_lon, center_lat)
                props['admin_center'] = admin_center_props

            if inner_polys and not outer_polys:
                logger.warn('inner polygons with no outer')
                continue

            if len(outer_polys) == 1 and not inner_polys:
                # simple case: one outer ring, no holes
                poly = cls.to_polygon(outer_polys[0])
                if poly is None or not poly.bounds or len(poly.bounds) != 4:
                    continue
                if poly.type != 'MultiPolygon':
                    index.index_polygon(poly)
                else:
                    for p in poly:
                        index.index_polygon(p)
            else:
                multi = []
                inner = []
                # Validate inner polygons (holes)
                for p in inner_polys:
                    poly = cls.to_polygon(p)
                    if poly is None or not poly.bounds or len(poly.bounds) != 4 or not poly.is_valid:
                        continue
                    if poly.type != 'MultiPolygon':
                        inner.append(poly)
                    else:
                        inner.extend(poly)

                # Validate outer polygons
                for p in outer_polys:
                    poly = cls.to_polygon(p, test_point=test_point)
                    if poly is None or not poly.bounds or len(poly.bounds) != 4:
                        continue

                    interior = []
                    try:
                        # Figure out which outer polygon contains each inner polygon
                        interior = [p2 for p2 in inner if poly.contains(p2)]
                    except TopologicalError:
                        continue

                    if interior:
                        # Polygon with holes constructor
                        poly = cls.to_polygon(p, [zip(*p2.exterior.coords.xy) for p2 in interior], test_point=test_point)
                        if poly is None or not poly.bounds or len(poly.bounds) != 4:
                            continue
                    # R-tree only stores the bounding box, so add the whole polygon
                    if poly.type != 'MultiPolygon':
                        index.index_polygon(poly)
                        multi.append(poly)
                    else:
                        for p in poly:
                            index.index_polygon(p)
                        multi.extend(poly)

                if len(multi) > 1:
                    poly = MultiPolygon(multi)
                elif multi:
                    poly = multi[0]
                else:
                    continue

            if index.simplify_polygons:
                poly = index.simplify_polygon(poly)
            index.add_polygon(poly, props)

        return index

    def setup(self):
        # admin_levels[i] is the (integer) admin_level of polygon id i
        self.admin_levels = []

    def index_polygon_properties(self, properties):
        admin_level = properties.get(self.ADMIN_LEVEL, 0)
        try:
            admin_level = int(admin_level)
        except ValueError:
            # OSM admin_level is free-text; fall back to 0 when non-numeric
            admin_level = 0
        self.admin_levels.append(admin_level)

    def load_polygon_properties(self, d):
        self.admin_levels = json.load(open(os.path.join(d, self.ADMIN_LEVELS_FILENAME)))

    def save_polygon_properties(self, d):
        json.dump(self.admin_levels, open(os.path.join(d, self.ADMIN_LEVELS_FILENAME), 'w'))

    def sort_level(self, i):
        return self.admin_levels[i]

    def get_candidate_polygons(self, lat, lon):
        # rank candidates most-granular-first (higher admin_level first)
        candidates = super(OSMReverseGeocoder, self).get_candidate_polygons(lat, lon)
        return sorted(candidates, key=self.sort_level, reverse=True)
class OSMSubdivisionReverseGeocoder(OSMReverseGeocoder):
    """Polygon index over OSM subdivision polygons.

    Keeps polygons on disk (persistent) with an LRU cache, skips
    simplification, and additionally retains landuse/place/amenity tags.
    """
    persistent_polygons = True
    cache_size = 10000
    simplify_polygons = False
    polygon_reader = OSMSubdivisionPolygonReader
    include_property_patterns = OSMReverseGeocoder.include_property_patterns | {'landuse', 'place', 'amenity'}
class OSMBuildingReverseGeocoder(OSMReverseGeocoder):
    """Polygon index over OSM building polygons.

    Persistent, LRU-cached, unsimplified; invalid polygons are left
    untouched. Additionally retains building and address tags.
    """
    persistent_polygons = True
    cache_size = 10000
    simplify_polygons = False
    fix_invalid_polygons = False
    polygon_reader = OSMBuildingPolygonReader
    include_property_patterns = OSMReverseGeocoder.include_property_patterns | {'building', 'building:levels', 'building:part', 'addr:*'}
class OSMPostalCodeReverseGeocoder(OSMReverseGeocoder):
    """Polygon index over OSM postal code polygons.

    Persistent, LRU-cached, unsimplified; additionally retains the
    postal_code tag.
    """
    persistent_polygons = True
    cache_size = 10000
    simplify_polygons = False
    polygon_reader = OSMPostalCodesPolygonReader
    include_property_patterns = OSMReverseGeocoder.include_property_patterns | {'postal_code'}
class OSMAirportReverseGeocoder(OSMReverseGeocoder):
    """Polygon index over OSM airport polygons.

    Persistent, LRU-cached, unsimplified; additionally retains airport
    identification tags (IATA code, aerodrome info, city served).
    """
    persistent_polygons = True
    cache_size = 10000
    simplify_polygons = False
    polygon_reader = OSMAirportsPolygonReader
    include_property_patterns = OSMReverseGeocoder.include_property_patterns | {'iata', 'aerodrome', 'aerodrome:type', 'city_served'}
class OSMCountryReverseGeocoder(OSMReverseGeocoder):
    """Polygon index over OSM country polygons.

    Adds helpers for resolving a point (or a list of OSM components) to an
    ISO 3166-1 alpha-2 country code plus that country's languages.
    """
    persistent_polygons = True
    cache_size = 10000
    simplify_polygons = False
    polygon_reader = OSMCountryPolygonReader

    @classmethod
    def country_and_languages_from_components(cls, osm_components):
        """Derive (country, languages) from a list of OSM component dicts.

        :param osm_components: iterable of OSM tag dicts, e.g. the result
            of point_in_poly(lat, lon, return_all=True)
        :return: tuple of (lowercased ISO 3166-1 alpha-2 code or None,
            list of (language, is_default) pairs sorted with default
            languages first). Returns (None, []) when no country can be
            determined.
        """
        country = None
        for c in osm_components:
            country = c.get('ISO3166-1:alpha2')
            if country:
                break
        else:
            # No component carries an ISO3166-1 tag. See if there's an
            # ISO3166-2 (region) code whose two-letter prefix names the
            # country, in case the country polygon is wacky.
            for c in osm_components:
                admin1 = c.get('ISO3166-2')
                if admin1:
                    country = admin1[:2]
                    if not Countries.is_valid_country_code(country.lower()):
                        return None, []
                    break

        # Bug fix: previously fell through to country.lower() and raised
        # AttributeError when no component had either ISO3166 tag.
        if not country:
            return None, []

        country = country.lower()

        # Prefer the first component that defines regional languages,
        # falling back to the country-level language list.
        regional = None
        for c in osm_components:
            place_id = '{}:{}'.format(c.get('type', 'relation'), c.get('id', '0'))
            regional = get_regional_languages(country, 'osm', place_id)
            if regional:
                break

        if not regional:
            languages = get_country_languages(country).items()
        elif not all(regional.values()):
            # Regional data is partial (some languages flagged non-default):
            # overlay it on the full country-level mapping.
            # NOTE(review): this mutates the dict returned by
            # get_country_languages — OK only if that returns a fresh
            # dict per call; verify.
            languages = get_country_languages(country)
            languages.update(regional)
            languages = languages.items()
        else:
            languages = regional.items()

        default_languages = sorted(languages, key=operator.itemgetter(1), reverse=True)
        return country, default_languages

    def country_and_languages(self, lat, lon):
        """Resolve (lat, lon) to (country, languages) via the polygon index."""
        osm_components = self.point_in_poly(lat, lon, return_all=True)
        return self.country_and_languages_from_components(osm_components)
if __name__ == '__main__':
    # Command-line entry point: build exactly one polygon index from the
    # first input option supplied (precedence matches the option order below).
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--quattroshapes-dir',
                        help='Path to quattroshapes dir')
    parser.add_argument('-a', '--osm-admin-file',
                        help='Path to OSM borders file (with dependencies, .osm format)')
    parser.add_argument('-s', '--osm-subdivisions-file',
                        help='Path to OSM subdivisions file (with dependencies, .osm format)')
    parser.add_argument('-b', '--osm-building-polygons-file',
                        help='Path to OSM building polygons file (with dependencies, .osm format)')
    parser.add_argument('-p', '--osm-postal-code-polygons-file',
                        help='Path to OSM postal code polygons file (with dependencies, .osm format)')
    parser.add_argument('-r', '--osm-airport-polygons-file',
                        help='Path to OSM airport polygons file (with dependencies, .osm format)')
    parser.add_argument('-c', '--osm-country-polygons-file',
                        help='Path to OSM country polygons file (with dependencies, .osm format)')
    parser.add_argument('-o', '--out-dir',
                        default=os.getcwd(),
                        help='Output directory')

    logging.basicConfig(level=logging.INFO)
    args = parser.parse_args()

    # (input path, builder) pairs in precedence order; the first
    # non-empty input wins.
    for source, build in (
        (args.osm_admin_file, OSMReverseGeocoder.create_from_osm_file),
        (args.osm_subdivisions_file, OSMSubdivisionReverseGeocoder.create_from_osm_file),
        (args.osm_building_polygons_file, OSMBuildingReverseGeocoder.create_from_osm_file),
        (args.osm_country_polygons_file, OSMCountryReverseGeocoder.create_from_osm_file),
        (args.osm_postal_code_polygons_file, OSMPostalCodeReverseGeocoder.create_from_osm_file),
        (args.osm_airport_polygons_file, OSMAirportReverseGeocoder.create_from_osm_file),
        (args.quattroshapes_dir, QuattroshapesReverseGeocoder.create_with_quattroshapes),
    ):
        if source:
            index = build(source, args.out_dir)
            break
    else:
        # parser.error() exits, so index is always bound past this point
        parser.error('Must specify quattroshapes dir or osm admin borders file')

    index.save()