# libpostal/scripts/geodata/polygons/index.py
# (originally committed 2016-07-21; ~417 lines, 14 KiB, Python)
import fiona
import geohash
import os
import rtree
import six
import ujson as json
from collections import OrderedDict, defaultdict
from leveldb import LevelDB
from lru import LRU
from shapely.geometry import Point, Polygon, MultiPolygon
from shapely.prepared import prep
from shapely.geometry.geo import mapping
from geodata.polygons.area import polygon_bounding_box_area
# Default filenames (within an index's save_dir) for the line-delimited
# GeoJSON polygon dump and the JSON metadata file, respectively
DEFAULT_POLYS_FILENAME = 'polygons.geojson'
DEFAULT_PROPS_FILENAME = 'properties.json'
class PolygonIndex(object):
    '''
    Base class for point-in-polygon indexes.

    Maintains a collection of polygons, each identified by an integer id
    (self.i, assigned in insertion order), with per-polygon properties stored
    in a LevelDB database. Polygon geometries are kept either in memory as
    prepared geometries or persisted to LevelDB (persistent_polygons=True),
    optionally fronted by an LRU cache.

    Subclasses supply the spatial index itself by implementing create_index,
    index_polygon, get_candidate_polygons, save_index and load_index.
    '''

    # optional whitelist of feature properties to store per polygon
    include_only_properties = None
    # tolerance (in degrees) passed to shapely's simplify
    simplify_tolerance = 0.0001
    preserve_topology = True
    # when True, polygon geometries are stored in LevelDB instead of memory
    persistent_polygons = False
    # number of prepared polygons kept in the LRU cache (0 = no cache)
    cache_size = 0

    INDEX_FILENAME = None
    POLYGONS_DB_DIR = 'polygons'

    def __init__(self, index=None, polygons=None, polygons_db=None, save_dir=None,
                 index_filename=None,
                 polygons_db_path=None,
                 include_only_properties=None):
        if save_dir:
            self.save_dir = save_dir
        else:
            self.save_dir = None

        if not index_filename:
            index_filename = self.INDEX_FILENAME

        self.index_path = os.path.join(save_dir or '.', index_filename)

        if not index:
            self.create_index(overwrite=True)
        else:
            self.index = index

        if include_only_properties and hasattr(include_only_properties, '__contains__'):
            self.include_only_properties = include_only_properties

        if not polygons and not self.persistent_polygons:
            self.polygons = {}
        elif polygons and not self.persistent_polygons:
            self.polygons = polygons
        elif self.persistent_polygons and self.cache_size > 0:
            self.polygons = LRU(self.cache_size)
            if polygons:
                for key, value in six.iteritems(polygons):
                    self.polygons[key] = value
            self.cache_hits = 0
            self.cache_misses = 0
            # swap in the caching accessor when an LRU cache is configured
            self.get_polygon = self.get_polygon_cached

        if not polygons_db_path:
            polygons_db_path = os.path.join(save_dir or '.', self.POLYGONS_DB_DIR)

        if not polygons_db:
            self.polygons_db = LevelDB(polygons_db_path)
        else:
            self.polygons_db = polygons_db

        self.setup()

        # id assigned to the next polygon added (also the polygon count)
        self.i = 0

    def create_index(self, overwrite=False):
        '''Create the underlying spatial index.'''
        raise NotImplementedError('Children must implement')

    def index_polygon(self, polygon):
        '''Add a polygon's extent to the spatial index under id self.i.'''
        raise NotImplementedError('Children must implement')

    def setup(self):
        '''Hook for subclass initialization, called at the end of __init__.'''
        pass

    def simplify_polygon(self, poly, simplify_tolerance=None, preserve_topology=None):
        '''Return a simplified copy of poly; defaults come from the
        class-level simplify_tolerance / preserve_topology attributes.'''
        if simplify_tolerance is None:
            simplify_tolerance = self.simplify_tolerance
        if preserve_topology is None:
            preserve_topology = self.preserve_topology
        return poly.simplify(simplify_tolerance, preserve_topology=preserve_topology)

    def index_polygon_properties(self, properties):
        '''Hook for subclasses to index per-polygon properties.'''
        pass

    def polygon_geojson(self, poly, properties):
        '''GeoJSON representation of a polygon as persisted to LevelDB.
        NOTE(review): the properties argument is currently unused, so only
        the geometry is stored here; properties are stored separately under
        properties_key.'''
        return {
            'type': 'Feature',
            'geometry': mapping(poly),
        }

    def add_polygon(self, poly, properties, cache=False, include_only_properties=None):
        '''Store a polygon and its properties under the next id and increment
        the id counter. index_polygon must be called first: both rely on the
        current value of self.i.'''
        if include_only_properties is not None:
            # Bug fix: properties.iteritems() was Python 2-only; use six
            # for consistency with the rest of this module
            properties = {k: v for k, v in six.iteritems(properties) if k in include_only_properties}

        if not self.persistent_polygons or cache:
            self.polygons[self.i] = prep(poly)

        if self.persistent_polygons:
            self.polygons_db.Put(self.polygon_key(self.i), json.dumps(self.polygon_geojson(poly, properties)))

        self.polygons_db.Put(self.properties_key(self.i), json.dumps(properties))
        self.index_polygon_properties(properties)
        self.i += 1

    @classmethod
    def create_from_shapefiles(cls, inputs, output_dir,
                               index_filename=None,
                               include_only_properties=None):
        '''
        Build an index from one or more shapefiles.

        include_only_properties, if given, maps input filename to the
        collection of feature properties to retain for that file.
        '''
        index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
        for input_file in inputs:
            if include_only_properties is not None:
                include_props = include_only_properties.get(input_file, cls.include_only_properties)
            else:
                include_props = cls.include_only_properties

            f = fiona.open(input_file)
            try:
                # Bug fix: include_props was computed but never passed
                # through, so per-file property filters were silently
                # ignored (create_from_geojson_files does pass it)
                index.add_geojson_like_file(f, include_only_properties=include_props)
            finally:
                # Bug fix: close the fiona collection when done
                f.close()

        return index

    @classmethod
    def fix_polygon(cls, poly):
        '''
        Coerce to valid polygon, returning None if it cannot be repaired
        '''
        if not poly.is_valid:
            # buffer(0) is the standard shapely trick for fixing
            # self-intersections
            poly = poly.buffer(0)
            if not poly.is_valid:
                return None
        return poly

    @classmethod
    def to_polygon(cls, coords):
        '''
        Create shapely polygon from list of coordinate tuples if valid
        (a polygon needs at least 3 points)
        '''
        if not coords or len(coords) < 3:
            return None
        poly = Polygon(coords)
        return poly

    def add_geojson_like_record(self, rec, include_only_properties=None):
        '''Index a single GeoJSON-like feature. Only Polygon and
        MultiPolygon geometries are handled; anything else is ignored.'''
        if not rec or not rec.get('geometry') or 'type' not in rec['geometry']:
            return
        poly_type = rec['geometry']['type']
        if poly_type == 'Polygon':
            # exterior ring only; interior rings (holes) are ignored
            coords = rec['geometry']['coordinates'][0]
            poly = self.to_polygon(coords)
            if poly is None or not poly.bounds or len(poly.bounds) != 4:
                return
            self.index_polygon(poly)
            self.add_polygon(poly, rec['properties'], include_only_properties=include_only_properties)
        elif poly_type == 'MultiPolygon':
            polys = []
            poly_coords = rec['geometry']['coordinates']
            for coords in poly_coords:
                poly = self.to_polygon(coords[0])
                if poly is None or not poly.bounds or len(poly.bounds) != 4:
                    continue
                polys.append(poly)
                # each sub-polygon gets its own spatial-index entry, all
                # pointing at the same id (self.i is bumped once below)
                self.index_polygon(poly)

            self.add_polygon(MultiPolygon(polys), rec['properties'], include_only_properties=include_only_properties)
        else:
            return

    def add_geojson_like_file(self, f, include_only_properties=None):
        '''
        Add either GeoJSON or shapefile records to the index, one feature
        at a time
        '''
        for rec in f:
            self.add_geojson_like_record(rec, include_only_properties=include_only_properties)

    @classmethod
    def create_from_geojson_files(cls, inputs, output_dir,
                                  index_filename=None,
                                  polys_filename=DEFAULT_POLYS_FILENAME,
                                  include_only_properties=None):
        '''Build an index from GeoJSON FeatureCollection files.'''
        index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
        for input_file in inputs:
            if include_only_properties is not None:
                include_props = include_only_properties.get(input_file, cls.include_only_properties)
            else:
                include_props = cls.include_only_properties

            # Bug fix: close the input file handle when done
            with open(input_file) as f:
                geojson = json.load(f)
            index.add_geojson_like_file(geojson['features'], include_only_properties=include_props)
        return index

    def compact_polygons_db(self):
        '''Compact the full LevelDB key range after bulk writes.'''
        self.polygons_db.CompactRange('\x00', '\xff')

    def save(self):
        '''Persist index, metadata and (when not persistent) the polygons to
        self.save_dir. Assumes save_dir was provided at construction.'''
        self.save_index()
        self.save_properties(os.path.join(self.save_dir, DEFAULT_PROPS_FILENAME))
        if not self.persistent_polygons:
            self.save_polygons(os.path.join(self.save_dir, DEFAULT_POLYS_FILENAME))
        self.compact_polygons_db()
        self.save_polygon_properties(self.save_dir)

    def load_properties(self, filename):
        '''Restore index metadata (the polygon count) from a JSON file.'''
        # Bug fix: close the file handle when done
        with open(filename) as f:
            properties = json.load(f)
        self.i = int(properties.get('num_polygons', self.i))

    def save_properties(self, out_filename):
        '''Write index metadata (the polygon count) as JSON.'''
        # Bug fix: close (and flush) the file handle when done
        with open(out_filename, 'w') as out:
            json.dump({'num_polygons': str(self.i)}, out)

    def save_polygons(self, out_filename):
        '''Write all in-memory polygons as line-delimited GeoJSON features.'''
        # Bug fix: close (and flush) the file handle when done
        with open(out_filename, 'w') as out:
            for i in six.moves.range(self.i):
                poly = self.polygons[i]
                feature = {
                    'type': 'Feature',
                    # prepared geometries keep the original in .context
                    'geometry': mapping(poly.context),
                }
                out.write(json.dumps(feature) + u'\n')

    def save_index(self):
        '''Persist the spatial index itself.'''
        raise NotImplementedError('Children must implement')

    def load_polygon_properties(self, d):
        '''Hook for subclasses to restore indexed properties.'''
        pass

    def save_polygon_properties(self, d):
        '''Hook for subclasses to persist indexed properties.'''
        pass

    @classmethod
    def polygon_from_geojson(cls, feature):
        '''Build a shapely Polygon/MultiPolygon from a GeoJSON feature
        (exterior rings only). Returns None for other geometry types.'''
        poly_type = feature['geometry']['type']
        if poly_type == 'Polygon':
            poly = Polygon(feature['geometry']['coordinates'][0])
            return poly
        elif poly_type == 'MultiPolygon':
            polys = []
            for coords in feature['geometry']['coordinates']:
                poly = Polygon(coords[0])
                polys.append(poly)

            return MultiPolygon(polys)

    @classmethod
    def load_polygons(cls, filename):
        '''Load prepared polygons from a line-delimited GeoJSON file, keyed
        by line number. NOTE: sets cls.i as a side effect; load() later
        overwrites the instance's i from the properties file.'''
        polygons = {}
        cls.i = 0
        # Bug fix: close the file handle when done
        with open(filename) as f:
            for line in f:
                feature = json.loads(line.rstrip())
                polygons[cls.i] = prep(cls.polygon_from_geojson(feature))
                cls.i += 1
        return polygons

    @classmethod
    def load_index(cls, d, index_name=None):
        '''Load the spatial index from directory d.'''
        raise NotImplementedError('Children must implement')

    @classmethod
    def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME,
             properties_filename=DEFAULT_PROPS_FILENAME,
             polys_db_dir=POLYGONS_DB_DIR):
        '''Load a previously saved index from directory d.'''
        index = cls.load_index(d, index_name=index_name or cls.INDEX_FILENAME)
        if not cls.persistent_polygons:
            polys = cls.load_polygons(os.path.join(d, polys_filename))
        else:
            polys = None
        polygons_db = LevelDB(os.path.join(d, polys_db_dir))
        polygon_index = cls(index=index, polygons=polys, polygons_db=polygons_db, save_dir=d)
        polygon_index.load_properties(os.path.join(d, properties_filename))
        polygon_index.load_polygon_properties(d)
        return polygon_index

    def get_candidate_polygons(self, lat, lon):
        '''Return ids of polygons whose indexed extent may contain the point.'''
        raise NotImplementedError('Children must implement')

    def get_properties(self, i):
        '''Return the stored properties dict for polygon id i.'''
        return json.loads(self.polygons_db.Get(self.properties_key(i)))

    def get_polygon(self, i):
        '''Return the prepared polygon for id i from memory.'''
        return self.polygons[i]

    def get_polygon_cached(self, i):
        '''Return the prepared polygon for id i, consulting the LRU cache
        first and falling back to LevelDB on a miss.'''
        poly = self.polygons.get(i, None)
        if poly is None:
            data = json.loads(self.polygons_db.Get(self.polygon_key(i)))
            poly = prep(self.polygon_from_geojson(data))
            self.polygons[i] = poly
            self.cache_misses += 1
        else:
            self.cache_hits += 1
        return poly

    def __iter__(self):
        '''Yield (properties, prepared_polygon) pairs in insertion order.'''
        for i in six.moves.range(self.i):
            yield self.get_properties(i), self.get_polygon(i)

    def __len__(self):
        return self.i

    def polygons_contain(self, candidates, point, return_all=False):
        '''Test candidate polygon ids for containment of point.

        Returns the first matching polygon's properties (or None if no
        match), or, when return_all is True, a list of all matching
        polygons' properties ([] if none).
        '''
        containing = None
        if return_all:
            containing = []
        for i in candidates:
            poly = self.get_polygon(i)
            contains = poly.contains(point)
            if contains:
                properties = self.get_properties(i)
                if not return_all:
                    return properties
                else:
                    containing.append(properties)
        return containing

    def polygon_key(self, i):
        '''LevelDB key for the geometry of polygon id i.'''
        return 'poly:{}'.format(i)

    def properties_key(self, i):
        '''LevelDB key for the properties of polygon id i.'''
        return 'props:{}'.format(i)

    def point_in_poly(self, lat, lon, return_all=False):
        '''Point-in-polygon query: properties of the polygon(s) containing
        (lat, lon). Note the argument order is (lat, lon) while shapely
        points are constructed as (x=lon, y=lat).'''
        candidates = self.get_candidate_polygons(lat, lon)
        point = Point(lon, lat)
        return self.polygons_contain(candidates, point, return_all=return_all)
class RTreePolygonIndex(PolygonIndex):
    '''Polygon index backed by a file-based R-tree of bounding boxes.'''

    INDEX_FILENAME = 'rtree'

    def create_index(self, overwrite=False):
        # rtree persists the index to files at self.index_path
        self.index = rtree.index.Index(self.index_path, overwrite=overwrite)

    def index_polygon(self, polygon):
        # insert the polygon's bounding box under the current polygon id
        self.index.insert(self.i, polygon.bounds)

    def get_candidate_polygons(self, lat, lon):
        # query with a degenerate (point) bounding box, then de-duplicate
        # the hits while preserving their order
        hits = self.index.intersection((lon, lat, lon, lat))
        return OrderedDict.fromkeys(hits).keys()

    def save_index(self):
        # closing flushes the rtree to disk; it must be closed before
        # it can be loaded again
        self.index.close()

    @classmethod
    def load_index(cls, d, index_name=None):
        filename = index_name or cls.INDEX_FILENAME
        return rtree.index.Index(os.path.join(d, filename))
class GeohashPolygonIndex(PolygonIndex):
    '''Polygon index using geohash prefixes as the spatial index. Each
    polygon is indexed under the geohash cell (and its 8 neighbors) whose
    precision roughly covers the polygon's bounding box.'''

    # (precision, approximate cell area in square meters) pairs, finest first
    # Reference: https://www.elastic.co/guide/en/elasticsearch/guide/current/geohashes.html
    GEOHASH_PRECISIONS = [
        (7, 152.8 ** 2),
        (6, 1200.0 * 610.0),
        (5, 4900.0 * 4900.0),
        (4, 39000.0 * 19500.0),
        (3, 156000.0 * 156000.0),
        (2, 1251000.0 * 625000.0),
    ]

    INDEX_FILENAME = 'index.json'

    def create_index(self, overwrite=False):
        # geohash prefix => list of polygon ids
        self.index = defaultdict(list)

    def index_point(self, lat, lon, geohash_level):
        '''Index polygon id self.i under the geohash cell containing
        (lat, lon) at the given precision, plus its 8 neighbors.'''
        code = geohash.encode(lat, lon)[:geohash_level]
        for key in [code] + geohash.neighbors(code):
            self.index[key].append(self.i)

    def index_polygon(self, polygon):
        '''Index a polygon at the centroid of its bounding box, using the
        finest geohash precision whose 3x3 cell neighborhood covers the
        polygon's bounding box area.'''
        poly_area = polygon_bounding_box_area(polygon)
        for geohash_level, area in self.GEOHASH_PRECISIONS:
            # 9.0 = the cell itself plus its 8 neighbors
            if area * 9.0 >= poly_area:
                break
        # if the polygon exceeds even the coarsest level, the loop falls
        # through and the coarsest precision is used
        bbox = polygon.bounds
        lat, lon = (bbox[1] + bbox[3]) / 2.0, (bbox[0] + bbox[2]) / 2.0
        self.index_point(lat, lon, geohash_level)

    def get_candidate_polygons(self, lat, lon, return_all=False):
        '''Return polygon ids indexed near (lat, lon), checking geohash
        prefixes from finest to coarsest precision. Stops at the first
        non-empty level unless return_all is True.'''
        candidates = []
        code = geohash.encode(lat, lon)
        for level, area in self.GEOHASH_PRECISIONS:
            indices = self.index.get(code[:level])
            if not indices:
                continue
            candidates.extend(indices)
            if not return_all:
                break
        return candidates

    def save_index(self):
        '''Write the geohash index to self.index_path as JSON.'''
        if not self.index_path:
            self.index_path = os.path.join(self.save_dir or '.', self.INDEX_FILENAME)
        # Bug fix: use a context manager so the file handle is flushed
        # and closed instead of leaked
        with open(self.index_path, 'w') as f:
            json.dump(self.index, f)

    @classmethod
    def load_index(cls, d, index_name=None):
        '''Load the geohash index (a plain dict) from directory d.'''
        # Bug fix: close the file handle when done
        with open(os.path.join(d, index_name or cls.INDEX_FILENAME)) as f:
            return json.load(f)