[osm] Storing polygon properties in a LevelDB, polygons themselves stay in memory

This commit is contained in:
Al
2016-04-04 21:30:11 -04:00
parent 8db7f139ba
commit 4e17ef6f91
3 changed files with 139 additions and 43 deletions

View File

@@ -4,6 +4,8 @@ import os
import rtree import rtree
import ujson as json import ujson as json
from leveldb import LevelDB
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from shapely.geometry import Point, Polygon, MultiPolygon from shapely.geometry import Point, Polygon, MultiPolygon
from shapely.prepared import prep from shapely.prepared import prep
@@ -11,28 +13,30 @@ from shapely.geometry.geo import mapping
from geodata.polygons.area import polygon_bounding_box_area from geodata.polygons.area import polygon_bounding_box_area
DEFAULT_POLYS_FILENAME = 'polygons.geojson'
class PolygonIndex(object): class PolygonIndex(object):
include_only_properties = None include_only_properties = None
simplify_tolerance = 0.0001 simplify_tolerance = 0.00001
preserve_topology = True preserve_topology = True
large_polygon_threshold = 10000
INDEX_FILENAME = None INDEX_FILENAME = None
POLYGONS_DB_DIR = 'polygons'
DEFAULT_POLYS_FILENAME = 'polygons.geojson'
def __init__(self, index=None, polygons=None, save_dir=None, def __init__(self, index=None, polygons=None, polygons_db=None, save_dir=None,
index_filename=None, index_filename=None,
polygons_db_path=None,
include_only_properties=None): include_only_properties=None):
if save_dir: if save_dir:
self.save_dir = save_dir self.save_dir = save_dir
else: else:
self.save_dir = None self.save_dir = None
if index_filename: if not index_filename:
self.index_path = os.path.join(save_dir or '.', index_filename) index_filename = self.INDEX_FILENAME
else:
self.index_path = os.path.join(save_dir or '.', self.INDEX_FILENAME) self.index_path = os.path.join(save_dir or '.', index_filename)
if not index: if not index:
self.create_index(overwrite=True) self.create_index(overwrite=True)
@@ -47,6 +51,16 @@ class PolygonIndex(object):
else: else:
self.polygons = polygons self.polygons = polygons
if not polygons_db_path:
polygons_db_path = os.path.join(save_dir or '.', self.POLYGONS_DB_DIR)
if not polygons_db:
self.polygons_db = LevelDB(polygons_db_path)
else:
self.polygons_db = polygons_db
self.setup()
self.i = 0 self.i = 0
def create_index(self, overwrite=False): def create_index(self, overwrite=False):
@@ -55,6 +69,9 @@ class PolygonIndex(object):
def index_polygon(self, polygon): def index_polygon(self, polygon):
raise NotImplementedError('Children must implement') raise NotImplementedError('Children must implement')
def setup(self):
pass
def simplify_polygon(self, poly, simplify_tolerance=None, preserve_topology=None): def simplify_polygon(self, poly, simplify_tolerance=None, preserve_topology=None):
if simplify_tolerance is None: if simplify_tolerance is None:
simplify_tolerance = self.simplify_tolerance simplify_tolerance = self.simplify_tolerance
@@ -62,16 +79,21 @@ class PolygonIndex(object):
preserve_topology = self.preserve_topology preserve_topology = self.preserve_topology
return poly.simplify(simplify_tolerance, preserve_topology=preserve_topology) return poly.simplify(simplify_tolerance, preserve_topology=preserve_topology)
def index_polygon_properties(self, properties):
pass
def add_polygon(self, poly, properties, include_only_properties=None): def add_polygon(self, poly, properties, include_only_properties=None):
if include_only_properties is not None: if include_only_properties is not None:
properties = {k: v for k, v in properties.iteritems() if k in include_only_properties} properties = {k: v for k, v in properties.iteritems() if k in include_only_properties}
self.polygons.append((properties, prep(poly)))
self.polygons.append(prep(poly))
self.polygons_db.Put(str(self.i), json.dumps(properties))
self.index_polygon_properties(properties)
self.i += 1 self.i += 1
@classmethod @classmethod
def create_from_shapefiles(cls, inputs, output_dir, def create_from_shapefiles(cls, inputs, output_dir,
index_filename=None, index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME,
include_only_properties=None): include_only_properties=None):
index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME) index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
for input_file in inputs: for input_file in inputs:
@@ -161,20 +183,27 @@ class PolygonIndex(object):
def save(self, polys_filename=DEFAULT_POLYS_FILENAME): def save(self, polys_filename=DEFAULT_POLYS_FILENAME):
self.save_polygons(os.path.join(self.save_dir, polys_filename)) self.save_polygons(os.path.join(self.save_dir, polys_filename))
self.save_index() self.save_index()
self.save_polygon_properties(self.save_dir)
def save_polygons(self, out_filename): def save_polygons(self, out_filename):
out = open(out_filename, 'w') out = open(out_filename, 'w')
for props, poly in self.polygons: for poly in self.polygons:
feature = { feature = {
'type': 'Feature', 'type': 'Feature',
'geometry': mapping(poly.context), 'geometry': mapping(poly.context),
'properties': props
} }
out.write(json.dumps(feature) + u'\n') out.write(json.dumps(feature) + u'\n')
self.polygons_db.CompactRange('\x00', '\xff')
def save_index(self): def save_index(self):
raise NotImplementedError('Children must implement') raise NotImplementedError('Children must implement')
def load_polygon_properties(self, d):
pass
def save_polygon_properties(self, d):
pass
@classmethod @classmethod
def load_polygons(cls, filename): def load_polygons(cls, filename):
f = open(filename) f = open(filename)
@@ -185,14 +214,14 @@ class PolygonIndex(object):
if poly_type == 'Polygon': if poly_type == 'Polygon':
poly = Polygon(feature['geometry']['coordinates'][0]) poly = Polygon(feature['geometry']['coordinates'][0])
polygons.append((feature['properties'], prep(poly))) polygons.append(prep(poly))
elif poly_type == 'MultiPolygon': elif poly_type == 'MultiPolygon':
polys = [] polys = []
for coords in feature['geometry']['coordinates']: for coords in feature['geometry']['coordinates']:
poly = Polygon(coords[0]) poly = Polygon(coords[0])
polys.append(poly) polys.append(poly)
polygons.append((feature['properties'], prep(MultiPolygon(polys)))) polygons.append(prep(MultiPolygon(polys)))
return polygons return polygons
@classmethod @classmethod
@@ -200,10 +229,13 @@ class PolygonIndex(object):
raise NotImplementedError('Children must implement') raise NotImplementedError('Children must implement')
@classmethod @classmethod
def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME): def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME, polys_db_dir=POLYGONS_DB_DIR):
index = cls.load_index(d, index_name=index_name or cls.INDEX_FILENAME) index = cls.load_index(d, index_name=index_name or cls.INDEX_FILENAME)
polys = cls.load_polygons(os.path.join(d, polys_filename)) polys = cls.load_polygons(os.path.join(d, polys_filename))
return cls(index=index, polygons=polys, save_dir=d) polygons_db = LevelDB(os.path.join(d, polys_db_dir))
polygon_index = cls(index=index, polygons=polys, polygons_db=polygons_db, save_dir=d)
polygon_index.load_polygon_properties(d)
return polygon_index
def get_candidate_polygons(self, lat, lon): def get_candidate_polygons(self, lat, lon):
raise NotImplementedError('Children must implement') raise NotImplementedError('Children must implement')
@@ -215,12 +247,15 @@ class PolygonIndex(object):
if return_all: if return_all:
containing = [] containing = []
for i in polys: for i in polys:
props, poly = self.polygons[i] poly = self.polygons[i]
contains = poly.contains(pt) contains = poly.contains(pt)
if contains and not return_all: if contains and not return_all:
return props try:
return json.loads(self.polygons_db.Get(str(i)))
except KeyError:
return None
elif contains: elif contains:
containing.append(props) containing.append(json.loads(self.polygons_db.Get(str(i))))
return containing return containing

View File

@@ -1,6 +1,7 @@
import argparse import argparse
import os import os
import sys import sys
import ujson as json
this_dir = os.path.realpath(os.path.dirname(__file__)) this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
@@ -13,6 +14,8 @@ regional_language_dir = os.path.join(LANGUAGES_DIR, 'regional')
class LanguagePolygonIndex(RTreePolygonIndex): class LanguagePolygonIndex(RTreePolygonIndex):
DEFAULT_POLYS_FILENAME = 'polygons.geojson'
ADMIN_LEVELS_FILENAME = 'admin_levels.json'
include_only_properties = set([ include_only_properties = set([
'qs_a0', 'qs_a0',
@@ -136,9 +139,20 @@ class LanguagePolygonIndex(RTreePolygonIndex):
output_dir, index_filename=index_filename, output_dir, index_filename=index_filename,
polys_filename=polys_filename) polys_filename=polys_filename)
def setup(self):
self.admin_levels = []
def index_polygon_properties(self, properties):
self.admin_levels.append(properties['admin_level'])
def load_polygon_properties(self, d):
self.admin_levels = json.load(open(os.path.join(d, self.ADMIN_LEVELS_FILENAME)))
def save_polygon_properties(self, d):
json.dump(self.admin_levels, open(os.path.join(d, self.ADMIN_LEVELS_FILENAME), 'w'))
def admin_level(self, i): def admin_level(self, i):
props, p = self.polygons[i] return self.admin_levels[i]
return props['admin_level']
def get_candidate_polygons(self, lat, lon): def get_candidate_polygons(self, lat, lon):
candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys() candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys()

View File

@@ -35,7 +35,7 @@ from geodata.i18n.unicode_properties import get_chars_by_script
from geodata.i18n.word_breaks import ideographic_scripts from geodata.i18n.word_breaks import ideographic_scripts
from geodata.names.deduping import NameDeduper from geodata.names.deduping import NameDeduper
from geodata.osm.extract import parse_osm, OSM_NAME_TAGS from geodata.osm.extract import parse_osm, OSM_NAME_TAGS
from geodata.osm.osm_admin_boundaries import OSMAdminPolygonReader, OSMZonePolygonReader from geodata.osm.osm_admin_boundaries import OSMAdminPolygonReader, OSMSubdivisionPolygonReader
from geodata.polygons.index import * from geodata.polygons.index import *
from geodata.statistics.tf_idf import IDFIndex from geodata.statistics.tf_idf import IDFIndex
@@ -161,6 +161,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
''' '''
NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods' NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods'
PRIORITIES_FILENAME = 'priorities.json'
SCRATCH_DIR = '/tmp' SCRATCH_DIR = '/tmp'
DUPE_THRESHOLD = 0.9 DUPE_THRESHOLD = 0.9
@@ -386,9 +388,20 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return index return index
def setup(self):
self.priorities = []
def index_polygon_properties(self, properties):
self.priorities.append((props['polygon_type'], self.source_priorities[props['source']]))
def load_polygon_properties(self, d):
self.priorities = json.load(open(os.path.join(d, self.PRIORITIES_FILENAME)))
def save_polygon_properties(self, d):
json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w'))
def priority(self, i): def priority(self, i):
props, p = self.polygons[i] return self.priorities[i]
return (self.level_priorities[props['polygon_type']], self.source_priorities[props['source']])
def get_candidate_polygons(self, lat, lon): def get_candidate_polygons(self, lat, lon):
candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon) candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon)
@@ -419,6 +432,8 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex):
LOCALITY = 'locality' LOCALITY = 'locality'
NEIGHBORHOOD = 'neighborhood' NEIGHBORHOOD = 'neighborhood'
PRIORITIES_FILENAME = 'priorities.json'
sorted_levels = (COUNTRY, sorted_levels = (COUNTRY,
ADMIN1_REGION, ADMIN1_REGION,
ADMIN1, ADMIN1,
@@ -578,9 +593,20 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex):
output_dir, index_filename=index_filename, output_dir, index_filename=index_filename,
polys_filename=polys_filename) polys_filename=polys_filename)
def setup(self):
self.priorities = []
def index_polygon_properties(self, properties):
self.priorities.append(self.sort_levels.get(properties[self.LEVEL], 0))
def load_polygon_properties(self, d):
self.priorities = json.load(open(os.path.join(d, self.PRIORITIES_FILENAME)))
def save_polygon_properties(self, d):
json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w'))
def sort_level(self, i): def sort_level(self, i):
props, p = self.polygons[i] return self.priorities[i]
return self.sort_levels.get(props[self.LEVEL], 0)
def get_candidate_polygons(self, lat, lon): def get_candidate_polygons(self, lat, lon):
candidates = super(QuattroshapesReverseGeocoder, self).get_candidate_polygons(lat, lon) candidates = super(QuattroshapesReverseGeocoder, self).get_candidate_polygons(lat, lon)
@@ -618,6 +644,8 @@ class OSMReverseGeocoder(RTreePolygonIndex):
ADMIN_LEVEL = 'admin_level' ADMIN_LEVEL = 'admin_level'
ADMIN_LEVELS_FILENAME = 'admin_levels.json'
polygon_reader = OSMAdminPolygonReader polygon_reader = OSMAdminPolygonReader
include_property_patterns = set([ include_property_patterns = set([
@@ -713,16 +741,11 @@ class OSMReverseGeocoder(RTreePolygonIndex):
# Figure out which outer polygon contains each inner polygon # Figure out which outer polygon contains each inner polygon
interior = [p2 for p2 in inner if poly.contains(p2)] interior = [p2 for p2 in inner if poly.contains(p2)]
except TopologicalError: except TopologicalError:
poly = cls.fix_polygon(poly) continue
if poly is None or not poly.bounds or len(poly.bounds) != 4:
continue
if poly.is_valid:
interior = [p2 for p2 in inner if poly.contains(p2)]
if interior: if interior:
# Polygon with holes constructor # Polygon with holes constructor
poly = Polygon(p, [zip(*p2.exterior.coords.xy) for p2 in interior]) poly = Polygon(p, [zip(*p2.exterior.coords.xy) for p2 in interior])
poly = cls.fix_polygon(poly)
if poly is None or not poly.bounds or len(poly.bounds) != 4: if poly is None or not poly.bounds or len(poly.bounds) != 4:
continue continue
# R-tree only stores the bounding box, so add the whole polygon # R-tree only stores the bounding box, so add the whole polygon
@@ -740,28 +763,47 @@ class OSMReverseGeocoder(RTreePolygonIndex):
poly = multi[0] poly = multi[0]
else: else:
continue continue
poly = index.simplify_polygon(poly) if len(poly.exterior.xy[0]) >= cls.large_polygon_threshold:
poly = index.simplify_polygon(poly)
index.add_polygon(poly, props) index.add_polygon(poly, props)
return index return index
def sort_level(self, i): def setup(self):
props, p = self.polygons[i] self.admin_levels = []
admin_level = props.get(self.ADMIN_LEVEL, 0)
def index_polygon_properties(self, properties):
admin_level = properties.get(self.ADMIN_LEVEL, 0)
try: try:
return int(admin_level) admin_level = int(admin_level)
except ValueError: except ValueError:
return 0 admin_level = 0
self.admin_levels.append(admin_level)
def load_polygon_properties(self, d):
self.admin_levels = json.load(open(os.path.join(d, self.ADMIN_LEVELS_FILENAME)))
def save_polygon_properties(self, d):
json.dump(self.admin_levels, open(os.path.join(d, self.ADMIN_LEVELS_FILENAME), 'w'))
def sort_level(self, i):
return self.admin_level[i]
def get_candidate_polygons(self, lat, lon): def get_candidate_polygons(self, lat, lon):
candidates = super(OSMReverseGeocoder, self).get_candidate_polygons(lat, lon) candidates = super(OSMReverseGeocoder, self).get_candidate_polygons(lat, lon)
return sorted(candidates, key=self.sort_level, reverse=True) return sorted(candidates, key=self.sort_level, reverse=True)
class OSMZoneReverseGeocoder(OSMReverseGeocoder): class OSMSubdivisionReverseGeocoder(OSMReverseGeocoder):
polygon_reader = OSMZonePolygonReader polygon_reader = OSMSubdivisionPolygonReader
include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['landuse']) include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['landuse'])
class OSMBuildingReverseGeocoder(OSMReverseGeocoder):
polygon_reader = OSMBuildingPolygonReader
include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['building', 'building:levels'])
if __name__ == '__main__': if __name__ == '__main__':
# Handle argument parsing here # Handle argument parsing here
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
@@ -772,8 +814,11 @@ if __name__ == '__main__':
parser.add_argument('-a', '--osm-admin-file', parser.add_argument('-a', '--osm-admin-file',
help='Path to OSM borders file (with dependencies, .osm format)') help='Path to OSM borders file (with dependencies, .osm format)')
parser.add_argument('-l', '--osm-landuse-file', parser.add_argument('-s', '--osm-subdivisions-file',
help='Path to OSM landuse file (with dependencies, .osm format)') help='Path to OSM subdivisions file (with dependencies, .osm format)')
parser.add_argument('-b', '--osm-building-polygons-file',
help='Path to OSM building polygons file (with dependencies, .osm format)')
parser.add_argument('-n', '--osm-neighborhoods-file', parser.add_argument('-n', '--osm-neighborhoods-file',
help='Path to OSM neighborhoods file (no dependencies, .osm format)') help='Path to OSM neighborhoods file (no dependencies, .osm format)')
@@ -788,7 +833,9 @@ if __name__ == '__main__':
if args.osm_admin_file: if args.osm_admin_file:
index = OSMReverseGeocoder.create_from_osm_file(args.osm_admin_file, args.out_dir) index = OSMReverseGeocoder.create_from_osm_file(args.osm_admin_file, args.out_dir)
elif args.osm_landuse_file: elif args.osm_landuse_file:
index = OSMZoneReverseGeocoder.create_from_osm_file(args.osm_landuse_file, args.out_dir) index = OSMSubdivisionReverseGeocoder.create_from_osm_file(args.osm_landuse_file, args.out_dir)
elif args.osm_building_polygons_file:
index = OSMBuildingReverseGeocoder.create_from_osm_file(args.osm_building_polygons_file, args.out_dir)
elif args.osm_neighborhoods_file and args.quattroshapes_dir: elif args.osm_neighborhoods_file and args.quattroshapes_dir:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
args.osm_neighborhoods_file, args.osm_neighborhoods_file,