diff --git a/scripts/geodata/polygons/index.py b/scripts/geodata/polygons/index.py index 2cfe0f71..d1a5094e 100644 --- a/scripts/geodata/polygons/index.py +++ b/scripts/geodata/polygons/index.py @@ -4,6 +4,8 @@ import os import rtree import ujson as json +from leveldb import LevelDB + from collections import OrderedDict, defaultdict from shapely.geometry import Point, Polygon, MultiPolygon from shapely.prepared import prep @@ -11,28 +13,30 @@ from shapely.geometry.geo import mapping from geodata.polygons.area import polygon_bounding_box_area -DEFAULT_POLYS_FILENAME = 'polygons.geojson' - class PolygonIndex(object): include_only_properties = None - simplify_tolerance = 0.0001 + simplify_tolerance = 0.00001 preserve_topology = True + large_polygon_threshold = 10000 INDEX_FILENAME = None + POLYGONS_DB_DIR = 'polygons' + DEFAULT_POLYS_FILENAME = 'polygons.geojson' - def __init__(self, index=None, polygons=None, save_dir=None, + def __init__(self, index=None, polygons=None, polygons_db=None, save_dir=None, index_filename=None, + polygons_db_path=None, include_only_properties=None): if save_dir: self.save_dir = save_dir else: self.save_dir = None - if index_filename: - self.index_path = os.path.join(save_dir or '.', index_filename) - else: - self.index_path = os.path.join(save_dir or '.', self.INDEX_FILENAME) + if not index_filename: + index_filename = self.INDEX_FILENAME + + self.index_path = os.path.join(save_dir or '.', index_filename) if not index: self.create_index(overwrite=True) @@ -47,6 +51,16 @@ class PolygonIndex(object): else: self.polygons = polygons + if not polygons_db_path: + polygons_db_path = os.path.join(save_dir or '.', self.POLYGONS_DB_DIR) + + if not polygons_db: + self.polygons_db = LevelDB(polygons_db_path) + else: + self.polygons_db = polygons_db + + self.setup() + self.i = 0 def create_index(self, overwrite=False): @@ -55,6 +69,9 @@ class PolygonIndex(object): def index_polygon(self, polygon): raise NotImplementedError('Children must implement') + def setup(self): + pass + def simplify_polygon(self, poly, simplify_tolerance=None, preserve_topology=None): if simplify_tolerance is None: simplify_tolerance = self.simplify_tolerance @@ -62,16 +79,21 @@ class PolygonIndex(object): preserve_topology = self.preserve_topology return poly.simplify(simplify_tolerance, preserve_topology=preserve_topology) + def index_polygon_properties(self, properties): + pass + def add_polygon(self, poly, properties, include_only_properties=None): if include_only_properties is not None: properties = {k: v for k, v in properties.iteritems() if k in include_only_properties} - self.polygons.append((properties, prep(poly))) + + self.polygons.append(prep(poly)) + self.polygons_db.Put(str(self.i), json.dumps(properties)) + self.index_polygon_properties(properties) self.i += 1 @classmethod def create_from_shapefiles(cls, inputs, output_dir, index_filename=None, - polys_filename=DEFAULT_POLYS_FILENAME, include_only_properties=None): index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME) for input_file in inputs: @@ -161,20 +183,27 @@ class PolygonIndex(object): def save(self, polys_filename=DEFAULT_POLYS_FILENAME): self.save_polygons(os.path.join(self.save_dir, polys_filename)) self.save_index() + self.save_polygon_properties(self.save_dir) def save_polygons(self, out_filename): out = open(out_filename, 'w') - for props, poly in self.polygons: + for poly in self.polygons: feature = { 'type': 'Feature', 'geometry': mapping(poly.context), - 'properties': props } out.write(json.dumps(feature) + u'\n') + self.polygons_db.CompactRange('\x00', '\xff') def save_index(self): raise NotImplementedError('Children must implement') + def load_polygon_properties(self, d): + pass + + def save_polygon_properties(self, d): + pass + @classmethod def load_polygons(cls, filename): f = open(filename) @@ -185,14 +214,14 @@ class PolygonIndex(object): if poly_type == 'Polygon': poly = Polygon(feature['geometry']['coordinates'][0]) - polygons.append((feature['properties'], prep(poly))) + polygons.append(prep(poly)) elif poly_type == 'MultiPolygon': polys = [] for coords in feature['geometry']['coordinates']: poly = Polygon(coords[0]) polys.append(poly) - polygons.append((feature['properties'], prep(MultiPolygon(polys)))) + polygons.append(prep(MultiPolygon(polys))) return polygons @classmethod @@ -200,10 +229,13 @@ class PolygonIndex(object): raise NotImplementedError('Children must implement') @classmethod - def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME): + def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME, polys_db_dir=POLYGONS_DB_DIR): index = cls.load_index(d, index_name=index_name or cls.INDEX_FILENAME) polys = cls.load_polygons(os.path.join(d, polys_filename)) - return cls(index=index, polygons=polys, save_dir=d) + polygons_db = LevelDB(os.path.join(d, polys_db_dir)) + polygon_index = cls(index=index, polygons=polys, polygons_db=polygons_db, save_dir=d) + polygon_index.load_polygon_properties(d) + return polygon_index def get_candidate_polygons(self, lat, lon): raise NotImplementedError('Children must implement') @@ -215,12 +247,15 @@ class PolygonIndex(object): if return_all: containing = [] for i in polys: - props, poly = self.polygons[i] + poly = self.polygons[i] contains = poly.contains(pt) if contains and not return_all: - return props + try: + return json.loads(self.polygons_db.Get(str(i))) + except KeyError: + return None elif contains: - containing.append(props) + containing.append(json.loads(self.polygons_db.Get(str(i)))) return containing diff --git a/scripts/geodata/polygons/language_polys.py b/scripts/geodata/polygons/language_polys.py index da76e435..8d73d441 100644 --- a/scripts/geodata/polygons/language_polys.py +++ b/scripts/geodata/polygons/language_polys.py @@ -1,6 +1,7 @@ import argparse import os import sys +import ujson as json this_dir = os.path.realpath(os.path.dirname(__file__)) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) @@ -13,6 +14,8 @@ regional_language_dir = os.path.join(LANGUAGES_DIR, 'regional') class LanguagePolygonIndex(RTreePolygonIndex): + DEFAULT_POLYS_FILENAME = 'polygons.geojson' + ADMIN_LEVELS_FILENAME = 'admin_levels.json' include_only_properties = set([ 'qs_a0', @@ -136,9 +139,20 @@ class LanguagePolygonIndex(RTreePolygonIndex): output_dir, index_filename=index_filename, polys_filename=polys_filename) + def setup(self): + self.admin_levels = [] + + def index_polygon_properties(self, properties): + self.admin_levels.append(properties['admin_level']) + + def load_polygon_properties(self, d): + self.admin_levels = json.load(open(os.path.join(d, self.ADMIN_LEVELS_FILENAME))) + + def save_polygon_properties(self, d): + json.dump(self.admin_levels, open(os.path.join(d, self.ADMIN_LEVELS_FILENAME), 'w')) + def admin_level(self, i): - props, p = self.polygons[i] - return props['admin_level'] + return self.admin_levels[i] def get_candidate_polygons(self, lat, lon): candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys() diff --git a/scripts/geodata/polygons/reverse_geocode.py b/scripts/geodata/polygons/reverse_geocode.py index f58419e7..77089841 100644 --- a/scripts/geodata/polygons/reverse_geocode.py +++ b/scripts/geodata/polygons/reverse_geocode.py @@ -35,7 +35,7 @@ from geodata.i18n.unicode_properties import get_chars_by_script from geodata.i18n.word_breaks import ideographic_scripts from geodata.names.deduping import NameDeduper from geodata.osm.extract import parse_osm, OSM_NAME_TAGS -from geodata.osm.osm_admin_boundaries import OSMAdminPolygonReader, OSMZonePolygonReader +from geodata.osm.osm_admin_boundaries import OSMAdminPolygonReader, OSMSubdivisionPolygonReader from geodata.polygons.index import * from geodata.statistics.tf_idf import IDFIndex @@ -161,6 +161,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): ''' NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods' + PRIORITIES_FILENAME = 'priorities.json' + SCRATCH_DIR = '/tmp' DUPE_THRESHOLD = 0.9 @@ -386,9 +388,20 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): return index + def setup(self): + self.priorities = [] + + def index_polygon_properties(self, properties): + self.priorities.append((props['polygon_type'], self.source_priorities[props['source']])) + + def load_polygon_properties(self, d): + self.priorities = json.load(open(os.path.join(d, self.PRIORITIES_FILENAME))) + + def save_polygon_properties(self, d): + json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w')) + def priority(self, i): - props, p = self.polygons[i] - return (self.level_priorities[props['polygon_type']], self.source_priorities[props['source']]) + return self.priorities[i] def get_candidate_polygons(self, lat, lon): candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon) @@ -419,6 +432,8 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex): LOCALITY = 'locality' NEIGHBORHOOD = 'neighborhood' + PRIORITIES_FILENAME = 'priorities.json' + sorted_levels = (COUNTRY, ADMIN1_REGION, ADMIN1, @@ -578,9 +593,20 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex): output_dir, index_filename=index_filename, polys_filename=polys_filename) + def setup(self): + self.priorities = [] + + def index_polygon_properties(self, properties): + self.priorities.append(self.sort_levels.get(properties[self.LEVEL], 0)) + + def load_polygon_properties(self, d): + self.priorities = json.load(open(os.path.join(d, self.PRIORITIES_FILENAME))) + + def save_polygon_properties(self, d): + json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w')) + def sort_level(self, i): - props, p = self.polygons[i] - return self.sort_levels.get(props[self.LEVEL], 0) + return self.priorities[i] def get_candidate_polygons(self, lat, lon): candidates = super(QuattroshapesReverseGeocoder, self).get_candidate_polygons(lat, lon) @@ -618,6 +644,8 @@ class OSMReverseGeocoder(RTreePolygonIndex): ADMIN_LEVEL = 'admin_level' + ADMIN_LEVELS_FILENAME = 'admin_levels.json' + polygon_reader = OSMAdminPolygonReader include_property_patterns = set([ @@ -713,16 +741,11 @@ class OSMReverseGeocoder(RTreePolygonIndex): # Figure out which outer polygon contains each inner polygon interior = [p2 for p2 in inner if poly.contains(p2)] except TopologicalError: - poly = cls.fix_polygon(poly) - if poly is None or not poly.bounds or len(poly.bounds) != 4: - continue - if poly.is_valid: - interior = [p2 for p2 in inner if poly.contains(p2)] + continue if interior: # Polygon with holes constructor poly = Polygon(p, [zip(*p2.exterior.coords.xy) for p2 in interior]) - poly = cls.fix_polygon(poly) if poly is None or not poly.bounds or len(poly.bounds) != 4: continue # R-tree only stores the bounding box, so add the whole polygon @@ -740,28 +763,47 @@ class OSMReverseGeocoder(RTreePolygonIndex): poly = multi[0] else: continue - poly = index.simplify_polygon(poly) + if len(poly.exterior.xy[0]) >= cls.large_polygon_threshold: + poly = index.simplify_polygon(poly) index.add_polygon(poly, props) return index - def sort_level(self, i): - props, p = self.polygons[i] - admin_level = props.get(self.ADMIN_LEVEL, 0) + def setup(self): + self.admin_levels = [] + + def index_polygon_properties(self, properties): + admin_level = properties.get(self.ADMIN_LEVEL, 0) try: - return int(admin_level) + admin_level = int(admin_level) except ValueError: - return 0 + admin_level = 0 + self.admin_levels.append(admin_level) + + def load_polygon_properties(self, d): + self.admin_levels = json.load(open(os.path.join(d, self.ADMIN_LEVELS_FILENAME))) + + def save_polygon_properties(self, d): + json.dump(self.admin_levels, open(os.path.join(d, self.ADMIN_LEVELS_FILENAME), 'w')) + + def sort_level(self, i): + return self.admin_level[i] def get_candidate_polygons(self, lat, lon): candidates = super(OSMReverseGeocoder, self).get_candidate_polygons(lat, lon) return sorted(candidates, key=self.sort_level, reverse=True) -class OSMZoneReverseGeocoder(OSMReverseGeocoder): - polygon_reader = OSMZonePolygonReader +class OSMSubdivisionReverseGeocoder(OSMReverseGeocoder): + polygon_reader = OSMSubdivisionPolygonReader include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['landuse']) + +class OSMBuildingReverseGeocoder(OSMReverseGeocoder): + polygon_reader = OSMBuildingPolygonReader + include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['building', 'building:levels']) + + if __name__ == '__main__': # Handle argument parsing here parser = argparse.ArgumentParser() @@ -772,8 +814,11 @@ if __name__ == '__main__': parser.add_argument('-a', '--osm-admin-file', help='Path to OSM borders file (with dependencies, .osm format)') - parser.add_argument('-l', '--osm-landuse-file', - help='Path to OSM landuse file (with dependencies, .osm format)') + parser.add_argument('-s', '--osm-subdivisions-file', + help='Path to OSM subdivisions file (with dependencies, .osm format)') + + parser.add_argument('-b', '--osm-building-polygons-file', + help='Path to OSM building polygons file (with dependencies, .osm format)') parser.add_argument('-n', '--osm-neighborhoods-file', help='Path to OSM neighborhoods file (no dependencies, .osm format)') @@ -788,7 +833,9 @@ if __name__ == '__main__': if args.osm_admin_file: index = OSMReverseGeocoder.create_from_osm_file(args.osm_admin_file, args.out_dir) elif args.osm_landuse_file: - index = OSMZoneReverseGeocoder.create_from_osm_file(args.osm_landuse_file, args.out_dir) + index = OSMSubdivisionReverseGeocoder.create_from_osm_file(args.osm_landuse_file, args.out_dir) + elif args.osm_building_polygons_file: + index = OSMBuildingReverseGeocoder.create_from_osm_file(args.osm_building_polygons_file, args.out_dir) elif args.osm_neighborhoods_file and args.quattroshapes_dir: index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( args.osm_neighborhoods_file,