diff --git a/resources/neighborhoods/click_that_hood.yaml b/resources/neighborhoods/click_that_hood.yaml new file mode 100644 index 00000000..ccb66436 --- /dev/null +++ b/resources/neighborhoods/click_that_hood.yaml @@ -0,0 +1,264 @@ + +files: + - filename: akron.geojson + component: suburb + - filename: alameda.geojson + component: suburb + - filename: albany.geojson + component: suburb + - filename: amsterdam.geojson + component: suburb + - filename: anchorage.geojson + component: suburb + - filename: angers.geojson + component: suburb + - filename: antwerp.geojson + component: suburb + - filename: atlanta.geojson + component: suburb + - filename: augsburg.geojson + component: city_district + - filename: austin.geojson + component: suburb + - filename: baltimore.geojson + component: suburb + - filename: bari.geojson + component: suburb + - filename: berlin.geojson + component: suburb + - filename: birmingham.geojson + component: suburb + - filename: blacksburg.geojson + component: suburb + - filename: blumenau.geojson + component: suburb + - filename: boston.geojson + component: suburb + - filename: braunschweig.geojson + component: city_district + - filename: bremen.geojson + component: city_district + - filename: bronx.geojson + component: suburb + - filename: brooklyn.geojson + component: suburb + - filename: calgary.geojson + component: suburb + - filename: canberra.geojson + component: suburb + - filename: chapel-hill.geojson + component: suburb + - filename: charlottesville.geojson + component: suburb + - filename: chemnitz.geojson + component: city_district + - filename: chesapeake.geojson + component: suburb + - filename: chicago.geojson + component: suburb + - filename: cincinnati.geojson + component: suburb + - filename: cleveland.geojson + component: suburb + - filename: cologne.geojson + component: city_district + - filename: columbus.geojson + component: suburb + - filename: copenhagen.geojson + component: suburb + - filename: dallas.geojson + component: suburb + - filename: denver.geojson + component: suburb + - filename: des-moines.geojson + component: suburb + - filename: detroit.geojson + component: suburb + - filename: dresden.geojson + component: suburb + - filename: dublin.geojson + component: suburb + - filename: duesseldorf.geojson + component: city_district + - filename: edmonton.geojson + component: suburb + - filename: eindhoven.geojson + component: suburb + - filename: esztergom.geojson + component: suburb + - filename: fairbanks.geojson + component: suburb + - filename: fargo.geojson + component: suburb + - filename: fort-lauderdale.geojson + component: suburb + - filename: frankfurt-main.geojson + component: suburb + - filename: freiburg.geojson + component: city_district + - filename: ghent.geojson + component: suburb + - filename: gisborne.geojson + component: suburb + - filename: grand-rapids.geojson + component: suburb + - filename: hamburg.geojson + component: suburb + - filename: hampton.geojson + component: suburb + - filename: hartford.geojson + component: suburb + - filename: henderson.geojson + component: suburb + - filename: honolulu.geojson + component: suburb + - filename: houston.geojson + component: suburb + - filename: indianapolis.geojson + component: suburb + - filename: kansas-city.geojson + component: suburb + - filename: las-vegas.geojson + component: suburb + - filename: lexington.geojson + component: suburb + - filename: long-beach.geojson + component: suburb + - filename: los-angeles-county.geojson + component: suburb + - filename: louisville.geojson + component: suburb + - filename: macon.geojson + component: suburb + - filename: madrid.geojson + component: suburb + - filename: manhattan.geojson + component: suburb + - filename: melbourne.geojson + component: suburb + - filename: miami.geojson + component: suburb + - filename: milan.geojson + component: suburb + - filename: milwaukee.geojson + component: suburb + - filename: minneapolis.geojson + component: suburb + - filename: mississauga.geojson + component: suburb + - filename: montreal.geojson + component: suburb + - filename: moscow.geojson + component: suburb + - filename: muenster.geojson + component: suburb + - filename: new-haven.geojson + component: suburb + - filename: new-orleans.geojson + component: suburb + - filename: norfolk.geojson + component: suburb + - filename: oakland.geojson + component: suburb + - filename: olympia.geojson + component: suburb + - filename: orlando.geojson + component: suburb + - filename: paris.geojson + component: suburb + - filename: philadelphia.geojson + component: suburb + - filename: phoenix.geojson + component: suburb + - filename: pittsburgh.geojson + component: suburb + - filename: porirua.geojson + component: suburb + - filename: portland.geojson + component: suburb + - filename: providence.geojson + component: suburb + - filename: queens.geojson + component: suburb + - filename: raleigh.geojson + component: suburb + - filename: red-deer.geojson + component: suburb + - filename: richmond.geojson + component: suburb + - filename: rochester.geojson + component: suburb + - filename: rockville.geojson + component: suburb + - filename: rotterdam.geojson + component: city_district + - filename: sacramento.geojson + component: suburb + - filename: salt-lake-city.geojson + component: suburb + - filename: san-antonio.geojson + component: suburb + - filename: san-diego.geojson + component: suburb + - filename: san-francisco.geojson + component: suburb + - filename: san-jose.geojson + component: suburb + - filename: saskatoon.geojson + component: suburb + - filename: seattle.geojson + component: suburb + - filename: springfield.geojson + component: suburb + - filename: st-louis.geojson + component: suburb + - filename: st-petersburg.geojson + component: suburb + - filename: stamford.geojson + component: suburb + - filename: staten-island.geojson + component: suburb + - filename: surrey.geojson + component: suburb + - filename: sydney.geojson + component: suburb + - filename: szczecin.geojson + component: suburb + - filename: tampa.geojson + component: suburb + - filename: the-hague.geojson + component: suburb + - filename: toronto.geojson + component: suburb + - filename: turku.geojson + component: suburb + - filename: ulm.geojson + component: suburb + - filename: unna.geojson + component: city_district + - filename: utrecht.geojson + component: city_district + - filename: vancouver.geojson + component: suburb + - filename: venice.geojson + component: suburb + - filename: venlo.geojson + component: city_district + - filename: vienna.geojson + component: city_district + - filename: washington.geojson + component: suburb + - filename: wellington.geojson + component: suburb + - filename: west-linn.geojson + component: suburb + - filename: west-palm-beach.geojson + component: suburb + - filename: williamsburg.geojson + component: suburb + - filename: windsor.geojson + component: suburb + - filename: winterthur.geojson + component: city_district + - filename: zurich-city.geojson + component: suburb \ No newline at end of file diff --git a/scripts/geodata/neighborhoods/reverse_geocode.py b/scripts/geodata/neighborhoods/reverse_geocode.py index 161646a7..920d90f7 100644 --- a/scripts/geodata/neighborhoods/reverse_geocode.py +++ b/scripts/geodata/neighborhoods/reverse_geocode.py @@ -7,6 +7,7 @@ import re import six import subprocess import sys +import yaml this_dir = os.path.realpath(os.path.dirname(__file__)) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) @@ -18,6 +19,7 @@ from geodata.file_utils import ensure_dir, download_file from geodata.i18n.unicode_properties import get_chars_by_script from geodata.i18n.word_breaks import ideographic_scripts from geodata.names.deduping import NameDeduper +from geodata.osm.admin_boundaries import OSMNeighborhoodPolygonReader from geodata.osm.components import osm_address_components from geodata.osm.definitions import osm_definitions from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS @@ -126,15 +128,18 @@ class NeighborhoodDeduper(NameDeduper): class ClickThatHoodReverseGeocoder(GeohashPolygonIndex): - simplify_tolerance = 0.00001 - preserve_topology = True persistent_polygons = False cache_size = 0 SCRATCH_DIR = '/tmp' # Contains accurate boundaries for neighborhoods sans weird GeoPlanet names like "Adelphi" or "Crown Heights South" - NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods' + NEIGHBORHOODS_REPO = 'https://github.com/codeforamerica/click_that_hood' + + config_path = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, + 'resources', 'neighborhoods', 'click_that_hood.yaml') + + config = yaml.load(open(config_path)) @classmethod def clone_repo(cls, path): @@ -143,6 +148,49 @@ class ClickThatHoodReverseGeocoder(GeohashPolygonIndex): @classmethod def create_neighborhoods_index(cls): + scratch_dir = cls.SCRATCH_DIR + repo_path = os.path.join(scratch_dir, 'click_that_hood') + cls.clone_repo(repo_path) + + data_path = os.path.join(repo_path, 'public', 'data') + + neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods') + ensure_dir(neighborhoods_dir) + + index = cls(save_dir=neighborhoods_dir) + + for c in cls.config['files']: + filename = c['filename'] + component = c['component'] + + print('doing {}'.format(filename)) + + path = os.path.join(data_path, filename) + features = json.load(open(path))['features'] + for f in features: + f['properties']['component'] = component + + try: + index.add_geojson_like_file(features) + except ValueError: + continue + + return index + + +class OSMNeighborhoodReverseGeocoder(OSMReverseGeocoder): + persistent_polygons = False + cache_size = 10000 + simplify_polygons = False + polygon_reader = OSMNeighborhoodPolygonReader + include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['postal_code']) + + cache_size = 0 + + SCRATCH_DIR = '/tmp' + + @classmethod + def create_neighborhoods_index(cls, osm_neighborhoods_file): scratch_dir = cls.SCRATCH_DIR repo_path = os.path.join(scratch_dir, 'neighborhoods') cls.clone_repo(repo_path) @@ -150,37 +198,7 @@ class ClickThatHoodReverseGeocoder(GeohashPolygonIndex): neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index') ensure_dir(neighborhoods_dir) - index = cls(save_dir=neighborhoods_dir) - - have_geonames = set() - is_neighborhood = set() - - for filename in os.listdir(repo_path): - path = os.path.join(repo_path, filename) - base_name = filename.split('.')[0].split('gn-')[-1] - if filename.endswith('.geojson') and filename.startswith('gn-'): - have_geonames.add(base_name) - elif filename.endswith('metadata.json'): - data = json.load(open(os.path.join(repo_path, filename))) - if data.get('neighborhoodNoun', [None])[0] in (None, 'rione'): - is_neighborhood.add(base_name) - - for filename in os.listdir(repo_path): - if not filename.endswith('.geojson'): - continue - base_name = filename.rsplit('.geojson')[0] - if base_name in have_geonames: - f = open(os.path.join(repo_path, 'gn-{}'.format(filename))) - elif base_name in is_neighborhood: - f = open(os.path.join(repo_path, filename)) - else: - continue - try: - index.add_geojson_like_file(json.load(f)['features']) - except ValueError: - continue - - return index + return cls.create_from_osm_file(osm_neighborhoods_file, output_dir=neighborhoods_dir) class NeighborhoodReverseGeocoder(RTreePolygonIndex): @@ -209,10 +227,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): cache_size = 100000 source_priorities = { - 'clickthathood': 0, # Best names/polygons - 'osm_cth': 1, # OSM names matched with ClickThatHood polygon - 'osm_quattro': 2, # OSM names matched with Quattroshapes polygon - 'quattroshapes': 3, # Good results in some countries/areas + 'osm': 0, # Best names/polygons, same coordinate system + 'osm_cth': 1, # Prefer the OSM names if possible + 'clickthathood': 2, # Better names/polygons than Quattroshapes + 'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon + 'quattroshapes': 4, # Good results in some countries/areas } level_priorities = { @@ -240,7 +259,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): return doc @classmethod - def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, output_dir): + def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir): ''' Given an OSM file (planet or some other bounds) containing neighborhoods as points (some suburbs have boundaries) @@ -259,6 +278,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods') ensure_dir(qs_scratch_dir) + osm_neighborhoods_scratch_dir = os.path.join(tmp_dir) + logger.info('Creating Quattroshapes neighborhoods') qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir) @@ -270,12 +291,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir) osm_admin_rtree.cache_size = 1000 + osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file) + logger.info('Creating IDF index') idf = IDFIndex() char_scripts = get_chars_by_script() - for idx in (cth, qs): + for idx in (cth, qs, osmn): for i in xrange(idx.i): props = idx.get_properties(i) name = props.get('name') @@ -289,6 +312,15 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): doc = cls.count_words(v) idf.update(doc) + for i in six.moves.xrange(osmn.i): + props = osmn.get_properties(i) + poly = osmn.get_polygon(i) + + props['source'] = 'osm' + props['polygon_type'] = 'neighborhood' + index.index_polygon(poly) + index.add_polygon(poly, props) + qs.matched = [False] * qs.i cth.matched = [False] * cth.i @@ -311,8 +343,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): props['type'] = id_type props['id'] = element_id - possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD) - is_neighborhood = attrs.get('place') in ('neighbourhood', 'neighborhood') + possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD) + is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD) country, candidate_languages = country_rtree.country_and_languages(lat, lon) @@ -378,21 +410,26 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): score, props, poly, idx, i = ranks[0] existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True) + existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True) skip_node = False - for poly_index, osm_props in enumerate(existing_osm_boundaries): - containing_component = None - name = osm_props.get('name') - # Only exact name matches here since we're comparins OSM to OSM - if name and name.lower() != attrs.get('name', '').lower(): - continue - containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]] + for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries): + for poly_index, osm_props in enumerate(existing_osm_boundaries): + containing_component = None + name = osm_props.get('name') + # Only exact name matches here since we're comparins OSM to OSM + if name and name.lower() != attrs.get('name', '').lower(): + continue - containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids) + containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]] - if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]: - skip_node = True + containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids) + + if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]: + skip_node = True + break + if skip_node: break # Skip this element @@ -504,6 +541,9 @@ if __name__ == '__main__': parser.add_argument('-c', '--country-rtree-dir', help='Path to country rtree dir') + parser.add_argument('-b', '--osm-neighborhood-borders-file', + help='Path to OSM neighborhood borders file (with dependencies, .osm format)') + parser.add_argument('-n', '--osm-neighborhoods-file', help='Path to OSM neighborhoods file (no dependencies, .osm format)') @@ -514,12 +554,13 @@ if __name__ == '__main__': logging.basicConfig(level=logging.INFO) args = parser.parse_args() - if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir: + if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file: index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( args.osm_neighborhoods_file, args.quattroshapes_dir, args.country_rtree_dir, args.osm_admin_rtree_dir, + args.osm_neighborhood_borders_file, args.out_dir ) else: