[neighborhoods] Adding a ClickThatHood config that whitelists the files to use and specifies what kind of polygon (address component) each file contains. Adding OSM neighborhoods (ways/relations where place=neighbourhood, to reduce ambiguity) as the highest-priority source, followed by CTH/OSM, CTH, Quattro/OSM, Quattro

This commit is contained in:
Al
2017-02-14 01:46:23 -05:00
parent 2003e08623
commit 6c68d446a0
2 changed files with 358 additions and 53 deletions

View File

@@ -0,0 +1,264 @@
files:
- filename: akron.geojson
component: suburb
- filename: alameda.geojson
component: suburb
- filename: albany.geojson
component: suburb
- filename: amsterdam.geojson
component: suburb
- filename: anchorage.geojson
component: suburb
- filename: angers.geojson
component: suburb
- filename: antwerp.geojson
component: suburb
- filename: atlanta.geojson
component: suburb
- filename: augsburg.geojson
component: city_district
- filename: austin.geojson
component: suburb
- filename: baltimore.geojson
component: suburb
- filename: bari.geojson
component: suburb
- filename: berlin.geojson
component: suburb
- filename: birmingham.geojson
component: suburb
- filename: blacksburg.geojson
component: suburb
- filename: blumenau.geojson
component: suburb
- filename: boston.geojson
component: suburb
- filename: braunschweig.geojson
component: city_district
- filename: bremen.geojson
component: city_district
- filename: bronx.geojson
component: suburb
- filename: brooklyn.geojson
component: suburb
- filename: calgary.geojson
component: suburb
- filename: canberra.geojson
component: suburb
- filename: chapel-hill.geojson
component: suburb
- filename: charlottesville.geojson
component: suburb
- filename: chemnitz.geojson
component: city_district
- filename: chesapeake.geojson
component: suburb
- filename: chicago.geojson
component: suburb
- filename: cincinnati.geojson
component: suburb
- filename: cleveland.geojson
component: suburb
- filename: cologne.geojson
component: city_district
- filename: columbus.geojson
component: suburb
- filename: copenhagen.geojson
component: suburb
- filename: dallas.geojson
component: suburb
- filename: denver.geojson
component: suburb
- filename: des-moines.geojson
component: suburb
- filename: detroit.geojson
component: suburb
- filename: dresden.geojson
component: suburb
- filename: dublin.geojson
component: suburb
- filename: duesseldorf.geojson
component: city_district
- filename: edmonton.geojson
component: suburb
- filename: eindhoven.geojson
component: suburb
- filename: esztergom.geojson
component: suburb
- filename: fairbanks.geojson
component: suburb
- filename: fargo.geojson
component: suburb
- filename: fort-lauderdale.geojson
component: suburb
- filename: frankfurt-main.geojson
component: suburb
- filename: freiburg.geojson
component: city_district
- filename: ghent.geojson
component: suburb
- filename: gisborne.geojson
component: suburb
- filename: grand-rapids.geojson
component: suburb
- filename: hamburg.geojson
component: suburb
- filename: hampton.geojson
component: suburb
- filename: hartford.geojson
component: suburb
- filename: henderson.geojson
component: suburb
- filename: honolulu.geojson
component: suburb
- filename: houston.geojson
component: suburb
- filename: indianapolis.geojson
component: suburb
- filename: kansas-city.geojson
component: suburb
- filename: las-vegas.geojson
component: suburb
- filename: lexington.geojson
component: suburb
- filename: long-beach.geojson
component: suburb
- filename: los-angeles-county.geojson
component: suburb
- filename: louisville.geojson
component: suburb
- filename: macon.geojson
component: suburb
- filename: madrid.geojson
component: suburb
- filename: manhattan.geojson
component: suburb
- filename: melbourne.geojson
component: suburb
- filename: miami.geojson
component: suburb
- filename: milan.geojson
component: suburb
- filename: milwaukee.geojson
component: suburb
- filename: minneapolis.geojson
component: suburb
- filename: mississauga.geojson
component: suburb
- filename: montreal.geojson
component: suburb
- filename: moscow.geojson
component: suburb
- filename: muenster.geojson
component: suburb
- filename: new-haven.geojson
component: suburb
- filename: new-orleans.geojson
component: suburb
- filename: norfolk.geojson
component: suburb
- filename: oakland.geojson
component: suburb
- filename: olympia.geojson
component: suburb
- filename: orlando.geojson
component: suburb
- filename: paris.geojson
component: suburb
- filename: philadelphia.geojson
component: suburb
- filename: phoenix.geojson
component: suburb
- filename: pittsburgh.geojson
component: suburb
- filename: porirua.geojson
component: suburb
- filename: portland.geojson
component: suburb
- filename: providence.geojson
component: suburb
- filename: queens.geojson
component: suburb
- filename: raleigh.geojson
component: suburb
- filename: red-deer.geojson
component: suburb
- filename: richmond.geojson
component: suburb
- filename: rochester.geojson
component: suburb
- filename: rockville.geojson
component: suburb
- filename: rotterdam.geojson
component: city_district
- filename: sacramento.geojson
component: suburb
- filename: salt-lake-city.geojson
component: suburb
- filename: san-antonio.geojson
component: suburb
- filename: san-diego.geojson
component: suburb
- filename: san-francisco.geojson
component: suburb
- filename: san-jose.geojson
component: suburb
- filename: saskatoon.geojson
component: suburb
- filename: seattle.geojson
component: suburb
- filename: springfield.geojson
component: suburb
- filename: st-louis.geojson
component: suburb
- filename: st-petersburg.geojson
component: suburb
- filename: stamford.geojson
component: suburb
- filename: staten-island.geojson
component: suburb
- filename: surrey.geojson
component: suburb
- filename: sydney.geojson
component: suburb
- filename: szczecin.geojson
component: suburb
- filename: tampa.geojson
component: suburb
- filename: the-hague.geojson
component: suburb
- filename: toronto.geojson
component: suburb
- filename: turku.geojson
component: suburb
- filename: ulm.geojson
component: suburb
- filename: unna.geojson
component: city_district
- filename: utrecht.geojson
component: city_district
- filename: vancouver.geojson
component: suburb
- filename: venice.geojson
component: suburb
- filename: venlo.geojson
component: city_district
- filename: vienna.geojson
component: city_district
- filename: washington.geojson
component: suburb
- filename: wellington.geojson
component: suburb
- filename: west-linn.geojson
component: suburb
- filename: west-palm-beach.geojson
component: suburb
- filename: williamsburg.geojson
component: suburb
- filename: windsor.geojson
component: suburb
- filename: winterthur.geojson
component: city_district
- filename: zurich-city.geojson
component: suburb

View File

@@ -7,6 +7,7 @@ import re
import six import six
import subprocess import subprocess
import sys import sys
import yaml
this_dir = os.path.realpath(os.path.dirname(__file__)) this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
@@ -18,6 +19,7 @@ from geodata.file_utils import ensure_dir, download_file
from geodata.i18n.unicode_properties import get_chars_by_script from geodata.i18n.unicode_properties import get_chars_by_script
from geodata.i18n.word_breaks import ideographic_scripts from geodata.i18n.word_breaks import ideographic_scripts
from geodata.names.deduping import NameDeduper from geodata.names.deduping import NameDeduper
from geodata.osm.admin_boundaries import OSMNeighborhoodPolygonReader
from geodata.osm.components import osm_address_components from geodata.osm.components import osm_address_components
from geodata.osm.definitions import osm_definitions from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
@@ -126,15 +128,18 @@ class NeighborhoodDeduper(NameDeduper):
class ClickThatHoodReverseGeocoder(GeohashPolygonIndex): class ClickThatHoodReverseGeocoder(GeohashPolygonIndex):
simplify_tolerance = 0.00001
preserve_topology = True
persistent_polygons = False persistent_polygons = False
cache_size = 0 cache_size = 0
SCRATCH_DIR = '/tmp' SCRATCH_DIR = '/tmp'
# Contains accurate boundaries for neighborhoods sans weird GeoPlanet names like "Adelphi" or "Crown Heights South" # Contains accurate boundaries for neighborhoods sans weird GeoPlanet names like "Adelphi" or "Crown Heights South"
NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods' NEIGHBORHOODS_REPO = 'https://github.com/codeforamerica/click_that_hood'
# Whitelist of ClickThatHood GeoJSON files and the component each one maps to
config_path = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                           'resources', 'neighborhoods', 'click_that_hood.yaml')

# safe_load: the config is plain mappings/lists/strings, so the full (unsafe)
# loader is unnecessary, and yaml.load() without an explicit Loader is
# deprecated in PyYAML >= 5.1 and an error in 6.0. The with-block closes the
# file handle instead of leaking it.
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)
# Keep the temporary handle name out of the class namespace
del config_file
@classmethod @classmethod
def clone_repo(cls, path): def clone_repo(cls, path):
@@ -143,6 +148,49 @@ class ClickThatHoodReverseGeocoder(GeohashPolygonIndex):
@classmethod
def create_neighborhoods_index(cls):
    '''
    Build a polygon index from the ClickThatHood GeoJSON files
    whitelisted in cls.config.

    Calls cls.clone_repo() with SCRATCH_DIR/click_that_hood, then for
    each entry under the config's "files" key loads
    public/data/<filename> from that path, tags every feature's
    properties with the entry's component, and adds the features to a
    new index saved under SCRATCH_DIR/neighborhoods.

    Returns the populated index (an instance of cls).
    '''
    scratch_dir = cls.SCRATCH_DIR
    repo_path = os.path.join(scratch_dir, 'click_that_hood')
    cls.clone_repo(repo_path)

    data_path = os.path.join(repo_path, 'public', 'data')

    neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods')
    ensure_dir(neighborhoods_dir)

    index = cls(save_dir=neighborhoods_dir)

    for c in cls.config['files']:
        filename = c['filename']
        component = c['component']

        print('doing {}'.format(filename))
        path = os.path.join(data_path, filename)

        # Close each file as soon as it is parsed instead of leaking
        # one open handle per configured file
        with open(path) as f:
            features = json.load(f)['features']

        # Record which address component this file's polygons represent
        for feature in features:
            feature['properties']['component'] = component

        try:
            index.add_geojson_like_file(features)
        except ValueError:
            # Best-effort: a ValueError from the index skips this file
            continue

    return index
class OSMNeighborhoodReverseGeocoder(OSMReverseGeocoder):
persistent_polygons = False
cache_size = 10000
simplify_polygons = False
polygon_reader = OSMNeighborhoodPolygonReader
include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['postal_code'])
cache_size = 0
SCRATCH_DIR = '/tmp'
@classmethod
def create_neighborhoods_index(cls, osm_neighborhoods_file):
scratch_dir = cls.SCRATCH_DIR scratch_dir = cls.SCRATCH_DIR
repo_path = os.path.join(scratch_dir, 'neighborhoods') repo_path = os.path.join(scratch_dir, 'neighborhoods')
cls.clone_repo(repo_path) cls.clone_repo(repo_path)
@@ -150,37 +198,7 @@ class ClickThatHoodReverseGeocoder(GeohashPolygonIndex):
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index') neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
ensure_dir(neighborhoods_dir) ensure_dir(neighborhoods_dir)
index = cls(save_dir=neighborhoods_dir) return cls.create_from_osm_file(osm_neighborhoods_file, output_dir=neighborhoods_dir)
have_geonames = set()
is_neighborhood = set()
for filename in os.listdir(repo_path):
path = os.path.join(repo_path, filename)
base_name = filename.split('.')[0].split('gn-')[-1]
if filename.endswith('.geojson') and filename.startswith('gn-'):
have_geonames.add(base_name)
elif filename.endswith('metadata.json'):
data = json.load(open(os.path.join(repo_path, filename)))
if data.get('neighborhoodNoun', [None])[0] in (None, 'rione'):
is_neighborhood.add(base_name)
for filename in os.listdir(repo_path):
if not filename.endswith('.geojson'):
continue
base_name = filename.rsplit('.geojson')[0]
if base_name in have_geonames:
f = open(os.path.join(repo_path, 'gn-{}'.format(filename)))
elif base_name in is_neighborhood:
f = open(os.path.join(repo_path, filename))
else:
continue
try:
index.add_geojson_like_file(json.load(f)['features'])
except ValueError:
continue
return index
class NeighborhoodReverseGeocoder(RTreePolygonIndex): class NeighborhoodReverseGeocoder(RTreePolygonIndex):
@@ -209,10 +227,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
cache_size = 100000 cache_size = 100000
source_priorities = { source_priorities = {
'clickthathood': 0, # Best names/polygons 'osm': 0, # Best names/polygons, same coordinate system
'osm_cth': 1, # OSM names matched with ClickThatHood polygon 'osm_cth': 1, # Prefer the OSM names if possible
'osm_quattro': 2, # OSM names matched with Quattroshapes polygon 'clickthathood': 2, # Better names/polygons than Quattroshapes
'quattroshapes': 3, # Good results in some countries/areas 'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon
'quattroshapes': 4, # Good results in some countries/areas
} }
level_priorities = { level_priorities = {
@@ -240,7 +259,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return doc return doc
@classmethod @classmethod
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, output_dir): def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
''' '''
Given an OSM file (planet or some other bounds) containing neighborhoods Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries) as points (some suburbs have boundaries)
@@ -259,6 +278,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods') qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
ensure_dir(qs_scratch_dir) ensure_dir(qs_scratch_dir)
osm_neighborhoods_scratch_dir = os.path.join(tmp_dir)
logger.info('Creating Quattroshapes neighborhoods') logger.info('Creating Quattroshapes neighborhoods')
qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir) qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
@@ -270,12 +291,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir) osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
osm_admin_rtree.cache_size = 1000 osm_admin_rtree.cache_size = 1000
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
logger.info('Creating IDF index') logger.info('Creating IDF index')
idf = IDFIndex() idf = IDFIndex()
char_scripts = get_chars_by_script() char_scripts = get_chars_by_script()
for idx in (cth, qs): for idx in (cth, qs, osmn):
for i in xrange(idx.i): for i in xrange(idx.i):
props = idx.get_properties(i) props = idx.get_properties(i)
name = props.get('name') name = props.get('name')
@@ -289,6 +312,15 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
doc = cls.count_words(v) doc = cls.count_words(v)
idf.update(doc) idf.update(doc)
for i in six.moves.xrange(osmn.i):
props = osmn.get_properties(i)
poly = osmn.get_polygon(i)
props['source'] = 'osm'
props['polygon_type'] = 'neighborhood'
index.index_polygon(poly)
index.add_polygon(poly, props)
qs.matched = [False] * qs.i qs.matched = [False] * qs.i
cth.matched = [False] * cth.i cth.matched = [False] * cth.i
@@ -311,8 +343,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
props['type'] = id_type props['type'] = id_type
props['id'] = element_id props['id'] = element_id
possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD) possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
is_neighborhood = attrs.get('place') in ('neighbourhood', 'neighborhood') is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)
country, candidate_languages = country_rtree.country_and_languages(lat, lon) country, candidate_languages = country_rtree.country_and_languages(lat, lon)
@@ -378,8 +410,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
score, props, poly, idx, i = ranks[0] score, props, poly, idx, i = ranks[0]
existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True) existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)
skip_node = False skip_node = False
for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
for poly_index, osm_props in enumerate(existing_osm_boundaries): for poly_index, osm_props in enumerate(existing_osm_boundaries):
containing_component = None containing_component = None
name = osm_props.get('name') name = osm_props.get('name')
@@ -394,6 +429,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]: if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
skip_node = True skip_node = True
break break
if skip_node:
break
# Skip this element # Skip this element
if skip_node: if skip_node:
@@ -504,6 +541,9 @@ if __name__ == '__main__':
parser.add_argument('-c', '--country-rtree-dir', parser.add_argument('-c', '--country-rtree-dir',
help='Path to country rtree dir') help='Path to country rtree dir')
parser.add_argument('-b', '--osm-neighborhood-borders-file',
help='Path to OSM neighborhood borders file (with dependencies, .osm format)')
parser.add_argument('-n', '--osm-neighborhoods-file', parser.add_argument('-n', '--osm-neighborhoods-file',
help='Path to OSM neighborhoods file (no dependencies, .osm format)') help='Path to OSM neighborhoods file (no dependencies, .osm format)')
@@ -514,12 +554,13 @@ if __name__ == '__main__':
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
args = parser.parse_args() args = parser.parse_args()
if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir: if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
args.osm_neighborhoods_file, args.osm_neighborhoods_file,
args.quattroshapes_dir, args.quattroshapes_dir,
args.country_rtree_dir, args.country_rtree_dir,
args.osm_admin_rtree_dir, args.osm_admin_rtree_dir,
args.osm_neighborhood_borders_file,
args.out_dir args.out_dir
) )
else: else: