[polygons/neighborhoods] refactoring Zetashapes download, adding in PediaCities polygons for NYC neighborhoods

This commit is contained in:
Al
2016-04-09 21:32:39 -04:00
parent 38b39887ec
commit dee143798a

View File

@@ -30,7 +30,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.coordinates.conversion import latlon_to_decimal from geodata.coordinates.conversion import latlon_to_decimal
from geodata.encoding import safe_decode from geodata.encoding import safe_decode
from geodata.file_utils import ensure_dir from geodata.file_utils import ensure_dir, download_file
from geodata.i18n.unicode_properties import get_chars_by_script from geodata.i18n.unicode_properties import get_chars_by_script
from geodata.i18n.word_breaks import ideographic_scripts from geodata.i18n.word_breaks import ideographic_scripts
from geodata.names.deduping import NameDeduper from geodata.names.deduping import NameDeduper
@@ -142,55 +142,20 @@ class NeighborhoodDeduper(NameDeduper):
]) ])
class NeighborhoodReverseGeocoder(RTreePolygonIndex): class ZetashapesReverseGeocoder(GeohashPolygonIndex):
''' simplify_tolerance = 0.00001
Neighborhoods are very important in cities like NYC, SF, Chicago, London preserve_topology = True
and many others. We want the address parser to be trained with addresses persistent_polygons = False
that sufficiently capture variations in address patterns, including cache_size = 0
neighborhoods. Quattroshapes neighborhood data (in the US at least)
is not great in terms of names, mostly because GeoPlanet has so many
incorrect names. The neighborhoods project, also known as Zetashapes
has very accurate polygons with correct names, but only for a handful
of cities. OSM usually lists neighborhoods and some other local admin
areas like boroughs as points rather than polygons.
This index merges all of the above data sets in prioritized order
(Zetashapes > OSM > Quattroshapes) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source but each
source has at least a "name" key which in practice is what we care about.
'''
NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods'
PRIORITIES_FILENAME = 'priorities.json'
SCRATCH_DIR = '/tmp' SCRATCH_DIR = '/tmp'
DUPE_THRESHOLD = 0.9 supplemental_neighborhood_urls = [
('http://catalog.civicdashboards.com/dataset/eea7c03e-9917-40b0-bba5-82e8e37d6739/resource/91778048-3c58-449c-a3f9-365ed203e914/download/06463a12c2104adf86335df0170c25e3pediacitiesnycneighborhoods.geojson', 'pediacities_nyc.geojson'),
source_priorities = {
'zetashapes': 0, # Best names/polygons
'osm_zeta': 1, # OSM names matched with Zetashapes polygon
'osm_quattro': 2, # OSM names matched with Quattroshapes polygon
'quattroshapes': 3, # Good results in some countries/areas
}
level_priorities = {
'neighborhood': 0,
'local_admin': 1,
}
regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''),
] ]
@classmethod @classmethod
def clone_repo(cls, path): def create_index(cls):
subprocess.check_call(['rm', '-rf', path])
subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
@classmethod
def create_zetashapes_neighborhoods_index(cls):
scratch_dir = cls.SCRATCH_DIR scratch_dir = cls.SCRATCH_DIR
repo_path = os.path.join(scratch_dir, 'neighborhoods') repo_path = os.path.join(scratch_dir, 'neighborhoods')
cls.clone_repo(repo_path) cls.clone_repo(repo_path)
@@ -198,7 +163,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index') neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
ensure_dir(neighborhoods_dir) ensure_dir(neighborhoods_dir)
index = GeohashPolygonIndex() download_file(cls.PEDIACITIES_NYC, repo_path)
index = cls(save_dir=neighborhoods_dir)
have_geonames = set() have_geonames = set()
is_neighborhood = set() is_neighborhood = set()
@@ -225,8 +192,61 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
continue continue
index.add_geojson_like_file(json.load(f)['features']) index.add_geojson_like_file(json.load(f)['features'])
for url, filename in cls.supplemental_geojson_urls:
local_path = os.path.join(scratch_dir, filename)
download_file(url, local_path)
index.add_geojson_like_file(json.load(open(local_path))['features'])
return index return index
class NeighborhoodReverseGeocoder(RTreePolygonIndex):
'''
Neighborhoods are very important in cities like NYC, SF, Chicago, London
and many others. We want the address parser to be trained with addresses
that sufficiently capture variations in address patterns, including
neighborhoods. Quattroshapes neighborhood data (in the US at least)
is not great in terms of names, mostly because GeoPlanet has so many
incorrect names. The neighborhoods project, also known as Zetashapes
has very accurate polygons with correct names, but only for a handful
of cities. OSM usually lists neighborhoods and some other local admin
areas like boroughs as points rather than polygons.
This index merges all of the above data sets in prioritized order
(Zetashapes > OSM > Quattroshapes) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source but each
source has at least a "name" key which in practice is what we care about.
'''
NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods'
SCRATCH_DIR = '/tmp'
PRIORITIES_FILENAME = 'priorities.json'
DUPE_THRESHOLD = 0.9
source_priorities = {
'zetashapes': 0, # Best names/polygons
'osm_zeta': 1, # OSM names matched with Zetashapes polygon
'osm_quattro': 2, # OSM names matched with Quattroshapes polygon
'quattroshapes': 3, # Good results in some countries/areas
}
level_priorities = {
'neighborhood': 0,
'local_admin': 1,
}
regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''),
]
@classmethod
def clone_repo(cls, path):
# Replace whatever is at `path` with a fresh clone of cls.NEIGHBORHOODS_REPO.
# Both steps go through subprocess.check_call, which raises
# subprocess.CalledProcessError if the command exits with a non-zero status.
# Step 1: remove any existing file or directory at `path`.
subprocess.check_call(['rm', '-rf', path])
# Step 2: clone the repository into `path`.
subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
@classmethod @classmethod
def count_words(cls, s): def count_words(cls, s):
doc = defaultdict(int) doc = defaultdict(int)
@@ -259,7 +279,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir) qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
logger.info('Creating Zetashapes neighborhoods') logger.info('Creating Zetashapes neighborhoods')
zs = cls.create_zetashapes_neighborhoods_index() zs = ZetashapesReverseGeocoder.create_index()
logger.info('Creating IDF index') logger.info('Creating IDF index')
idf = IDFIndex() idf = IDFIndex()
@@ -267,7 +287,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
char_scripts = get_chars_by_script() char_scripts = get_chars_by_script()
for idx in (zs, qs): for idx in (zs, qs):
for i, (props, poly) in enumerate(idx.polygons): for i in xrange(idx.i):
props = idx.get_properties(i)
name = props.get('name') name = props.get('name')
if name is not None: if name is not None:
doc = cls.count_words(name) doc = cls.count_words(name)
@@ -323,7 +344,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for c in safe_decode(osm_name))) for c in safe_decode(osm_name)))
for i in candidates: for i in candidates:
props, poly = idx.polygons[i] props = self.get_properties(i)
name = normalized_qs_names.get(i) name = normalized_qs_names.get(i)
if not name: if not name:
name = props.get('name') name = props.get('name')
@@ -344,6 +365,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
if sim > max_sim: if sim > max_sim:
max_sim = sim max_sim = sim
poly = self.get_polygon(i)
arg_max = (max_sim, props, poly.context, idx, i) arg_max = (max_sim, props, poly.context, idx, i)
if arg_max: if arg_max: