[polygons/neighborhoods] refactoring Zetashapes download, adding in PediaCities polygons for NYC neighborhoods
@@ -30,7 +30,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 from geodata.coordinates.conversion import latlon_to_decimal
 from geodata.encoding import safe_decode
-from geodata.file_utils import ensure_dir
+from geodata.file_utils import ensure_dir, download_file
 from geodata.i18n.unicode_properties import get_chars_by_script
 from geodata.i18n.word_breaks import ideographic_scripts
 from geodata.names.deduping import NameDeduper
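The only change in this hunk is pulling download_file into the imports from geodata.file_utils; later in the diff it is called as download_file(url, local_path). As a rough sketch of what a helper with that call shape might do (the real geodata.file_utils implementation may differ; Python 2 is assumed since the file uses xrange below):

    # Illustrative only: same call shape as download_file(url, local_path)
    import urllib2

    def download_file(url, local_path, chunk_size=1024 * 1024):
        # Stream the response to disk so large GeoJSON files don't sit in memory
        response = urllib2.urlopen(url)
        with open(local_path, 'wb') as f:
            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                f.write(chunk)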
@@ -142,55 +142,20 @@ class NeighborhoodDeduper(NameDeduper):
     ])
 
 
-class NeighborhoodReverseGeocoder(RTreePolygonIndex):
-    '''
-    Neighborhoods are very important in cities like NYC, SF, Chicago, London
-    and many others. We want the address parser to be trained with addresses
-    that sufficiently capture variations in address patterns, including
-    neighborhoods. Quattroshapes neighborhood data (in the US at least)
-    is not great in terms of names, mostly becasue GeoPlanet has so many
-    incorrect names. The neighborhoods project, also known as Zetashapes
-    has very accurate polygons with correct names, but only for a handful
-    of cities. OSM usually lists neighborhoods and some other local admin
-    areas like boroughs as points rather than polygons.
-
-    This index merges all of the above data sets in prioritized order
-    (Zetashapes > OSM > Quattroshapes) to provide unified point-in-polygon
-    tests for neighborhoods. The properties vary by source but each has
-    source has least a "name" key which in practice is what we care about.
-    '''
-    NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods'
-
-    PRIORITIES_FILENAME = 'priorities.json'
-
+class ZetashapesReverseGeocoder(GeohashPolygonIndex):
+    simplify_tolerance = 0.00001
+    preserve_topology = True
+    persistent_polygons = False
+    cache_size = 0
+
     SCRATCH_DIR = '/tmp'
 
-    DUPE_THRESHOLD = 0.9
-
-    source_priorities = {
-        'zetashapes': 0,     # Best names/polygons
-        'osm_zeta': 1,       # OSM names matched with Zetashapes polygon
-        'osm_quattro': 2,    # OSM names matched with Quattroshapes polygon
-        'quattroshapes': 3,  # Good results in some countries/areas
-    }
-
-    level_priorities = {
-        'neighborhood': 0,
-        'local_admin': 1,
-    }
-
-    regex_replacements = [
-        # Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quqttroshapes
-        (re.compile('^paris-(?=[\d])', re.I), ''),
-    ]
+    supplemental_neighborhood_urls = [
+        ('http://catalog.civicdashboards.com/dataset/eea7c03e-9917-40b0-bba5-82e8e37d6739/resource/91778048-3c58-449c-a3f9-365ed203e914/download/06463a12c2104adf86335df0170c25e3pediacitiesnycneighborhoods.geojson', 'pediacities_nyc.geojson'),
+    ]
 
     @classmethod
-    def clone_repo(cls, path):
-        subprocess.check_call(['rm', '-rf', path])
-        subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
-
-    @classmethod
-    def create_zetashapes_neighborhoods_index(cls):
+    def create_index(cls):
         scratch_dir = cls.SCRATCH_DIR
         repo_path = os.path.join(scratch_dir, 'neighborhoods')
         cls.clone_repo(repo_path)
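The new ZetashapesReverseGeocoder subclasses GeohashPolygonIndex and sets simplify_tolerance and preserve_topology, names that suggest shapely-style polygon simplification. A small illustration of what a tolerance of 0.00001 does, assuming shapely (this is not the index's own simplification code):

    from shapely.geometry import Polygon

    poly = Polygon([(0.0, 0.0), (0.000004, 0.5), (0.0, 1.0), (1.0, 1.0), (1.0, 0.0)])
    simplified = poly.simplify(0.00001, preserve_topology=True)
    # Vertices within the tolerance of a straight edge are dropped, so the
    # simplified ring has no more points than the original
    assert len(simplified.exterior.coords) <= len(poly.exterior.coords)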
@@ -198,7 +163,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
         neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
         ensure_dir(neighborhoods_dir)
 
-        index = GeohashPolygonIndex()
+        download_file(cls.PEDIACITIES_NYC, repo_path)
+
+        index = cls(save_dir=neighborhoods_dir)
 
         have_geonames = set()
         is_neighborhood = set()
@@ -225,8 +192,61 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
                 continue
             index.add_geojson_like_file(json.load(f)['features'])
 
+        for url, filename in cls.supplemental_geojson_urls:
+            local_path = os.path.join(scratch_dir, filename)
+            download_file(url, local_path)
+            index.add_geojson_like_file(json.load(open(local_path))['features'])
+
         return index
 
+
+class NeighborhoodReverseGeocoder(RTreePolygonIndex):
+    '''
+    Neighborhoods are very important in cities like NYC, SF, Chicago, London
+    and many others. We want the address parser to be trained with addresses
+    that sufficiently capture variations in address patterns, including
+    neighborhoods. Quattroshapes neighborhood data (in the US at least)
+    is not great in terms of names, mostly because GeoPlanet has so many
+    incorrect names. The neighborhoods project, also known as Zetashapes,
+    has very accurate polygons with correct names, but only for a handful
+    of cities. OSM usually lists neighborhoods and some other local admin
+    areas like boroughs as points rather than polygons.
+
+    This index merges all of the above data sets in prioritized order
+    (Zetashapes > OSM > Quattroshapes) to provide unified point-in-polygon
+    tests for neighborhoods. The properties vary by source but each
+    source has at least a "name" key which in practice is what we care about.
+    '''
+    NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods'
+
+    SCRATCH_DIR = '/tmp'
+
+    PRIORITIES_FILENAME = 'priorities.json'
+
+    DUPE_THRESHOLD = 0.9
+
+    source_priorities = {
+        'zetashapes': 0,     # Best names/polygons
+        'osm_zeta': 1,       # OSM names matched with Zetashapes polygon
+        'osm_quattro': 2,    # OSM names matched with Quattroshapes polygon
+        'quattroshapes': 3,  # Good results in some countries/areas
+    }
+
+    level_priorities = {
+        'neighborhood': 0,
+        'local_admin': 1,
+    }
+
+    regex_replacements = [
+        # Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
+        (re.compile('^paris-(?=[\d])', re.I), ''),
+    ]
+
+    @classmethod
+    def clone_repo(cls, path):
+        subprocess.check_call(['rm', '-rf', path])
+        subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
+
     @classmethod
     def count_words(cls, s):
         doc = defaultdict(int)
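The re-added NeighborhoodReverseGeocoder keeps the prioritized-merge scheme described in its docstring: candidates are ranked first by level (neighborhood beats local_admin), then by source (Zetashapes > OSM-matched > Quattroshapes), and Quattroshapes' Paris arrondissement names are normalized through regex_replacements. A hedged sketch of how those tables could be applied; the candidate dicts and best_candidate helper are illustrative, not the class's actual merge code:

    import re

    source_priorities = {'zetashapes': 0, 'osm_zeta': 1, 'osm_quattro': 2, 'quattroshapes': 3}
    level_priorities = {'neighborhood': 0, 'local_admin': 1}
    regex_replacements = [(re.compile('^paris-(?=[\d])', re.I), '')]

    def normalize_name(name):
        # 'PARIS-1ER-ARRONDISSEMENT' -> '1ER-ARRONDISSEMENT'
        for regex, replacement in regex_replacements:
            name = regex.sub(replacement, name)
        return name

    def best_candidate(candidates):
        # Lower (level, source) tuples win: neighborhoods beat local admin
        # areas, and Zetashapes names beat OSM-matched and Quattroshapes names
        return min(candidates, key=lambda c: (level_priorities[c['level']],
                                              source_priorities[c['source']]))

    best = best_candidate([
        {'name': 'Clinton', 'source': 'quattroshapes', 'level': 'neighborhood'},
        {'name': "Hell's Kitchen", 'source': 'zetashapes', 'level': 'neighborhood'},
    ])
    # best['name'] == "Hell's Kitchen"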
@@ -259,7 +279,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
 
         qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
         logger.info('Creating Zetashapes neighborhoods')
-        zs = cls.create_zetashapes_neighborhoods_index()
+        zs = ZetashapesReverseGeocoder.create_index()
 
         logger.info('Creating IDF index')
         idf = IDFIndex()
@@ -267,7 +287,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
         char_scripts = get_chars_by_script()
 
         for idx in (zs, qs):
-            for i, (props, poly) in enumerate(idx.polygons):
+            for i in xrange(idx.i):
+                props = idx.get_properties(i)
                 name = props.get('name')
                 if name is not None:
                     doc = cls.count_words(name)
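This hunk swaps the in-memory iteration over idx.polygons for index-based access via xrange(idx.i) and get_properties(i), which is what the persistent_polygons and cache_size attributes added above are for: properties can be fetched one record at a time instead of holding every (properties, polygon) pair in memory. A stub with the same iteration contract (the i counter and get_properties come from this diff; the in-memory storage is a stand-in):

    class StubIndex(object):
        def __init__(self, features):
            self._props = [f['properties'] for f in features]
            self.i = len(self._props)

        def get_properties(self, i):
            # A persistent index could read this record from disk instead
            return self._props[i]

    idx = StubIndex([{'properties': {'name': 'Greenpoint'}},
                     {'properties': {'name': 'Williamsburg'}}])
    names = [idx.get_properties(i).get('name') for i in xrange(idx.i)]
    # names == ['Greenpoint', 'Williamsburg']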
@@ -323,7 +344,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
                                        for c in safe_decode(osm_name)))
 
                 for i in candidates:
-                    props, poly = idx.polygons[i]
+                    props = self.get_properties(i)
                     name = normalized_qs_names.get(i)
                     if not name:
                         name = props.get('name')
@@ -344,6 +365,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
 
                     if sim > max_sim:
                         max_sim = sim
+                        poly = self.get_polygon(i)
                         arg_max = (max_sim, props, poly.context, idx, i)
 
                 if arg_max:
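The added poly = self.get_polygon(i) fetches the polygon lazily, and only when a candidate becomes the best match so far, matching the index-based access introduced above. A sketch of that select-the-best-match loop, with similarity() and the index object as stand-ins and the 0.9 threshold taken from DUPE_THRESHOLD:

    def pick_best(index, candidates, query_name, similarity, threshold=0.9):
        # Track the best similarity seen so far and only fetch the polygon
        # for a candidate that becomes the current best
        max_sim = 0.0
        arg_max = None
        for i in candidates:
            props = index.get_properties(i)
            sim = similarity(query_name, props.get('name', ''))
            if sim > max_sim:
                max_sim = sim
                poly = index.get_polygon(i)
                arg_max = (max_sim, props, poly, i)
        # Accept the match only if it clears the dedupe threshold
        if arg_max and arg_max[0] >= threshold:
            return arg_max
        return None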