[neighborhoods] Moving neighborhoods index to its own package

Al
2016-04-26 14:26:46 -04:00
parent 2a37837412
commit 799bbe4912
3 changed files with 439 additions and 397 deletions


@@ -54,378 +54,6 @@ def str_id(v):
return str(v)
class NeighborhoodDeduper(NameDeduper):
# Lossless conversions only
replacements = {
u'saint': u'st',
u'and': u'&',
}
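# Tokens that distinguish otherwise-similar names (numbered districts, directionals,
# qualifiers like heights/hills); presumably NameDeduper requires these to agree
# before treating two names as duplicates.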
discriminative_words = set([
# Han numbers
u'一', u'二',
u'三', u'四',
u'五', u'六',
u'七', u'八',
u'九', u'十',
# Roman numerals
u'i', u'ii',
u'iii', u'iv',
u'v', u'vi',
u'vii', u'viii',
u'ix', u'x',
u'xi', u'xii',
u'xiii', u'xiv',
u'xv', u'xvi',
u'xvii', u'xviii',
u'xix', u'xx',
# English directionals
u'north', u'south',
u'east', u'west',
u'northeast', u'northwest',
u'southeast', u'southwest',
# Spanish, Portuguese and Italian directionals
u'norte', u'nord', u'sur', u'sul', u'sud',
u'est', u'este', u'leste', u'oeste', u'ovest',
# New in various languages
u'new',
u'nova',
u'novo',
u'nuevo',
u'nueva',
u'nuovo',
u'nuova',
# Qualifiers
u'heights',
u'hills',
u'upper', u'lower',
u'little', u'great',
u'park',
u'parque',
u'village',
])
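# Generic locality words in several languages (Spanish/Portuguese barrio/bairro,
# Russian раион, Malay kampung, Hawaiian ahupua`a, etc.); presumably these carry
# little or no weight when comparing names.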
stopwords = set([
u'cp',
u'de',
u'la',
u'urbanizacion',
u'do',
u'da',
u'dos',
u'del',
u'community',
u'bairro',
u'barrio',
u'le',
u'el',
u'mah',
u'раион',
u'vila',
u'villa',
u'kampung',
u'ahupua`a',
])
class ZetashapesReverseGeocoder(GeohashPolygonIndex):
simplify_tolerance = 0.00001
preserve_topology = True
persistent_polygons = False
cache_size = 0
SCRATCH_DIR = '/tmp'
# Contains accurate boundaries for neighborhoods sans weird GeoPlanet names like "Adelphi" or "Crown Heights South"
NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods'
@classmethod
def clone_repo(cls, path):
subprocess.check_call(['rm', '-rf', path])
subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
@classmethod
def create_neighborhoods_index(cls):
scratch_dir = cls.SCRATCH_DIR
repo_path = os.path.join(scratch_dir, 'neighborhoods')
cls.clone_repo(repo_path)
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
ensure_dir(neighborhoods_dir)
index = cls(save_dir=neighborhoods_dir)
have_geonames = set()
is_neighborhood = set()
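# First pass: note which base names have a geonames-matched 'gn-*.geojson' file
# and which metadata files mark the polygon as a neighborhood.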
for filename in os.listdir(repo_path):
path = os.path.join(repo_path, filename)
base_name = filename.split('.')[0].split('gn-')[-1]
if filename.endswith('.geojson') and filename.startswith('gn-'):
have_geonames.add(base_name)
elif filename.endswith('metadata.json'):
data = json.load(open(os.path.join(repo_path, filename)))
if data.get('neighborhoodNoun', [None])[0] in (None, 'rione'):
is_neighborhood.add(base_name)
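# Second pass: prefer the geonames-matched 'gn-' polygon when it exists; otherwise
# only keep polygons whose metadata identified them as neighborhoods.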
for filename in os.listdir(repo_path):
if not filename.endswith('.geojson'):
continue
base_name = filename.rsplit('.geojson')[0]
if base_name in have_geonames:
f = open(os.path.join(repo_path, 'gn-{}'.format(filename)))
elif base_name in is_neighborhood:
f = open(os.path.join(repo_path, filename))
else:
continue
index.add_geojson_like_file(json.load(f)['features'])
return index
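# Usage sketch (coordinates are illustrative, roughly Crown Heights, Brooklyn):
#   zs = ZetashapesReverseGeocoder.create_neighborhoods_index()
#   candidates = zs.get_candidate_polygons(40.669, -73.942, return_all=True)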
class NeighborhoodReverseGeocoder(RTreePolygonIndex):
'''
Neighborhoods are very important in cities like NYC, SF, Chicago, London
and many others. We want the address parser to be trained with addresses
that sufficiently capture variations in address patterns, including
neighborhoods. Quattroshapes neighborhood data (in the US at least)
is not great in terms of names, mostly because GeoPlanet has so many
incorrect names. The neighborhoods project, also known as Zetashapes,
has very accurate polygons with correct names, but only for a handful
of cities. OSM usually lists neighborhoods and some other local admin
areas like boroughs as points rather than polygons.
This index merges all of the above data sets in prioritized order
(Zetashapes > OSM > Quattroshapes) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source, but each
source has at least a "name" key, which in practice is what we care about.
'''
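# When several polygons contain a point, candidates are sorted by
# (level priority, source priority): neighborhoods before local admin areas,
# and zetashapes > osm_zeta > osm_quattro > quattroshapes (lower value wins).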
SCRATCH_DIR = '/tmp'
PRIORITIES_FILENAME = 'priorities.json'
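# Minimum name-similarity score for an OSM point to claim a Zetashapes/Quattroshapes polygon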
DUPE_THRESHOLD = 0.9
persistent_polygons = True
cache_size = 100000
source_priorities = {
'zetashapes': 0, # Best names/polygons
'osm_zeta': 1, # OSM names matched with Zetashapes polygon
'osm_quattro': 2, # OSM names matched with Quattroshapes polygon
'quattroshapes': 3, # Good results in some countries/areas
}
level_priorities = {
'neighborhood': 0,
'local_admin': 1,
}
regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''),
]
@classmethod
def count_words(cls, s):
doc = defaultdict(int)
for t, c in NeighborhoodDeduper.content_tokens(s):
doc[t] += 1
return doc
@classmethod
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR):
'''
Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries)
and their dependencies, create an R-tree index for coarse-grained
reverse geocoding.
Note: the input file is expected to have been created using
osmfilter. Use fetch_osm_address_data.sh for planet or copy the
admin borders commands if using other geometries.
'''
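# Rough usage (paths are hypothetical):
#   NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
#       'neighborhoods.osm', '/data/quattroshapes', '/data/neighborhoods_index')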
index = cls(save_dir=output_dir)
ensure_dir(scratch_dir)
logger = logging.getLogger('neighborhoods')
qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods')
ensure_dir(qs_scratch_dir)
logger.info('Creating Quattroshapes neighborhoods')
qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir)
logger.info('Creating Zetashapes neighborhoods')
zs = ZetashapesReverseGeocoder.create_neighborhoods_index()
logger.info('Creating IDF index')
idf = IDFIndex()
char_scripts = get_chars_by_script()
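# Build an IDF index over every name we might compare: polygon names from both
# Zetashapes and Quattroshapes plus all OSM name tags, so that frequent tokens
# carry less weight in NeighborhoodDeduper.compare.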
for idx in (zs, qs):
for i in xrange(idx.i):
props = idx.get_properties(i)
name = props.get('name')
if name is not None:
doc = cls.count_words(name)
idf.update(doc)
for key, attrs, deps in parse_osm(filename):
for k, v in six.iteritems(attrs):
if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
doc = cls.count_words(v)
idf.update(doc)
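# Track which polygons get claimed by an OSM point; anything left unmatched is
# added with its original source name in the final pass below.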
qs.matched = [False] * qs.i
zs.matched = [False] * zs.i
logger.info('Matching OSM points to neighborhood polygons')
# Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons
num_polys = 0
for node_id, attrs, deps in parse_osm(filename):
try:
lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
except ValueError:
continue
osm_name = attrs.get('name')
if not osm_name:
continue
is_neighborhood = attrs.get('place') == 'neighbourhood'
ranks = []
osm_names = []
for key in OSM_NAME_TAGS:
name = attrs.get(key)
if name:
osm_names.append(name)
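# Also collect localized variants of each name tag (e.g. name:en, name:ja).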
for name_key in OSM_NAME_TAGS:
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
for idx in (zs, qs):
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
if candidates:
max_sim = 0.0
arg_max = None
normalized_qs_names = {}
for osm_name in osm_names:
contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
for c in safe_decode(osm_name)))
for i in candidates:
props = idx.get_properties(i)
name = normalized_qs_names.get(i)
if not name:
name = props.get('name')
if not name:
continue
for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name)
normalized_qs_names[i] = name
if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood':
continue
if not contains_ideographs:
sim = NeighborhoodDeduper.compare(osm_name, name, idf)
else:
# Many Han/Hangul characters are common, shouldn't use IDF
sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
if sim > max_sim:
max_sim = sim
poly = idx.get_polygon(i)
arg_max = (max_sim, props, poly.context, idx, i)
if arg_max:
ranks.append(arg_max)
ranks.sort(key=operator.itemgetter(0), reverse=True)
if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
score, props, poly, idx, i = ranks[0]
if idx is zs:
attrs['polygon_type'] = 'neighborhood'
source = 'osm_zeta'
else:
level = props.get(QuattroshapesReverseGeocoder.LEVEL, None)
source = 'osm_quattro'
if level == 'neighborhood':
attrs['polygon_type'] = 'neighborhood'
else:
attrs['polygon_type'] = 'local_admin'
attrs['source'] = source
index.index_polygon(poly)
index.add_polygon(poly, attrs)
idx.matched[i] = True
num_polys += 1
if num_polys % 1000 == 0 and num_polys > 0:
logger.info('did {} neighborhoods'.format(num_polys))
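# Final pass: add polygons no OSM point claimed. Zetashapes polygons are always
# kept; Quattroshapes polygons are kept only at the neighborhood level.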
for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')):
for i in xrange(idx.i):
props = idx.get_properties(i)
poly = idx.get_polygon(i)
if idx.matched[i]:
continue
props['source'] = source
if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood':
props['polygon_type'] = 'neighborhood'
else:
# We don't actually care about local admin polygons unless they match OSM
continue
index.index_polygon(poly.context)
index.add_polygon(poly.context, props)
return index
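# self.priorities is kept parallel to the polygon index: one
# (level_priority, source_priority) tuple per polygon, used to sort candidates
# in get_candidate_polygons so higher-quality, neighborhood-level sources win.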
def setup(self):
self.priorities = []
def index_polygon_properties(self, properties):
self.priorities.append((self.level_priorities[properties['polygon_type']], self.source_priorities[properties['source']]))
def load_polygon_properties(self, d):
self.priorities = json.load(open(os.path.join(d, self.PRIORITIES_FILENAME)))
def save_polygon_properties(self, d):
json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w'))
def priority(self, i):
return self.priorities[i]
def get_candidate_polygons(self, lat, lon):
candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon)
return sorted(candidates, key=self.priority)
class QuattroshapesReverseGeocoder(RTreePolygonIndex):
'''
Quattroshapes polygons, for levels up to localities, are relatively
@@ -634,22 +262,6 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex):
return sorted(candidates, key=self.sort_level, reverse=True)
class QuattroshapesNeighborhoodsReverseGeocoder(GeohashPolygonIndex, QuattroshapesReverseGeocoder):
persistent_polygons = False
cache_size = None
@classmethod
def create_neighborhoods_index(cls, quattroshapes_dir,
output_dir,
index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME):
local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME)
neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME)
return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename],
output_dir, index_filename=index_filename,
polys_filename=polys_filename)
class OSMReverseGeocoder(RTreePolygonIndex):
'''
OSM has among the best, most carefully-crafted, accurate administrative
@@ -855,9 +467,6 @@ if __name__ == '__main__':
parser.add_argument('-b', '--osm-building-polygons-file',
help='Path to OSM building polygons file (with dependencies, .osm format)')
parser.add_argument('-n', '--osm-neighborhoods-file',
help='Path to OSM neighborhoods file (no dependencies, .osm format)')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
help='Output directory')
@@ -871,12 +480,6 @@ if __name__ == '__main__':
index = OSMSubdivisionReverseGeocoder.create_from_osm_file(args.osm_subdivisions_file, args.out_dir)
elif args.osm_building_polygons_file:
index = OSMBuildingReverseGeocoder.create_from_osm_file(args.osm_building_polygons_file, args.out_dir)
elif args.osm_neighborhoods_file and args.quattroshapes_dir:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
args.osm_neighborhoods_file,
args.quattroshapes_dir,
args.out_dir
)
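# Example invocation (hypothetical paths; assumes the Quattroshapes flag defined
# elsewhere in this file is --quattroshapes-dir):
#   python <this script> -n neighborhoods.osm --quattroshapes-dir /data/quattroshapes -o /data/indices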
elif args.quattroshapes_dir:
index = QuattroshapesReverseGeocoder.create_with_quattroshapes(args.quattroshapes_dir, args.out_dir)
else: