[osm] Neighborhood index now uses the OSM admin R-tree to check nodes that are classified as something other than a suburb (e.g. Santa Monica is a city) but may still match one of the neighborhood data sets, so that they are excluded from the final neighborhood index

This commit is contained in:
Al
2016-07-14 17:03:23 -04:00
parent 3eae77c148
commit 99fa6eee61

View File

@@ -11,6 +11,7 @@ import sys
this_dir = os.path.realpath(os.path.dirname(__file__)) this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.address_formatting.formatter import AddressFormatter
from geodata.coordinates.conversion import latlon_to_decimal from geodata.coordinates.conversion import latlon_to_decimal
from geodata.encoding import safe_decode from geodata.encoding import safe_decode
from geodata.file_utils import ensure_dir, download_file from geodata.file_utils import ensure_dir, download_file
@@ -20,7 +21,8 @@ from geodata.names.deduping import NameDeduper
from geodata.osm.definitions import osm_definitions from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.polygons.index import * from geodata.polygons.index import *
from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder from geodata.polygons.language_polys import LanguagePolygonIndex
from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMReverseGeocoder
from geodata.statistics.tf_idf import IDFIndex from geodata.statistics.tf_idf import IDFIndex
@@ -233,7 +235,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return doc return doc
@classmethod @classmethod
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR): def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, language_rtree_dir, osm_rtree_dir, output_dir, scratch_dir=SCRATCH_DIR):
''' '''
Given an OSM file (planet or some other bounds) containing neighborhoods Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries) as points (some suburbs have boundaries)
@@ -259,6 +261,10 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
logger.info('Creating ClickThatHood neighborhoods') logger.info('Creating ClickThatHood neighborhoods')
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index() cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
language_rtree = LanguagePolygonIndex.load(language_rtree_dir)
osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
logger.info('Creating IDF index') logger.info('Creating IDF index')
idf = IDFIndex() idf = IDFIndex()
@@ -300,7 +306,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
props['type'] = id_type props['type'] = id_type
props['id'] = element_id props['id'] = element_id
possible_neighborhood = osm_definitions.meets_definition(attrs, osm_defintiions.LOCALITY) possible_neighborhood = osm_definitions.meets_definition(attrs, osm_defintions.NEIGHBORHOOD)
country, candidate_languages, language_props = language_rtree.country_and_languages(lat, lon)
component_name = None
for k, v in six.iteritems(attrs):
component_name = osm_address_components.get_component(country, k, v)
if component_name:
break
else:
component_name = None
ranks = [] ranks = []
osm_names = [] osm_names = []
@@ -313,6 +328,26 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for name_key in OSM_NAME_TAGS: for name_key in OSM_NAME_TAGS:
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))]) osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
if component_name and component_name != AddressFormatter.SUBURB:
existing_osm_candidates = osm_admin_rtree.get_candidate_polygons(lat, lon)
skip_node = False
for i in existing_osm_candidates:
props = osm_admin_rtree.get_properties(i)
containing_component = None
name = props.get('name')
# Only exact name matches here since we're comparing OSM to OSM
if name and name == attrs.get('name'):
continue
containing_component = osm_components.get_first_component(country, props)
if containing_component != AddressFormatter.SUBURB:
skip_node = True
break
# Skip this element
if skip_node:
continue
for idx in (cth, qs): for idx in (cth, qs):
candidates = idx.get_candidate_polygons(lat, lon, return_all=True) candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
@@ -439,6 +474,12 @@ if __name__ == '__main__':
parser.add_argument('-q', '--quattroshapes-dir', parser.add_argument('-q', '--quattroshapes-dir',
help='Path to quattroshapes dir') help='Path to quattroshapes dir')
parser.add_argument('-a', '--osm-admin-rtree-dir',
help='Path to OSM admin rtree dir')
parser.add_argument('-l', '--language-rtree-dir',
help='Path to language rtree dir')
parser.add_argument('-n', '--osm-neighborhoods-file', parser.add_argument('-n', '--osm-neighborhoods-file',
help='Path to OSM neighborhoods file (no dependencies, .osm format)') help='Path to OSM neighborhoods file (no dependencies, .osm format)')
@@ -449,10 +490,12 @@ if __name__ == '__main__':
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
args = parser.parse_args() args = parser.parse_args()
if args.osm_neighborhoods_file and args.quattroshapes_dir: if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.language_rtree_dir:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
args.osm_neighborhoods_file, args.osm_neighborhoods_file,
args.quattroshapes_dir, args.quattroshapes_dir,
args.language_rtree_dir,
args.osm_rtree_dir,
args.out_dir args.out_dir
) )
else: else: