From 195278cfeacf8b1312a2d8174a8abeb383261b97 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 6 Aug 2016 19:37:29 -0400 Subject: [PATCH] [osm] Reverse geocoding to metro station only for addresses in Japan --- resources/parser/data_sets/osm.yaml | 1 + scripts/geodata/osm/formatter.py | 40 ++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/resources/parser/data_sets/osm.yaml b/resources/parser/data_sets/osm.yaml index 69a9fa25..453f1f98 100644 --- a/resources/parser/data_sets/osm.yaml +++ b/resources/parser/data_sets/osm.yaml @@ -32,3 +32,4 @@ countries: combine_block_house_number_probability: 1.0 block_phrase_probability: 0.4 romaji_probability: 0.2 + add_metro_probability: 0.6 \ No newline at end of file diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index 8e2b063b..4d9abe40 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -143,7 +143,7 @@ class OSMAddressFormatter(object): boundary_component_priorities = {k: i for i, k in enumerate(AddressFormatter.BOUNDARY_COMPONENTS_ORDERED)} - def __init__(self, components, subdivisions_rtree=None, buildings_rtree=None): + def __init__(self, components, subdivisions_rtree=None, buildings_rtree=None, metro_stations_index=None): # Instance of AddressComponents, contains structures for reverse geocoding, etc. 
self.components = components self.language_rtree = components.language_rtree @@ -151,6 +151,8 @@ class OSMAddressFormatter(object): self.subdivisions_rtree = subdivisions_rtree self.buildings_rtree = buildings_rtree + self.metro_stations_index = metro_stations_index + self.config = yaml.load(open(OSM_PARSER_DATA_DEFAULT_CONFIG)) self.formatter = AddressFormatter() @@ -326,6 +328,37 @@ class OSMAddressFormatter(object): house_number = separator.join([block, house_number]) address_components['addr:housenumber'] = house_number + def add_metro_station(self, address_components, latitude, longitude, language=None, default_language=None): + ''' + Metro stations + -------------- + + Particularly in Japan, where there are rarely named streets, metro stations are + often used to help locate an address (landmarks may be used as well). Unlike in the + rest of the world, metro stations in Japan are a semi-official component and used + almost as frequently as street names or house numbers in other countries, so we would + want libpostal's address parser to recognize Japanese train stations in both Kanji and Romaji. + + It's possible at some point to extend this to generate the sorts of natural language + directions we sometimes see in NYC and other large cities where a subway stop might be + included parenthetically after the address e.g. 61 Wythe Ave (L train to Bedford). + The subway stations in OSM are in a variety of formats, so this would need some massaging + and a slightly more sophisticated phrase generator than what we employ for numeric components + like apartment numbers. 
+ ''' + nearest_metro = self.metro_stations_index.nearest_point(latitude, longitude) + if nearest_metro: + name = None + if language is not None: + name = nearest_metro.get('name:{}'.format(language.lower())) + if language == default_language: + name = nearest_metro.get('name') + else: + name = nearest_metro.get('name') + + if name: + address_components[AddressFormatter.METRO_STATION] = name + def venue_names(self, props, languages): ''' Venue names @@ -681,6 +714,11 @@ class OSMAddressFormatter(object): sub_building_tags = self.normalize_sub_building_components(tags) revised_tags.update(sub_building_tags) + # Only including nearest metro station in Japan + if country == JAPAN: + if random.random() < float(nested_get(self.config, ('countries', 'jp', 'add_metro_probability'), default=0.0)): + self.add_metro_station(revised_tags, latitude, longitude, language, default_language=JAPANESE) + num_floors = None num_basements = None zone = None