From 6ef54bcc6fcabd755349f1d2d4891c806f7c7484 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 6 Aug 2016 19:36:57 -0400 Subject: [PATCH] [addresses] Adding metro stations to AddressComponents expansion --- scripts/geodata/addresses/components.py | 26 ++++++++++++++++++++++--- scripts/geodata/addresses/numbering.py | 7 +++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 80b1777a..a102e16a 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -15,6 +15,7 @@ from geodata.addresses.config import address_config from geodata.addresses.floors import Floor from geodata.addresses.entrances import Entrance from geodata.addresses.house_numbers import HouseNumber +from geodata.addresses.metro_stations import MetroStation from geodata.addresses.po_boxes import POBox from geodata.addresses.postcodes import PostCode from geodata.addresses.staircases import Staircase @@ -84,8 +85,11 @@ class AddressComponents(object): iso_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries]) iso_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries]) + latin_alphabet_lower = set([unichr(c) for c in xrange(ord('a'), ord('z') + 1)]) + BOUNDARY_COMPONENTS = OrderedDict.fromkeys(( AddressFormatter.SUBDIVISION, + AddressFormatter.METRO_STATION, AddressFormatter.SUBURB, AddressFormatter.CITY_DISTRICT, AddressFormatter.CITY, @@ -95,6 +99,11 @@ class AddressComponents(object): AddressFormatter.COUNTRY, )) + LOCALITY_COMPONENTS = OrderedDict.fromkeys(( + AddressFormatter.SUBDIVISION, + AddressFormatter.METRO_STATION, + )) + NAME_COMPONENTS = { AddressFormatter.ATTENTION, AddressFormatter.CARE_OF, @@ -134,7 +143,7 @@ class AddressComponents(object): AddressFormatter.UNIT: Unit, } - def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames): + def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, metro_stations_index): self.config = yaml.load(open(PARSER_DEFAULT_CONFIG)) self.use_admin_center_ids = set([(r['type'], safe_encode(r['id'])) for r in nested_get(self.config, ('boundaries', 'override_with_admin_center'), default=[])]) @@ -148,6 +157,7 @@ class AddressComponents(object): self.neighborhoods_rtree = neighborhoods_rtree self.quattroshapes_rtree = quattroshapes_rtree self.geonames = geonames + self.metro_stations_index = metro_stations_index def setup_component_dependencies(self): self.component_dependencies = defaultdict(dict) @@ -1054,7 +1064,6 @@ class AddressComponents(object): if venue_name and house_number and venue_name.strip() == house_number.strip(): address_components.pop(AddressFormatter.HOUSE) - def cleanup_house_number(self, address_components): ''' House number cleanup @@ -1088,12 +1097,18 @@ class AddressComponents(object): def add_house_number_phrase(self, address_components, language, country=None): house_number = address_components.get(AddressFormatter.HOUSE_NUMBER, None) - if not is_numeric(house_number): + if not is_numeric(house_number) and house_number.lower() not in self.latin_alphabet_lower: return phrase = HouseNumber.phrase(house_number, language, country=country) if phrase and phrase != house_number: address_components[AddressFormatter.HOUSE_NUMBER] = phrase + def add_metro_station_phrase(self, address_components, language, country=None): + metro_station = address_components.get(AddressFormatter.METRO_STATION, None) + phrase = MetroStation.phrase(house_number, language, country=country) + if phrase and phrase != metro_station: + address_components[AddressFormatter.METRO_STATION] = phrase + def add_postcode_phrase(self, address_components, language, country=None): postcode = address_components.get(AddressFormatter.POSTCODE, None) if postcode: @@ -1110,6 +1125,9 @@ class AddressComponents(object): def drop_places(self, address_components): return {c: v for c, v in six.iteritems(address_components) if c not in place_config.ADMIN_COMPONENTS} + def drop_localities(self, address_components): + return {c: v for c, v in six.iteritems(address_components) if c not in self.LOCALITY_COMPONENTS} + def drop_postcode(self, address_components): if AddressFormatter.POSTCODE not in address_components: return address_components @@ -1149,6 +1167,7 @@ class AddressComponents(object): drop_places_probability = po_box_config['drop_places_probability'] if random.random() < drop_places_probability: address_components = self.drop_places(address_components) + address_components = self.drop_localities(address_components) drop_postcode_probability = po_box_config['drop_postcode_probability'] if random.random() < drop_postcode_probability: @@ -1240,6 +1259,7 @@ class AddressComponents(object): self.cleanup_boundary_names(address_components) self.add_house_number_phrase(address_components, language, country=country) self.add_postcode_phrase(address_components, language, country=country) + self.add_metro_station_phrase(address_components, language, country=country) if add_sub_building_components: self.add_sub_building_components(address_components, language, country=country, diff --git a/scripts/geodata/addresses/numbering.py b/scripts/geodata/addresses/numbering.py index c5b7cf66..f28aa683 100644 --- a/scripts/geodata/addresses/numbering.py +++ b/scripts/geodata/addresses/numbering.py @@ -161,11 +161,10 @@ class NumericPhrase(object): values.append(num_type) probs.append(prob) - probs = cdf(probs) - - if len(values) < 2: + if not probs: num_type = cls.NUMERIC else: + probs = cdf(probs) num_type = weighted_choice(values, probs) return num_type, phrase, phrase_props[num_type] @@ -265,7 +264,7 @@ class NumberedComponent(object): for t, c in tokens: if c == token_types.NUMERIC: has_numeric = True - if t.isalnum(): + if any((ch.isalpha() for ch in t)): has_alpha = True if strict_numeric and has_alpha: