[addresses] Adding metro stations to AddressComponents expansion
This commit is contained in:
@@ -15,6 +15,7 @@ from geodata.addresses.config import address_config
|
|||||||
from geodata.addresses.floors import Floor
|
from geodata.addresses.floors import Floor
|
||||||
from geodata.addresses.entrances import Entrance
|
from geodata.addresses.entrances import Entrance
|
||||||
from geodata.addresses.house_numbers import HouseNumber
|
from geodata.addresses.house_numbers import HouseNumber
|
||||||
|
from geodata.addresses.metro_stations import MetroStation
|
||||||
from geodata.addresses.po_boxes import POBox
|
from geodata.addresses.po_boxes import POBox
|
||||||
from geodata.addresses.postcodes import PostCode
|
from geodata.addresses.postcodes import PostCode
|
||||||
from geodata.addresses.staircases import Staircase
|
from geodata.addresses.staircases import Staircase
|
||||||
@@ -84,8 +85,11 @@ class AddressComponents(object):
|
|||||||
iso_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries])
|
iso_alpha2_codes = set([c.alpha2.lower() for c in pycountry.countries])
|
||||||
iso_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries])
|
iso_alpha3_codes = set([c.alpha3.lower() for c in pycountry.countries])
|
||||||
|
|
||||||
|
latin_alphabet_lower = set([unichr(c) for c in xrange(ord('a'), ord('z') + 1)])
|
||||||
|
|
||||||
BOUNDARY_COMPONENTS = OrderedDict.fromkeys((
|
BOUNDARY_COMPONENTS = OrderedDict.fromkeys((
|
||||||
AddressFormatter.SUBDIVISION,
|
AddressFormatter.SUBDIVISION,
|
||||||
|
AddressFormatter.METRO_STATION,
|
||||||
AddressFormatter.SUBURB,
|
AddressFormatter.SUBURB,
|
||||||
AddressFormatter.CITY_DISTRICT,
|
AddressFormatter.CITY_DISTRICT,
|
||||||
AddressFormatter.CITY,
|
AddressFormatter.CITY,
|
||||||
@@ -95,6 +99,11 @@ class AddressComponents(object):
|
|||||||
AddressFormatter.COUNTRY,
|
AddressFormatter.COUNTRY,
|
||||||
))
|
))
|
||||||
|
|
||||||
|
LOCALITY_COMPONENTS = OrderedDict.fromkeys((
|
||||||
|
AddressFormatter.SUBDIVISION,
|
||||||
|
AddressFormatter.METRO_STATION,
|
||||||
|
))
|
||||||
|
|
||||||
NAME_COMPONENTS = {
|
NAME_COMPONENTS = {
|
||||||
AddressFormatter.ATTENTION,
|
AddressFormatter.ATTENTION,
|
||||||
AddressFormatter.CARE_OF,
|
AddressFormatter.CARE_OF,
|
||||||
@@ -134,7 +143,7 @@ class AddressComponents(object):
|
|||||||
AddressFormatter.UNIT: Unit,
|
AddressFormatter.UNIT: Unit,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames):
|
def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, metro_stations_index):
|
||||||
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
||||||
|
|
||||||
self.use_admin_center_ids = set([(r['type'], safe_encode(r['id'])) for r in nested_get(self.config, ('boundaries', 'override_with_admin_center'), default=[])])
|
self.use_admin_center_ids = set([(r['type'], safe_encode(r['id'])) for r in nested_get(self.config, ('boundaries', 'override_with_admin_center'), default=[])])
|
||||||
@@ -148,6 +157,7 @@ class AddressComponents(object):
|
|||||||
self.neighborhoods_rtree = neighborhoods_rtree
|
self.neighborhoods_rtree = neighborhoods_rtree
|
||||||
self.quattroshapes_rtree = quattroshapes_rtree
|
self.quattroshapes_rtree = quattroshapes_rtree
|
||||||
self.geonames = geonames
|
self.geonames = geonames
|
||||||
|
self.metro_stations_index = metro_stations_index
|
||||||
|
|
||||||
def setup_component_dependencies(self):
|
def setup_component_dependencies(self):
|
||||||
self.component_dependencies = defaultdict(dict)
|
self.component_dependencies = defaultdict(dict)
|
||||||
@@ -1054,7 +1064,6 @@ class AddressComponents(object):
|
|||||||
if venue_name and house_number and venue_name.strip() == house_number.strip():
|
if venue_name and house_number and venue_name.strip() == house_number.strip():
|
||||||
address_components.pop(AddressFormatter.HOUSE)
|
address_components.pop(AddressFormatter.HOUSE)
|
||||||
|
|
||||||
|
|
||||||
def cleanup_house_number(self, address_components):
|
def cleanup_house_number(self, address_components):
|
||||||
'''
|
'''
|
||||||
House number cleanup
|
House number cleanup
|
||||||
@@ -1088,12 +1097,18 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
def add_house_number_phrase(self, address_components, language, country=None):
    '''
    House number phrase
    -------------------

    Replace the house number component in place with the phrase returned
    by HouseNumber.phrase for the given language/country.

    Only house numbers that are numeric, or that consist of a single
    Latin letter (a-z, compared case-insensitively against
    latin_alphabet_lower), are considered. Anything else, including a
    missing or empty house number, is left untouched.

    :param address_components: dict of component name -> value, modified in place
    :param language: language code passed through to HouseNumber.phrase
    :param country: optional country code passed through to HouseNumber.phrase
    '''
    house_number = address_components.get(AddressFormatter.HOUSE_NUMBER, None)

    # dict.get returns None when the component is absent, so check for a
    # falsy value before calling .lower() to avoid an AttributeError.
    if not is_numeric(house_number) and (not house_number or house_number.lower() not in self.latin_alphabet_lower):
        return

    phrase = HouseNumber.phrase(house_number, language, country=country)
    # Only overwrite when a phrase was generated and it differs from the input
    if phrase and phrase != house_number:
        address_components[AddressFormatter.HOUSE_NUMBER] = phrase
|
||||||
|
|
||||||
|
def add_metro_station_phrase(self, address_components, language, country=None):
    '''
    Metro station phrase
    --------------------

    Replace the metro station component in place with the phrase returned
    by MetroStation.phrase for the given language/country. Does nothing
    when the address has no metro station component.

    :param address_components: dict of component name -> value, modified in place
    :param language: language code passed through to MetroStation.phrase
    :param country: optional country code passed through to MetroStation.phrase
    '''
    metro_station = address_components.get(AddressFormatter.METRO_STATION, None)
    if not metro_station:
        # No station on this address, nothing to expand
        return

    # Pass the station name itself. The earlier code passed house_number,
    # which is not defined in this method and raised NameError on every call.
    phrase = MetroStation.phrase(metro_station, language, country=country)
    # Only overwrite when a phrase was generated and it differs from the input
    if phrase and phrase != metro_station:
        address_components[AddressFormatter.METRO_STATION] = phrase
|
||||||
|
|
||||||
def add_postcode_phrase(self, address_components, language, country=None):
|
def add_postcode_phrase(self, address_components, language, country=None):
|
||||||
postcode = address_components.get(AddressFormatter.POSTCODE, None)
|
postcode = address_components.get(AddressFormatter.POSTCODE, None)
|
||||||
if postcode:
|
if postcode:
|
||||||
@@ -1110,6 +1125,9 @@ class AddressComponents(object):
|
|||||||
def drop_places(self, address_components):
|
def drop_places(self, address_components):
|
||||||
return {c: v for c, v in six.iteritems(address_components) if c not in place_config.ADMIN_COMPONENTS}
|
return {c: v for c, v in six.iteritems(address_components) if c not in place_config.ADMIN_COMPONENTS}
|
||||||
|
|
||||||
|
def drop_localities(self, address_components):
    '''
    Build and return a new dict holding every entry of address_components
    whose key is not one of LOCALITY_COMPONENTS. The input dict is not
    modified.
    '''
    remaining = {}
    for component, value in six.iteritems(address_components):
        if component in self.LOCALITY_COMPONENTS:
            # Locality-level component: leave it out of the result
            continue
        remaining[component] = value
    return remaining
|
||||||
|
|
||||||
def drop_postcode(self, address_components):
|
def drop_postcode(self, address_components):
|
||||||
if AddressFormatter.POSTCODE not in address_components:
|
if AddressFormatter.POSTCODE not in address_components:
|
||||||
return address_components
|
return address_components
|
||||||
@@ -1149,6 +1167,7 @@ class AddressComponents(object):
|
|||||||
drop_places_probability = po_box_config['drop_places_probability']
|
drop_places_probability = po_box_config['drop_places_probability']
|
||||||
if random.random() < drop_places_probability:
|
if random.random() < drop_places_probability:
|
||||||
address_components = self.drop_places(address_components)
|
address_components = self.drop_places(address_components)
|
||||||
|
address_components = self.drop_localities(address_components)
|
||||||
|
|
||||||
drop_postcode_probability = po_box_config['drop_postcode_probability']
|
drop_postcode_probability = po_box_config['drop_postcode_probability']
|
||||||
if random.random() < drop_postcode_probability:
|
if random.random() < drop_postcode_probability:
|
||||||
@@ -1240,6 +1259,7 @@ class AddressComponents(object):
|
|||||||
self.cleanup_boundary_names(address_components)
|
self.cleanup_boundary_names(address_components)
|
||||||
self.add_house_number_phrase(address_components, language, country=country)
|
self.add_house_number_phrase(address_components, language, country=country)
|
||||||
self.add_postcode_phrase(address_components, language, country=country)
|
self.add_postcode_phrase(address_components, language, country=country)
|
||||||
|
self.add_metro_station_phrase(address_components, language, country=country)
|
||||||
|
|
||||||
if add_sub_building_components:
|
if add_sub_building_components:
|
||||||
self.add_sub_building_components(address_components, language, country=country,
|
self.add_sub_building_components(address_components, language, country=country,
|
||||||
|
|||||||
@@ -161,11 +161,10 @@ class NumericPhrase(object):
|
|||||||
values.append(num_type)
|
values.append(num_type)
|
||||||
probs.append(prob)
|
probs.append(prob)
|
||||||
|
|
||||||
probs = cdf(probs)
|
if not probs:
|
||||||
|
|
||||||
if len(values) < 2:
|
|
||||||
num_type = cls.NUMERIC
|
num_type = cls.NUMERIC
|
||||||
else:
|
else:
|
||||||
|
probs = cdf(probs)
|
||||||
num_type = weighted_choice(values, probs)
|
num_type = weighted_choice(values, probs)
|
||||||
|
|
||||||
return num_type, phrase, phrase_props[num_type]
|
return num_type, phrase, phrase_props[num_type]
|
||||||
@@ -265,7 +264,7 @@ class NumberedComponent(object):
|
|||||||
for t, c in tokens:
|
for t, c in tokens:
|
||||||
if c == token_types.NUMERIC:
|
if c == token_types.NUMERIC:
|
||||||
has_numeric = True
|
has_numeric = True
|
||||||
if t.isalnum():
|
if any((ch.isalpha() for ch in t)):
|
||||||
has_alpha = True
|
has_alpha = True
|
||||||
|
|
||||||
if strict_numeric and has_alpha:
|
if strict_numeric and has_alpha:
|
||||||
|
|||||||
Reference in New Issue
Block a user