[addresses] Removing subdivisions/buildings rtree, moving probabilities to config
This commit is contained in:
@@ -7,7 +7,6 @@ import yaml
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from geodata.address_formatting.formatter import AddressFormatter
|
from geodata.address_formatting.formatter import AddressFormatter
|
||||||
from geodata.address_formatting.aliases import Aliases
|
|
||||||
|
|
||||||
from geodata.addresses.floors import Floor
|
from geodata.addresses.floors import Floor
|
||||||
from geodata.addresses.units import Unit
|
from geodata.addresses.units import Unit
|
||||||
@@ -80,14 +79,12 @@ class AddressExpander(object):
|
|||||||
'ISO3166-1:alpha2', 'ISO3166-1:alpha3',
|
'ISO3166-1:alpha2', 'ISO3166-1:alpha3',
|
||||||
'short_name', 'alt_name', 'official_name'])
|
'short_name', 'alt_name', 'official_name'])
|
||||||
|
|
||||||
def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames):
|
def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames):
|
||||||
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
||||||
|
|
||||||
self.osm_admin_rtree = osm_admin_rtree
|
self.osm_admin_rtree = osm_admin_rtree
|
||||||
self.language_rtree = language_rtree
|
self.language_rtree = language_rtree
|
||||||
self.neighborhoods_rtree = neighborhoods_rtree
|
self.neighborhoods_rtree = neighborhoods_rtree
|
||||||
self.subdivisions_rtree = subdivisions_rtree
|
|
||||||
self.buildings_rtree = buildings_rtree
|
|
||||||
self.quattroshapes_rtree = quattroshapes_rtree
|
self.quattroshapes_rtree = quattroshapes_rtree
|
||||||
self.geonames = geonames
|
self.geonames = geonames
|
||||||
|
|
||||||
@@ -333,7 +330,7 @@ class AddressExpander(object):
|
|||||||
return sample_random_language()
|
return sample_random_language()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def state_name(self, address_components, country, language, non_local_language=None, state_full_name_prob=0.4):
|
def state_name(self, address_components, country, language, non_local_language=None, always_use_full_names=False):
|
||||||
'''
|
'''
|
||||||
States
|
States
|
||||||
------
|
------
|
||||||
@@ -347,7 +344,9 @@ class AddressExpander(object):
|
|||||||
if address_state and country and not non_local_language:
|
if address_state and country and not non_local_language:
|
||||||
state_full_name = state_abbreviations.get_full_name(country, language, address_state)
|
state_full_name = state_abbreviations.get_full_name(country, language, address_state)
|
||||||
|
|
||||||
if state_full_name and random.random() < state_full_name_prob:
|
state_full_name_prob = float(nested_get(self.config, ('state', 'full_name_probability')))
|
||||||
|
|
||||||
|
if state_full_name and (always_use_full_names or random.random() < state_full_name_prob):
|
||||||
address_state = state_full_name
|
address_state = state_full_name
|
||||||
elif address_state and non_local_language:
|
elif address_state and non_local_language:
|
||||||
_ = address_components.pop(AddressFormatter.STATE, None)
|
_ = address_components.pop(AddressFormatter.STATE, None)
|
||||||
@@ -369,6 +368,7 @@ class AddressExpander(object):
|
|||||||
osm_suffix='',
|
osm_suffix='',
|
||||||
non_local_language=None,
|
non_local_language=None,
|
||||||
random_key=True,
|
random_key=True,
|
||||||
|
always_use_full_names=False,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
OSM boundaries
|
OSM boundaries
|
||||||
@@ -424,21 +424,24 @@ class AddressExpander(object):
|
|||||||
|
|
||||||
for component, vals in poly_components.iteritems():
|
for component, vals in poly_components.iteritems():
|
||||||
if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
|
if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
|
||||||
if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob:
|
if not always_use_full_names:
|
||||||
num = random.randrange(1, len(vals) + 1)
|
if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob:
|
||||||
val = six.u(', ').join(vals[:num])
|
num = random.randrange(1, len(vals) + 1)
|
||||||
else:
|
val = six.u(', ').join(vals[:num])
|
||||||
val = random.choice(vals)
|
elif len(vals) == 1:
|
||||||
|
val = vals[0]
|
||||||
|
else:
|
||||||
|
val = random.choice(vals)
|
||||||
|
|
||||||
if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob:
|
if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob:
|
||||||
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
||||||
|
|
||||||
address_components[component] = val
|
address_components[component] = val
|
||||||
|
|
||||||
def quattroshapes_city(self, address_components,
|
def quattroshapes_city(self, address_components,
|
||||||
latitude, longitude,
|
latitude, longitude,
|
||||||
language, non_local_language=None,
|
language, non_local_language=None,
|
||||||
abbreviated_name_prob=0.1):
|
always_use_full_names=False):
|
||||||
'''
|
'''
|
||||||
Quattroshapes/GeoNames cities
|
Quattroshapes/GeoNames cities
|
||||||
-----------------------------
|
-----------------------------
|
||||||
@@ -470,7 +473,7 @@ class AddressExpander(object):
|
|||||||
if 'abbr' not in names or non_local_language:
|
if 'abbr' not in names or non_local_language:
|
||||||
# Use the common city name in the target language
|
# Use the common city name in the target language
|
||||||
city = names[lang][0][0]
|
city = names[lang][0][0]
|
||||||
elif random.random() < abbreviated_name_prob:
|
elif not always_use_full_names and random.random() < abbreviated_name_prob:
|
||||||
# Use an abbreviation: NYC, BK, SF, etc.
|
# Use an abbreviation: NYC, BK, SF, etc.
|
||||||
city = random.choice(names['abbr'])[0]
|
city = random.choice(names['abbr'])[0]
|
||||||
|
|
||||||
@@ -548,7 +551,7 @@ class AddressExpander(object):
|
|||||||
if component not in address_components and random.random() < add_neighborhood_prob:
|
if component not in address_components and random.random() < add_neighborhood_prob:
|
||||||
address_components[component] = neighborhoods[0]
|
address_components[component] = neighborhoods[0]
|
||||||
|
|
||||||
def replace_name_affixes(self, address_components, language, replacement_prob=0.6):
|
def replace_name_affixes(self, address_components, language):
|
||||||
'''
|
'''
|
||||||
Name normalization
|
Name normalization
|
||||||
------------------
|
------------------
|
||||||
@@ -723,7 +726,7 @@ class AddressExpander(object):
|
|||||||
non_local_language = self.non_local_language()
|
non_local_language = self.non_local_language()
|
||||||
self.replace_country_name(address_components, country, non_local_language or language)
|
self.replace_country_name(address_components, country, non_local_language or language)
|
||||||
|
|
||||||
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, state_full_name_prob=1.0)
|
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, always_use_full_names=True)
|
||||||
if address_state:
|
if address_state:
|
||||||
address_components[AddressFormatter.STATE] = address_state
|
address_components[AddressFormatter.STATE] = address_state
|
||||||
|
|
||||||
@@ -743,12 +746,10 @@ class AddressExpander(object):
|
|||||||
osm_suffix=osm_suffix,
|
osm_suffix=osm_suffix,
|
||||||
non_local_language=non_local_language,
|
non_local_language=non_local_language,
|
||||||
random_key=False,
|
random_key=False,
|
||||||
alpha_3_iso_code_prob=0.0,
|
always_use_full_names=True)
|
||||||
alpha_2_iso_code_prob=0.0,
|
|
||||||
replace_with_non_local_prob=0.0,
|
|
||||||
abbreviate_state_prob=0.0)
|
|
||||||
|
|
||||||
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
|
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language,
|
||||||
|
always_use_full_names=True)
|
||||||
|
|
||||||
if city:
|
if city:
|
||||||
address_components[AddressFormatter.CITY] = city
|
address_components[AddressFormatter.CITY] = city
|
||||||
|
|||||||
Reference in New Issue
Block a user