[osm/formatting] Dropping the state component more often than not, except in the US and Canada, where that field is more commonly used
This commit is contained in:
@@ -537,7 +537,11 @@ class OSMAddressFormatter(object):
|
|||||||
AddressFormatter.CITY_DISTRICT,
|
AddressFormatter.CITY_DISTRICT,
|
||||||
AddressFormatter.STATE_DISTRICT,
|
AddressFormatter.STATE_DISTRICT,
|
||||||
AddressFormatter.STATE,
|
AddressFormatter.STATE,
|
||||||
AddressFormatter.COUNTRY,
|
}
|
||||||
|
|
||||||
|
state_important = {
|
||||||
|
'US',
|
||||||
|
'CA',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, splitter=None):
|
def __init__(self, admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, splitter=None):
|
||||||
@@ -1141,7 +1145,7 @@ class OSMAddressFormatter(object):
|
|||||||
|
|
||||||
return address_components, country, language
|
return address_components, country, language
|
||||||
|
|
||||||
def formatted_addresses(self, value, dropout_prob=0.5, rare_component_dropout_prob=0.7, tag_components=True):
|
def formatted_addresses(self, value, dropout_prob=0.5, rare_component_dropout_prob=0.6, tag_components=True):
|
||||||
'''
|
'''
|
||||||
Formatted addresses
|
Formatted addresses
|
||||||
-------------------
|
-------------------
|
||||||
@@ -1187,6 +1191,11 @@ class OSMAddressFormatter(object):
|
|||||||
if not address_components:
|
if not address_components:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
current_components = []
|
||||||
|
current_components_rare = []
|
||||||
|
|
||||||
|
state_important = country.upper() in self.state_important
|
||||||
|
|
||||||
current_components = [k for k in address_components.keys() if k not in self.rare_components]
|
current_components = [k for k in address_components.keys() if k not in self.rare_components]
|
||||||
current_components_rare = [k for k in address_components.keys() if k in self.rare_components]
|
current_components_rare = [k for k in address_components.keys() if k in self.rare_components]
|
||||||
random.shuffle(current_components)
|
random.shuffle(current_components)
|
||||||
@@ -1198,6 +1207,11 @@ class OSMAddressFormatter(object):
|
|||||||
for component in current_components:
|
for component in current_components:
|
||||||
prob = rare_component_dropout_prob if component in self.rare_components else dropout_prob
|
prob = rare_component_dropout_prob if component in self.rare_components else dropout_prob
|
||||||
|
|
||||||
|
if component not in self.rare_components or (component == AddressFormatter.STATE and state_important):
|
||||||
|
prob = dropout_prob
|
||||||
|
else:
|
||||||
|
prob = rare_component_dropout_prob
|
||||||
|
|
||||||
if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < prob:
|
if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < prob:
|
||||||
address_components.pop(component)
|
address_components.pop(component)
|
||||||
component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component]
|
component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component]
|
||||||
|
|||||||
Reference in New Issue
Block a user