[osm/formatting] Changing drop probabilities and dropping address components in random order
This commit is contained in:
@@ -116,7 +116,9 @@ OSM_ADDRESS_COMPONENTS = OrderedDict.fromkeys([
|
|||||||
AddressComponent(AddressFormatter.HOUSE_NUMBER, dependencies=(AddressFormatter.ROAD,)),
|
AddressComponent(AddressFormatter.HOUSE_NUMBER, dependencies=(AddressFormatter.ROAD,)),
|
||||||
AddressComponent(AddressFormatter.SUBURB, dependencies=(AddressFormatter.CITY, AddressFormatter.STATE,
|
AddressComponent(AddressFormatter.SUBURB, dependencies=(AddressFormatter.CITY, AddressFormatter.STATE,
|
||||||
AddressFormatter.POSTCODE)),
|
AddressFormatter.POSTCODE)),
|
||||||
|
AddressComponent(AddressFormatter.CITY_DISTRICT, dependencies=(AddressFormatter.CITY,)),
|
||||||
AddressComponent(AddressFormatter.CITY),
|
AddressComponent(AddressFormatter.CITY),
|
||||||
|
AddressComponent(AddressFormatter.STATE_DISTRICT, dependencies=(AddressFormatter.STATE, AddressFormatter.POSTCODE)),
|
||||||
AddressComponent(AddressFormatter.STATE, dependencies=(AddressFormatter.SUBURB, AddressFormatter.CITY,
|
AddressComponent(AddressFormatter.STATE, dependencies=(AddressFormatter.SUBURB, AddressFormatter.CITY,
|
||||||
AddressFormatter.POSTCODE, AddressFormatter.COUNTRY)),
|
AddressFormatter.POSTCODE, AddressFormatter.COUNTRY)),
|
||||||
AddressComponent(AddressFormatter.POSTCODE),
|
AddressComponent(AddressFormatter.POSTCODE),
|
||||||
@@ -358,7 +360,7 @@ DROP_PROBABILITIES = {
|
|||||||
AddressFormatter.CITY_DISTRICT: 0.8,
|
AddressFormatter.CITY_DISTRICT: 0.8,
|
||||||
AddressFormatter.CITY: 0.6,
|
AddressFormatter.CITY: 0.6,
|
||||||
AddressFormatter.STATE_DISTRICT: 0.8,
|
AddressFormatter.STATE_DISTRICT: 0.8,
|
||||||
AddressFormatter.STATE: 0.8,
|
AddressFormatter.STATE: 0.7,
|
||||||
AddressFormatter.POSTCODE: 0.7,
|
AddressFormatter.POSTCODE: 0.7,
|
||||||
AddressFormatter.COUNTRY: 0.8
|
AddressFormatter.COUNTRY: 0.8
|
||||||
}
|
}
|
||||||
@@ -666,12 +668,15 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
|
|||||||
if not address_components:
|
if not address_components:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
current_components = component_bitset(address_components.keys())
|
current_components = address_components.keys()
|
||||||
|
random.shuffle(current_components)
|
||||||
|
|
||||||
for component in address_components.keys():
|
component_set = component_bitset(address_components.keys())
|
||||||
if current_components ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < DROP_PROBABILITIES[component]:
|
|
||||||
|
for component in current_components:
|
||||||
|
if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < DROP_PROBABILITIES[component]:
|
||||||
address_components.pop(component)
|
address_components.pop(component)
|
||||||
current_components ^= OSM_ADDRESS_COMPONENT_VALUES[component]
|
component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component]
|
||||||
if not address_components:
|
if not address_components:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user