[places] Place config with dropout probabilities for admin boundaries by country, exceptions by OSM relation id

This commit is contained in:
Al
2016-05-19 03:18:06 -04:00
parent 135824f667
commit 30751cc342
3 changed files with 292 additions and 0 deletions

View File

@@ -0,0 +1,201 @@
# This config specifies isolated and conditional probabilities
# of including various admin components (applies to both addresses
# and standalone place queries).
#
# Layout:
#   global.components.<component>.probability
#       Default probability of keeping that component.
#   countries.<country code>.components.<component>
#       Per-country overrides of the global values.
#   <component>.containing
#       List of boundaries (by id/type) which, when they contain the place,
#       replace the component's probability with their own.
global:
    # Probability of including individual components
    components:
        suburb:
            probability: 0.2
        city_district:
            probability: 0.2
        city:
            probability: 0.85
        island:
            # Islands are usually not represented in addresses
            probability: 0.0
        state_district:
            probability: 0.05
        state:
            probability: 0.1
        country:
            probability: 0.6

countries:
    # Australia
    au:
        components:
            state:
                probability: 0.6
    # Canada
    ca:
        components:
            state:
                probability: 0.7
    # France
    fr:
        components:
            city_district:
                probability: 0.4
    # United Kingdom
    gb:
        components:
            suburb:
                probability: 0.5
    # Indonesia
    id:
        components:
            island:
                probability: 0.4
    # Hong Kong
    hk:
        components:
            country:
                probability: 0.85
            state:
                probability: 0.2
            island:
                probability: 0.4
                containing:
                    - id: 2278450 # Hong Kong Island (already a "state")
                      type: relation
                      probability: 0.0
    # Japan
    jp:
        components:
            island:
                probability: 0.4
            suburb:
                probability: 0.6
            city_district:
                probability: 0.8
            state:
                probability: 0.8
            country:
                probability: 0.4
    # Hungary
    hu:
        components:
            country:
                probability: 0.1
    # Saint Kitts and Nevis
    kn:
        components:
            island:
                probability: 0.8
    # Malaysia
    my:
        components:
            island:
                probability: 0.3
    # Pitcairn Islands
    pn:
        components:
            island:
                probability: 0.8
    # Seychelles
    sc:
        components:
            island:
                probability: 0.8
    # United States
    us:
        # Definitions (anchors reused below through YAML merge keys)
        kings_county: &kings_county
            id: 369518 # Kings County (Brooklyn, NY)
            type: relation
        queens_county: &queens_county
            id: 369519 # Queens County (Queens, NY)
            type: relation
        bronx_county: &bronx_county
            id: 2552450 # Bronx County (Bronx, NY)
            type: relation
        richmond_county: &richmond_county
            id: 962876 # Richmond County (Staten Island, NY)
            type: relation
        hawaii: &hawaii
            id: 166563 # State of Hawaii
            type: relation
        components:
            suburb:
                probability: 0.4
            city_district:
                probability: 0.2
                containing:
                    - <<: *kings_county
                      probability: 0.85
                    - <<: *queens_county
                      probability: 0.85
                    - <<: *bronx_county
                      probability: 0.85
                    - <<: *richmond_county
                      probability: 0.85
            city:
                containing:
                    - <<: *kings_county
                      probability: 0.1
                    - <<: *queens_county
                      probability: 0.1
                    - <<: *bronx_county
                      probability: 0.1
                    - <<: *richmond_county
                      probability: 0.1
            island:
                order:
                    direction: before
                    component: state_district
                containing:
                    # Island is more common in Hawaiian addresses
                    - <<: *hawaii
                      probability: 0.8
            state:
                probability: 0.7
                # Higher probability of the "Brooklyn, NY" form
                # (city_district followed by state) in the NYC boroughs
                containing:
                    - <<: *kings_county
                      probability: 0.8
                    - <<: *queens_county
                      probability: 0.8
                    - <<: *bronx_county
                      probability: 0.8
                    - <<: *richmond_county
                      probability: 0.8
            state_district:
                probability: 0.1
            country:
                probability: 0.1
    # Tuvalu
    tv:
        components:
            island:
                probability: 0.8
    # US Virgin Islands
    vi:
        components:
            island:
                probability: 0.8
    # British Virgin Islands
    vg:
        components:
            island:
                probability: 0.8

View File

View File

@@ -0,0 +1,91 @@
import copy
import os
import random
import six
import yaml
from collections import Mapping
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
from geodata.math.sampling import cdf, check_probability_distribution
from geodata.encoding import safe_encode
# Directory that contains this module, with symlinks resolved.
this_dir = os.path.realpath(os.path.dirname(__file__))

# Default place config: three directory levels above this module, then
# resources/places/countries/global.yaml.
PLACE_CONFIG_FILE = os.path.join(
    this_dir,
    *([os.pardir] * 3 + ['resources', 'places', 'countries', 'global.yaml'])
)
class PlaceConfig(object):
    """Inclusion probabilities for admin components, globally and per country.

    The YAML config has a top-level ``global`` section and an optional
    ``countries`` section keyed by country code. Each country section is
    merged into a deep copy of the global section with ``recursive_merge``,
    so lookups for a configured country read from a single merged dict.
    """

    def __init__(self, config_file=PLACE_CONFIG_FILE):
        """Load the config and precompute one merged config per country.

        :param config_file: path to the YAML config file.
        :raises KeyError: if the file has no top-level ``global`` key.
        """
        self.cache = {}

        # Use a context manager so the file handle is closed promptly.
        # safe_load handles everything this config uses (plain scalars,
        # anchors/aliases and "<<" merge keys) and, unlike yaml.load without
        # an explicit Loader, is accepted by every PyYAML version.
        with open(config_file) as f:
            place_config = yaml.safe_load(f)

        self.global_config = place_config['global']
        self.country_configs = {}

        # An empty "countries:" key parses to None; treat it as no countries.
        countries = place_config.pop('countries', None) or {}

        for country, country_config in six.iteritems(countries):
            # Merge into a copy so the shared global config is never mutated.
            global_config_copy = copy.deepcopy(self.global_config)
            self.country_configs[country] = recursive_merge(global_config_copy, country_config)

    def get_property(self, key, country=None, default=None):
        """Look up a nested config value.

        :param key: sequence of keys, or a dot-separated string such as
            ``'components.city.probability'``.
        :param country: country code; when it has a non-empty config, that
            config is used, otherwise the global config is used.
        :param default: passed through to ``nested_get`` as its ``default``.
        :return: whatever ``nested_get`` returns for the selected config.
        """
        if isinstance(key, six.string_types):
            key = key.split('.')

        config = self.global_config
        if country:
            country_config = self.country_configs.get(country, {})
            if country_config:
                config = country_config

        return nested_get(config, key, default=default)

    def include_component(self, component, containing_ids, country=None):
        """Randomly decide whether ``component`` should be kept.

        If the component has a ``containing`` list and one of its entries
        matches a ``(type, id)`` pair in ``containing_ids``, the first such
        entry's probability is used. Otherwise the component's own
        ``probability`` is used, defaulting to 0.0 when it is not configured.

        :param component: component name, e.g. ``'city'``.
        :param containing_ids: collection of ``(type, encoded id)`` tuples
            for the boundaries containing the place.
        :param country: optional country code selecting the config.
        :return: True if a fresh ``random.random()`` draw is below the
            chosen probability.
        """
        containing = self.get_property(('components', component, 'containing'), country=country, default=None)

        if containing is not None:
            for c in containing:
                # Ids are compared in their encoded form, matching how
                # drop_components builds containing_ids.
                if (c['type'], safe_encode(c['id'])) in containing_ids:
                    return random.random() < c['probability']

        probability = self.get_property(('components', component, 'probability'), country=country, default=0.0)

        return random.random() < probability

    def drop_components(self, components, boundaries=(), country=None):
        """Return a copy of ``components`` with some entries randomly dropped.

        :param components: mapping of component name to value.
        :param boundaries: iterable of dict-like objects providing ``'type'``
            and ``'id'`` for the boundaries containing the place.
        :param country: optional country code selecting the config.
        :return: new dict holding only the entries for which
            ``include_component`` returned True.
        """
        containing_ids = {
            (boundary.get('type'), safe_encode(boundary.get('id', '')))
            for boundary in boundaries
        }

        return {c: v for c, v in six.iteritems(components)
                if self.include_component(c, containing_ids, country=country)}
# Shared module-level instance, built from the default config file
# (PLACE_CONFIG_FILE) when this module is imported.
place_config = PlaceConfig()