[places] Place config with dropout probabilities for admin boundaries by country, exceptions by OSM relation id
This commit is contained in:
201
resources/places/countries/global.yaml
Normal file
201
resources/places/countries/global.yaml
Normal file
@@ -0,0 +1,201 @@
|
||||
# This config specifies isolated and conditional probabilities
|
||||
# of including various admin components (applies to both addresses
|
||||
# and standalone place queries)
|
||||
|
||||
global:
|
||||
# Probability of including individual components
|
||||
components:
|
||||
suburb:
|
||||
probability: 0.2
|
||||
city_district:
|
||||
probability: 0.2
|
||||
city:
|
||||
probability: 0.85
|
||||
island:
|
||||
# Islands are usually not represented in addresses
|
||||
probability: 0.0
|
||||
state_district:
|
||||
probability: 0.05
|
||||
state:
|
||||
probability: 0.1
|
||||
country:
|
||||
probability: 0.6
|
||||
|
||||
countries:
|
||||
# Australia
|
||||
au:
|
||||
components:
|
||||
state:
|
||||
probability: 0.6
|
||||
|
||||
# Canada
|
||||
ca:
|
||||
components:
|
||||
state:
|
||||
probability: 0.7
|
||||
|
||||
# France
|
||||
fr:
|
||||
components:
|
||||
city_district:
|
||||
probability: 0.4
|
||||
|
||||
# United Kingdom
|
||||
gb:
|
||||
components:
|
||||
suburb:
|
||||
probability: 0.5
|
||||
|
||||
# Indonesia
|
||||
id:
|
||||
components:
|
||||
island:
|
||||
probability: 0.4
|
||||
|
||||
# Hong Kong
|
||||
hk:
|
||||
components:
|
||||
country:
|
||||
probability: 0.85
|
||||
state:
|
||||
probability: 0.2
|
||||
island:
|
||||
probability: 0.4
|
||||
containing:
|
||||
- id: 2278450 # Hong Kong Island (already a "state")
|
||||
type: relation
|
||||
probability: 0.0
|
||||
|
||||
# Japan
|
||||
jp:
|
||||
components:
|
||||
island:
|
||||
probability: 0.4
|
||||
suburb:
|
||||
probability: 0.6
|
||||
city_district:
|
||||
probability: 0.8
|
||||
state:
|
||||
probability: 0.8
|
||||
country:
|
||||
probability: 0.4
|
||||
|
||||
# Hungary
hu:
|
||||
components:
|
||||
country:
|
||||
probability: 0.1
|
||||
|
||||
# Saint Kitts and Nevis
|
||||
kn:
|
||||
components:
|
||||
island:
|
||||
probability: 0.8
|
||||
|
||||
# Malaysia
|
||||
my:
|
||||
components:
|
||||
island:
|
||||
probability: 0.3
|
||||
|
||||
# Pitcairn Islands
|
||||
pn:
|
||||
components:
|
||||
island:
|
||||
probability: 0.8
|
||||
|
||||
# Seychelles
|
||||
sc:
|
||||
components:
|
||||
island:
|
||||
probability: 0.8
|
||||
|
||||
# United States
|
||||
us:
|
||||
# Definitions
|
||||
kings_county: &kings_county
|
||||
id: 369518 # Kings County (Brooklyn, NY)
|
||||
type: relation
|
||||
|
||||
queens_county: &queens_county
|
||||
id: 369519 # Queens County (Queens, NY)
|
||||
type: relation
|
||||
|
||||
bronx_county: &bronx_county
|
||||
id: 2552450 # Bronx County (Bronx, NY)
|
||||
type: relation
|
||||
|
||||
richmond_county: &richmond_county
|
||||
id: 962876 # Richmond County (Staten Island, NY)
|
||||
type: relation
|
||||
|
||||
hawaii: &hawaii
|
||||
id: 166563 # State of Hawaii
|
||||
type: relation
|
||||
|
||||
components:
|
||||
suburb:
|
||||
probability: 0.4
|
||||
city_district:
|
||||
probability: 0.2
|
||||
containing:
|
||||
- <<: *kings_county
|
||||
probability: 0.85
|
||||
- <<: *queens_county
|
||||
probability: 0.85
|
||||
- <<: *bronx_county
|
||||
probability: 0.85
|
||||
- <<: *richmond_county
|
||||
probability: 0.85
|
||||
city:
|
||||
containing:
|
||||
- <<: *kings_county
|
||||
probability: 0.1
|
||||
- <<: *queens_county
|
||||
probability: 0.1
|
||||
- <<: *bronx_county
|
||||
probability: 0.1
|
||||
- <<: *richmond_county
|
||||
probability: 0.1
|
||||
island:
|
||||
order:
|
||||
direction: before
|
||||
component: state_district
|
||||
containing:
|
||||
# Island is more common in Hawaiian addresses
|
||||
- <<: *hawaii
|
||||
probability: 0.8
|
||||
state:
|
||||
probability: 0.7
|
||||
# Higher probability of including the state inside the NYC borough counties listed below, e.g. "Brooklyn, NY" (city_district, state)
|
||||
containing:
|
||||
- <<: *kings_county
|
||||
probability: 0.8
|
||||
- <<: *queens_county
|
||||
probability: 0.8
|
||||
- <<: *bronx_county
|
||||
probability: 0.8
|
||||
- <<: *richmond_county
|
||||
probability: 0.8
|
||||
|
||||
state_district:
|
||||
probability: 0.1
|
||||
country:
|
||||
probability: 0.1
|
||||
|
||||
# Tuvalu
|
||||
tv:
|
||||
components:
|
||||
island:
|
||||
probability: 0.8
|
||||
|
||||
# US Virgin Islands
|
||||
vi:
|
||||
components:
|
||||
island:
|
||||
probability: 0.8
|
||||
|
||||
# British Virgin Islands
|
||||
vg:
|
||||
components:
|
||||
island:
|
||||
probability: 0.8
|
||||
0
scripts/geodata/places/__init__.py
Normal file
0
scripts/geodata/places/__init__.py
Normal file
91
scripts/geodata/places/config.py
Normal file
91
scripts/geodata/places/config.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import copy
|
||||
import os
|
||||
import random
|
||||
import six
|
||||
import yaml
|
||||
|
||||
from collections import Mapping
|
||||
|
||||
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
||||
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
|
||||
from geodata.math.sampling import cdf, check_probability_distribution
|
||||
|
||||
from geodata.encoding import safe_encode
|
||||
|
||||
# Resolved (symlink-free) directory that contains this module.
this_dir = os.path.realpath(os.path.dirname(__file__))

# Default place config: three directory levels above this module, then
# resources/places/countries/global.yaml.
PLACE_CONFIG_FILE = os.path.join(
    this_dir,
    os.pardir,
    os.pardir,
    os.pardir,
    'resources',
    'places',
    'countries',
    'global.yaml'
)
|
||||
|
||||
|
||||
class PlaceConfig(object):
    """Probabilities of including admin components, globally and per country.

    The YAML config is expected to have a ``global`` section and an optional
    ``countries`` section keyed by country code.  For every country, a deep
    copy of the global section is combined with the country section via
    ``recursive_merge`` and stored in ``country_configs``.

    Attributes:
        cache: empty dict created at construction (not used by this class).
        global_config: the ``global`` section of the config file.
        country_configs: dict of country code -> merged config.
    """

    def __init__(self, config_file=PLACE_CONFIG_FILE):
        """Load the config from ``config_file`` and build per-country configs.

        Raises:
            IOError/OSError: if ``config_file`` cannot be opened.
            KeyError: if the config has no ``global`` section.
        """
        self.cache = {}

        # NOTE(review): the original called yaml.load(open(...)) which leaked
        # the file handle and, without an explicit Loader, is deprecated in
        # PyYAML 5.1+ and raises TypeError in PyYAML 6.  The config is plain
        # data (mappings, scalars, anchors and merge keys), which safe_load
        # supports.  Switch back to yaml.load(f, Loader=...) only if a config
        # ever needs Python-specific tags.
        with open(config_file) as f:
            place_config = yaml.safe_load(f)

        self.global_config = place_config['global']
        self.country_configs = {}

        # An empty "countries:" key parses to None, so normalise it to {}.
        countries = place_config.pop('countries', None) or {}

        for country, country_config in six.iteritems(countries):
            # Merge into a fresh copy so the shared global config is never
            # modified by a country's overrides.
            global_config_copy = copy.deepcopy(self.global_config)
            self.country_configs[country] = recursive_merge(global_config_copy, country_config)

    def get_property(self, key, country=None, default=None):
        """Look up a (possibly nested) config value.

        Args:
            key: a dotted string such as ``'components.city.probability'`` or
                a sequence of path segments.
            country: optional country code; when it has a non-empty entry in
                ``country_configs`` that config is searched, otherwise the
                global config is used.
            default: passed through to ``nested_get`` as its ``default``.

        Returns:
            Whatever ``nested_get`` returns for the chosen config and key.
        """
        if isinstance(key, six.string_types):
            key = key.split('.')

        config = self.global_config
        if country:
            # Unknown (or empty) country configs fall back to the global one.
            config = self.country_configs.get(country) or config

        return nested_get(config, key, default=default)

    def include_component(self, component, containing_ids, country=None):
        """Randomly decide whether ``component`` should be kept.

        If the component's config has a ``containing`` list, the first entry
        whose ``(type, id)`` pair is in ``containing_ids`` supplies the
        probability.  Otherwise the component's own ``probability`` is used,
        defaulting to 0.0 when the config does not define one.

        Args:
            component: component name, e.g. ``'city'`` or ``'state'``.
            containing_ids: set of ``(type, safe_encode(id))`` pairs for the
                boundaries containing the place.
            country: optional country code selecting the country config.

        Returns:
            True with the selected probability, False otherwise.
        """
        containing = self.get_property(('components', component, 'containing'), country=country, default=None)

        if containing is not None:
            for c in containing:
                # Config ids are run through safe_encode so they are compared
                # in the same form as the ids in containing_ids.
                if (c['type'], safe_encode(c['id'])) in containing_ids:
                    return random.random() < c['probability']

        probability = self.get_property(('components', component, 'probability'), country=country, default=0.0)

        return random.random() < probability

    def drop_components(self, components, boundaries=(), country=None):
        """Return a copy of ``components`` with entries randomly dropped.

        Args:
            components: dict of component name -> value.
            boundaries: iterable of dict-like objects with optional ``type``
                and ``id`` keys describing the containing boundaries.
            country: optional country code selecting the country config.

        Returns:
            A new dict holding the items of ``components`` for which
            ``include_component`` returned True.
        """
        containing_ids = {
            (boundary.get('type'), safe_encode(boundary.get('id', '')))
            for boundary in boundaries
        }

        # The original had a large block of code after this return that could
        # never run (and referenced attributes this class never defines); it
        # has been removed.
        return {c: v for c, v in six.iteritems(components)
                if self.include_component(c, containing_ids, country=country)}
|
||||
|
||||
# Module-level PlaceConfig instance, constructed at import time from the
# default config file (PLACE_CONFIG_FILE).
place_config = PlaceConfig()
|
||||
Reference in New Issue
Block a user