From 30751cc342087d6b7551ecb468b164d8cfe9f7ec Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 19 May 2016 03:18:06 -0400 Subject: [PATCH] [places] Place config with dropout probabilities for admin boundaries by country, exceptions by OSM relation id --- resources/places/countries/global.yaml | 201 +++++++++++++++++++++++++ scripts/geodata/places/__init__.py | 0 scripts/geodata/places/config.py | 91 +++++++++++ 3 files changed, 292 insertions(+) create mode 100644 resources/places/countries/global.yaml create mode 100644 scripts/geodata/places/__init__.py create mode 100644 scripts/geodata/places/config.py diff --git a/resources/places/countries/global.yaml b/resources/places/countries/global.yaml new file mode 100644 index 00000000..18a55cad --- /dev/null +++ b/resources/places/countries/global.yaml @@ -0,0 +1,201 @@ +# This config specifies isolated and conditional probabilities +# of including various admin components (applies to both addresses +# and standalone place queries) + +global: + # Probability of including individual components + components: + suburb: + probability: 0.2 + city_district: + probability: 0.2 + city: + probability: 0.85 + island: + # Islands are usually not represented in addresses + probability: 0.0 + state_district: + probability: 0.05 + state: + probability: 0.1 + country: + probability: 0.6 + +countries: + # Australia + au: + components: + state: + probability: 0.6 + + # Canada + ca: + components: + state: + probability: 0.7 + + # France + fr: + components: + city_district: + probability: 0.4 + + # United Kingdom + gb: + components: + suburb: + probability: 0.5 + + # Indonesia + id: + components: + island: + probability: 0.4 + + # Hong Kong + hk: + components: + country: + probability: 0.85 + state: + probability: 0.2 + island: + probability: 0.4 + containing: + - id: 2278450 # Hong Kong Island (already a "state") + type: relation + probability: 0.0 + + # Japan + jp: + components: + island: + probability: 0.4 + suburb: + probability: 0.6 + 
city_district: + probability: 0.8 + state: + probability: 0.8 + country: + probability: 0.4 + + hu: + components: + country: + probability: 0.1 + + # Saint Kitts and Nevis + kn: + components: + island: + probability: 0.8 + + # Malaysia + my: + components: + island: + probability: 0.3 + + # Pitcairn Islands + pn: + components: + island: + probability: 0.8 + + # Seychelles + sc: + components: + island: + probability: 0.8 + + # United States + us: + # Definitions + kings_county: &kings_county + id: 369518 # Kings County (Brooklyn, NY) + type: relation + + queens_county: &queens_county + id: 369519 # Queens County (Queens, NY) + type: relation + + bronx_county: &bronx_county + id: 2552450 # Bronx County (Bronx, NY) + type: relation + + richmond_county: &richmond_county + id: 962876 # Richmond County (Staten Island, NY) + type: relation + + hawaii: &hawaii + id: 166563 # State of Hawaii + type: relation + + components: + suburb: + probability: 0.4 + city_district: + probability: 0.2 + containing: + - <<: *kings_county + probability: 0.85 + - <<: *queens_county + probability: 0.85 + - <<: *bronx_county + probability: 0.85 + - <<: *richmond_county + probability: 0.85 + city: + containing: + - <<: *kings_county + probability: 0.1 + - <<: *queens_county + probability: 0.1 + - <<: *bronx_county + probability: 0.1 + - <<: *richmond_county + probability: 0.1 + island: + order: + direction: before + component: state_district + containing: + # Island is more common in Hawaiian addresses + - <<: *hawaii + probability: 0.8 + state: + probability: 0.7 + # Higher probability of Brooklyn/city_district NY/state + containing: + - <<: *kings_county + probability: 0.8 + - <<: *queens_county + probability: 0.8 + - <<: *bronx_county + probability: 0.8 + - <<: *richmond_county + probability: 0.8 + + state_district: + probability: 0.1 + country: + probability: 0.1 + + # Tuvalu + tv: + components: + island: + probability: 0.8 + + # US Virgin Islands + vi: + components: + island: + 
import copy
import os
import random
import six
import yaml

try:
    # ``Mapping`` lives in ``collections.abc`` on Python 3; the alias in
    # ``collections`` was removed in Python 3.10.
    from collections.abc import Mapping
except ImportError:  # Python 2
    from collections import Mapping

from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
from geodata.math.sampling import cdf, check_probability_distribution

from geodata.encoding import safe_encode

this_dir = os.path.realpath(os.path.dirname(__file__))

PLACE_CONFIG_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                 'resources', 'places', 'countries', 'global.yaml')


class PlaceConfig(object):
    """Inclusion probabilities for admin components, per country.

    The YAML config has a ``global`` section and an optional ``countries``
    section keyed by country code. Each country's section is merged over a
    deep copy of the global section, so a country only needs to list the
    values it overrides.
    """

    def __init__(self, config_file=PLACE_CONFIG_FILE):
        """Load ``config_file`` and build the per-country configs.

        :param config_file: path to the YAML config; defaults to the
            ``global.yaml`` resource located relative to this module.
        :raises KeyError: if the config has no ``global`` section.
        """
        self.cache = {}

        # safe_load handles the anchors and "<<" merge keys used by the
        # config, does not construct arbitrary Python objects, and (unlike
        # a bare yaml.load) does not need an explicit Loader argument on
        # newer PyYAML releases. The with-block closes the file handle.
        with open(config_file) as f:
            place_config = yaml.safe_load(f)

        self.global_config = place_config['global']
        self.country_configs = {}

        # An empty "countries:" key parses to None; treat it as no overrides.
        countries = place_config.pop('countries', None) or {}

        for country, country_config in six.iteritems(countries):
            # Merge into a fresh copy so the shared global config is not
            # handed to recursive_merge directly.
            global_config_copy = copy.deepcopy(self.global_config)
            self.country_configs[country] = recursive_merge(global_config_copy, country_config)

    def get_property(self, key, country=None, default=None):
        """Look up a config value, preferring the country's config.

        :param key: a dotted string (``'components.city.probability'``) or
            a sequence of path segments.
        :param country: country key as used in the ``countries`` section;
            when missing or unknown the global config is consulted.
        :param default: passed through to ``nested_get`` as its ``default``.
        :return: whatever ``nested_get`` returns for the selected config.
        """
        if isinstance(key, six.string_types):
            key = key.split('.')

        config = self.global_config
        if country:
            # Unknown countries (or empty configs) fall back to global.
            config = self.country_configs.get(country) or config

        return nested_get(config, key, default=default)

    def include_component(self, component, containing_ids, country=None):
        """Randomly decide whether ``component`` should be included.

        If the component's config has a ``containing`` list and one of its
        entries matches a ``(type, id)`` pair in ``containing_ids``, that
        entry's probability is used (first match wins). Otherwise the
        component's own ``probability`` is used, defaulting to 0.0.

        :param component: component name, e.g. ``'city'`` or ``'state'``.
        :param containing_ids: collection of ``(type, id)`` tuples, with
            ids already passed through ``safe_encode``.
        :param country: optional country key for country-specific values.
        :return: True if the component should be kept.
        """
        containing = self.get_property(('components', component, 'containing'), country=country, default=None)

        # "containing" may be absent or explicitly null in the config.
        for c in containing or ():
            if (c['type'], safe_encode(c['id'])) in containing_ids:
                return random.random() < c['probability']

        probability = self.get_property(('components', component, 'probability'), country=country, default=0.0)

        return random.random() < probability

    def drop_components(self, components, boundaries=(), country=None):
        """Return the subset of ``components`` that survives random dropout.

        :param components: dict keyed by component name.
        :param boundaries: iterable of dicts with ``type`` and ``id`` keys
            describing the boundaries that contain the place; they are
            matched against the ``containing`` exceptions in the config.
        :param country: optional country key for country-specific values.
        :return: a new dict with only the components that were kept.
        """
        containing_ids = set()
        for boundary in boundaries:
            object_type = boundary.get('type')
            object_id = safe_encode(boundary.get('id', ''))
            containing_ids.add((object_type, object_id))

        return {c: v for c, v in six.iteritems(components)
                if self.include_component(c, containing_ids, country=country)}


place_config = PlaceConfig()