From 1bf32c73203c776e81d53a4ad9e57bd7793c1529 Mon Sep 17 00:00:00 2001
From: Al
Date: Sat, 7 May 2016 22:32:05 -0400
Subject: [PATCH] [boundaries] Config for boundary name changes (Kings County is a state_district but Brooklyn should not be used for that context) and omissions (usually we add islands as address components, but not e.g. Manhattan Island)

---
Review note: names.py was revised after this patch was generated, so the
blob id on its "index" line no longer matches the content.

 resources/boundaries/names/global.yaml |  51 +++++++++++
 scripts/geodata/boundaries/__init__.py |   0
 scripts/geodata/boundaries/names.py    | 120 +++++++++++++++++++++++++
 3 files changed, 171 insertions(+)
 create mode 100644 resources/boundaries/names/global.yaml
 create mode 100644 scripts/geodata/boundaries/__init__.py
 create mode 100644 scripts/geodata/boundaries/names.py

diff --git a/resources/boundaries/names/global.yaml b/resources/boundaries/names/global.yaml
new file mode 100644
index 00000000..58096496
--- /dev/null
+++ b/resources/boundaries/names/global.yaml
@@ -0,0 +1,51 @@
+names:
+    keys:
+        default: name
+        probability: 0.75
+        alternatives:
+            - alternative: short_name # e.g. NYC
+              probability: 0.12
+            - alternative: alt_name # e.g. New York (instead of New York City)
+              probability: 0.12
+            - alternative: official_name # e.g. United Kingdom of Great Britain and Northern Ireland
+              probability: 0.01
+
+
+    # This section overrides place names
+    exceptions:
+        # Boroughs of New York City
+        - id: 2552485 # New York County (don't use Manhattan)
+          type: relation
+          default: New York County
+          probability: 1.0
+        - id: 369518 # Kings County (don't use Brooklyn)
+          type: relation
+          default: Kings County
+          probability: 1.0
+        - id: 369519 # Queens County (don't use Queens)
+          type: relation
+          default: Queens County
+          probability: 1.0
+        - id: 2552450 # Bronx County (don't use The Bronx)
+          type: relation
+          default: Bronx County
+          probability: 1.0
+        - id: 962876 # Richmond County (don't use Staten Island)
+          type: relation
+          default: Richmond County
+          probability: 1.0
+
+    omissions:
+        - id: 3954665 # Manhattan Island
+          type: relation
+          omit:
+              conditions:
+                  - id: 175905 # NYC (always true)
+                    type: relation
+        - id: 3955977 # Long Island
+          type: relation
+          include_probability: 0.1
+          omit:
+              conditions:
+                  - id: 175905 # NYC
+                    type: relation
diff --git a/scripts/geodata/boundaries/__init__.py b/scripts/geodata/boundaries/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/boundaries/names.py b/scripts/geodata/boundaries/names.py
new file mode 100644
index 00000000..937f1930
--- /dev/null
+++ b/scripts/geodata/boundaries/names.py
@@ -0,0 +1,120 @@
+import os
+import random
+import yaml
+
+from collections import defaultdict
+
+from geodata.configs.utils import nested_get, DoesNotExist, alternative_probabilities
+from geodata.math.sampling import cdf, weighted_choice
+
+from geodata.encoding import safe_encode
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+
+BOUNDARY_NAMES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                  'resources', 'boundaries', 'names')
+
+BOUNDARY_NAMES_CONFIG = os.path.join(BOUNDARY_NAMES_DIR, 'global.yaml')
+
+
+class BoundaryNames(object):
+    """Name-key selection and component omission rules for boundaries.
+
+    Rules are read from a YAML config. Objects are identified by
+    (type, id) pairs, with every id passed through safe_encode.
+    """
+
+    DEFAULT_NAME_KEY = 'name'
+
+    def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
+        """Load default name keys, exceptions and omissions from config_file."""
+        # safe_load: the config is plain data, and yaml.load() without a
+        # Loader is rejected by PyYAML >= 6; "with" closes the file handle
+        with open(config_file) as f:
+            config = yaml.safe_load(f)
+
+        default_names = nested_get(config, ('names', 'keys'))
+        name_keys, probs = alternative_probabilities(default_names)
+
+        self.name_keys = name_keys
+        self.name_key_probs = cdf(probs)
+
+        # (type, id) => (name keys, cdf of their probabilities)
+        self.exceptions = {}
+
+        for props in nested_get(config, ('names', 'exceptions'), default=[]):
+            object_type = props['type']
+            object_id = safe_encode(props['id'])
+            keys = [props['default']]
+            probs = [props['probability']]
+            for alt in props.get('alternatives', []):
+                keys.append(alt['alternative'])
+                probs.append(alt['probability'])
+
+            probs = cdf(probs)
+            self.exceptions[(object_type, object_id)] = (keys, probs)
+
+        # (type, id) => include_probability from the config, as a float
+        self.include_probabilities = {}
+        # (type, id) => set of (type, id) pairs whose presence omits it
+        self.omit_conditions = defaultdict(set)
+
+        for props in nested_get(config, ('names', 'omissions'), default=[]):
+            object_type = props['type']
+            object_id = safe_encode(props['id'])
+            include_probability = props.get('include_probability')
+
+            if include_probability is not None:
+                self.include_probabilities[(object_type, object_id)] = float(include_probability)
+
+            for condition in nested_get(props, ('omit', 'conditions'), default=[]):
+                condition_object_id = safe_encode(condition['id'])
+                condition_object_type = condition['type']
+                self.omit_conditions[(object_type, object_id)].add((condition_object_type, condition_object_id))
+
+    def name_key(self, props):
+        """Pick the name key to use for the object described by props.
+
+        Objects listed under exceptions choose among their own keys via
+        weighted_choice; all others choose among the default keys.
+        """
+        object_type = props.get('type')
+        object_id = safe_encode(props.get('id', ''))
+
+        if (object_type, object_id) in self.exceptions:
+            values, probs = self.exceptions[(object_type, object_id)]
+            return weighted_choice(values, probs)
+
+        return weighted_choice(self.name_keys, self.name_key_probs)
+
+    def remove_excluded_components(self, components):
+        """Return components minus those the omission rules exclude.
+
+        A component is dropped when one of its omit conditions is among
+        the components still kept, or when it has an include probability
+        and random.random() exceeds it. The input list itself is returned
+        when nothing is dropped.
+        """
+        all_ids = set()
+        for component in components:
+            object_type = component.get('type')
+            object_id = safe_encode(component.get('id', ''))
+            all_ids.add((object_type, object_id))
+
+        for key in list(all_ids):
+            conditions = self.omit_conditions.get(key)
+            if conditions and all_ids & conditions:
+                all_ids.remove(key)
+                # already dropped; a second remove() would raise KeyError
+                continue
+
+            include_probability = self.include_probabilities.get(key)
+            if include_probability is not None and random.random() > include_probability:
+                all_ids.remove(key)
+
+        if len(all_ids) == len(components):
+            return components
+
+        return [c for c in components if (c.get('type'), safe_encode(c.get('id', ''))) in all_ids]
+
+boundary_names = BoundaryNames()