From 1bf32c73203c776e81d53a4ad9e57bd7793c1529 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sat, 7 May 2016 22:32:05 -0400
Subject: [PATCH] [boundaries] Config for boundary name changes (Kings County
 is a state_district but Brooklyn should not be used for that context) and
 omissions (usually we add islands as address components, but not e.g.
 Manhattan Island)

---
 resources/boundaries/names/global.yaml | 51 ++++++++++++++
 scripts/geodata/boundaries/__init__.py |  0
 scripts/geodata/boundaries/names.py    | 92 ++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 resources/boundaries/names/global.yaml
 create mode 100644 scripts/geodata/boundaries/__init__.py
 create mode 100644 scripts/geodata/boundaries/names.py

diff --git a/resources/boundaries/names/global.yaml b/resources/boundaries/names/global.yaml
new file mode 100644
index 00000000..58096496
--- /dev/null
+++ b/resources/boundaries/names/global.yaml
@@ -0,0 +1,51 @@
+names:
+    keys:
+        default: name
+        probability: 0.75
+        alternatives:
+            - alternative: short_name # e.g. NYC
+              probability: 0.12
+            - alternative: alt_name # e.g. New York (instead of New York City)
+              probability: 0.12
+            - alternative: official_name # e.g. United Kingdom of Great Britain and Northern Ireland
+              probability: 0.01
+
+
+    # This section overrides place names
+    exceptions:
+        # Boroughs of New York City
+        - id: 2552485 # New York County (don't use Manhattan)
+          type: relation
+          default: New York County
+          probability: 1.0
+        - id: 369518 # Kings County (don't use Brooklyn)
+          type: relation
+          default: Kings County
+          probability: 1.0
+        - id: 369519 # Queens County (don't use Queens)
+          type: relation
+          default: Queens County
+          probability: 1.0
+        - id: 2552450 # Bronx County (don't use The Bronx)
+          type: relation
+          default: Bronx County
+          probability: 1.0
+        - id: 962876 # Richmond County (don't use Staten Island)
+          type: relation
+          default: Richmond County
+          probability: 1.0
+
+    omissions:
+        - id: 3954665 # Manhattan Island
+          type: relation
+          omit:
+              conditions:
+                  - id: 175905 # NYC (always true)
+                    type: relation
+        - id: 3955977 # Long Island
+          type: relation
+          include_probability: 0.1
+          omit:
+              conditions:
+                  - id: 175905 # NYC
+                    type: relation
diff --git a/scripts/geodata/boundaries/__init__.py b/scripts/geodata/boundaries/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/boundaries/names.py b/scripts/geodata/boundaries/names.py
new file mode 100644
index 00000000..937f1930
--- /dev/null
+++ b/scripts/geodata/boundaries/names.py
@@ -0,0 +1,127 @@
+import os
+import random
+import yaml
+
+from collections import defaultdict
+
+from geodata.configs.utils import nested_get, DoesNotExist, alternative_probabilities
+from geodata.math.sampling import cdf, weighted_choice
+
+from geodata.encoding import safe_encode
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+
+BOUNDARY_NAMES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                  'resources', 'boundaries', 'names')
+
+BOUNDARY_NAMES_CONFIG = os.path.join(BOUNDARY_NAMES_DIR, 'global.yaml')
+
+
+class BoundaryNames(object):
+    """Boundary name-selection and omission rules loaded from a YAML config.
+
+    Boundaries are keyed by ``(type, id)`` tuples, with the id passed through
+    ``safe_encode`` both when loading the config and when looking it up.
+    """
+
+    DEFAULT_NAME_KEY = 'name'
+
+    def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
+        """Parse ``config_file`` into lookup tables.
+
+        Reads the ``keys``, ``exceptions`` and ``omissions`` sections under
+        the top-level ``names`` key; the latter two are requested with an
+        empty-list default.
+        """
+        # Use a context manager so the file handle is closed, and safe_load
+        # because the config is plain data (no Python object tags needed).
+        with open(config_file) as f:
+            config = yaml.safe_load(f)
+
+        default_names = nested_get(config, ('names', 'keys'))
+        name_keys, probs = alternative_probabilities(default_names)
+
+        self.name_keys = name_keys
+        self.name_key_probs = cdf(probs)
+
+        # (type, id) => (choices, cdf(probabilities of those choices))
+        self.exceptions = {}
+
+        for props in nested_get(config, ('names', 'exceptions'), default=[]):
+            object_type = props['type']
+            object_id = safe_encode(props['id'])
+            keys = [props['default']]
+            probs = [props['probability']]
+            for alt in props.get('alternatives', []):
+                keys.append(alt['alternative'])
+                probs.append(alt['probability'])
+
+            self.exceptions[(object_type, object_id)] = (keys, cdf(probs))
+
+        # (type, id) => include_probability from the config, as a float
+        self.include_probabilities = {}
+        # (type, id) => set of (type, id) whose presence triggers omission
+        self.omit_conditions = defaultdict(set)
+
+        for props in nested_get(config, ('names', 'omissions'), default=[]):
+            object_type = props['type']
+            object_id = safe_encode(props['id'])
+            include_probability = props.get('include_probability')
+
+            if include_probability is not None:
+                self.include_probabilities[(object_type, object_id)] = float(include_probability)
+
+            for condition in nested_get(props, ('omit', 'conditions'), default=[]):
+                condition_object_id = safe_encode(condition['id'])
+                condition_object_type = condition['type']
+                self.omit_conditions[(object_type, object_id)].add((condition_object_type, condition_object_id))
+
+    def name_key(self, props):
+        """Pick a name choice for the boundary described by ``props``.
+
+        If the boundary's ``(type, id)`` has an entry in ``self.exceptions``,
+        the result is ``weighted_choice`` over that entry's values;
+        otherwise it is ``weighted_choice`` over the global name keys.
+        """
+        object_type = props.get('type')
+        object_id = safe_encode(props.get('id', ''))
+
+        if (object_type, object_id) in self.exceptions:
+            values, probs = self.exceptions[(object_type, object_id)]
+            return weighted_choice(values, probs)
+
+        return weighted_choice(self.name_keys, self.name_key_probs)
+
+    def remove_excluded_components(self, components):
+        """Return ``components`` without the ones the config says to omit.
+
+        A component is dropped when any of its omit conditions is among the
+        components not yet dropped, or, failing that, at random when
+        ``random.random()`` exceeds its ``include_probability``. The input
+        list itself is returned when nothing was dropped.
+        """
+        all_ids = set()
+        for component in components:
+            object_type = component.get('type')
+            object_id = safe_encode(component.get('id', ''))
+            all_ids.add((object_type, object_id))
+
+        for key in list(all_ids):
+            # .get() avoids inserting empty sets into the defaultdict
+            conditions = self.omit_conditions.get(key)
+            if conditions and all_ids & conditions:
+                all_ids.discard(key)
+                # Already omitted: skip the random draw, which previously
+                # could remove the same key twice and raise KeyError.
+                continue
+
+            include_probability = self.include_probabilities.get(key)
+            if include_probability is not None and random.random() > include_probability:
+                all_ids.discard(key)
+
+        if len(all_ids) == len(components):
+            return components
+
+        return [c for c in components if (c.get('type'), safe_encode(c.get('id', ''))) in all_ids]
+
+boundary_names = BoundaryNames()