[boundaries] Config for boundary name changes (Kings County is a state_district but Brooklyn should not be used for that context) and omissions (usually we add islands as address components, but not e.g. Manhattan Island)
This commit is contained in:
51
resources/boundaries/names/global.yaml
Normal file
51
resources/boundaries/names/global.yaml
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
names:
|
||||||
|
keys:
|
||||||
|
default: name
|
||||||
|
probability: 0.75
|
||||||
|
alternatives:
|
||||||
|
- alternative: short_name # e.g. NYC
|
||||||
|
probability: 0.12
|
||||||
|
- alternative: alt_name # e.g. New York (instead of New York City)
|
||||||
|
probability: 0.12
|
||||||
|
- alternative: official_name # e.g. United Kingdom of Great Britain and Northern Ireland
|
||||||
|
probability: 0.01
|
||||||
|
|
||||||
|
|
||||||
|
# This section overrides place names
|
||||||
|
exceptions:
|
||||||
|
# Boroughs of New York City
|
||||||
|
- id: 2552485 # New York County (don't use Manhattan)
|
||||||
|
type: relation
|
||||||
|
default: New York County
|
||||||
|
probability: 1.0
|
||||||
|
- id: 369518 # Kings County (don't use Brooklyn)
|
||||||
|
type: relation
|
||||||
|
default: Kings County
|
||||||
|
probability: 1.0
|
||||||
|
- id: 369519 # Queens County (don't use Queens)
|
||||||
|
type: relation
|
||||||
|
default: Queens County
|
||||||
|
probability: 1.0
|
||||||
|
- id: 2552450 # Bronx County (don't use The Bronx)
|
||||||
|
type: relation
|
||||||
|
default: Bronx County
|
||||||
|
probability: 1.0
|
||||||
|
- id: 962876 # Richmond County (don't use Staten Island)
|
||||||
|
type: relation
|
||||||
|
default: Richmond County
|
||||||
|
probability: 1.0
|
||||||
|
|
||||||
|
omissions:
|
||||||
|
- id: 3954665 # Manhattan Island
|
||||||
|
type: relation
|
||||||
|
omit:
|
||||||
|
conditions:
|
||||||
|
- id: 175905 # NYC (always true)
|
||||||
|
type: relation
|
||||||
|
- id: 3955977 # Long Island
|
||||||
|
type: relation
|
||||||
|
include_probability: 0.1
|
||||||
|
omit:
|
||||||
|
conditions:
|
||||||
|
- id: 175905 # NYC
|
||||||
|
type: relation
|
||||||
0
scripts/geodata/boundaries/__init__.py
Normal file
0
scripts/geodata/boundaries/__init__.py
Normal file
92
scripts/geodata/boundaries/names.py
Normal file
92
scripts/geodata/boundaries/names.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from geodata.configs.utils import nested_get, DoesNotExist, alternative_probabilities
|
||||||
|
from geodata.math.sampling import cdf, weighted_choice
|
||||||
|
|
||||||
|
from geodata.encoding import safe_encode
|
||||||
|
|
||||||
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||||
|
|
||||||
|
BOUNDARY_NAMES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
|
'resources', 'boundaries', 'names')
|
||||||
|
|
||||||
|
BOUNDARY_NAMES_CONFIG = os.path.join(BOUNDARY_NAMES_DIR, 'global.yaml')
|
||||||
|
|
||||||
|
|
||||||
|
class BoundaryNames(object):
|
||||||
|
DEFAULT_NAME_KEY = 'name'
|
||||||
|
|
||||||
|
def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
|
||||||
|
config = yaml.load(open(config_file))
|
||||||
|
|
||||||
|
default_names = nested_get(config, ('names', 'keys'))
|
||||||
|
name_keys, probs = alternative_probabilities(default_names)
|
||||||
|
|
||||||
|
self.name_keys = name_keys
|
||||||
|
self.name_key_probs = cdf(probs)
|
||||||
|
|
||||||
|
self.exceptions = {}
|
||||||
|
|
||||||
|
for props in nested_get(config, ('names', 'exceptions'), default=[]):
|
||||||
|
object_type = props['type']
|
||||||
|
object_id = safe_encode(props['id'])
|
||||||
|
keys = [props['default']]
|
||||||
|
probs = [props['probability']]
|
||||||
|
for alt in props.get('alternatives', []):
|
||||||
|
keys.append(alt['alternative'])
|
||||||
|
probs.append(alt['probability'])
|
||||||
|
|
||||||
|
probs = cdf(probs)
|
||||||
|
self.exceptions[(object_type, object_id)] = (keys, probs)
|
||||||
|
|
||||||
|
self.include_probabilities = {}
|
||||||
|
self.omit_conditions = defaultdict(set)
|
||||||
|
|
||||||
|
for props in nested_get(config, ('names', 'omissions'), default=[]):
|
||||||
|
object_type = props['type']
|
||||||
|
object_id = safe_encode(props['id'])
|
||||||
|
include_probability = props.get('include_probability')
|
||||||
|
|
||||||
|
if include_probability is not None:
|
||||||
|
self.include_probabilities[(object_type, object_id)] = float(include_probability)
|
||||||
|
|
||||||
|
for condition in nested_get(props, ('omit', 'conditions'), default=[]):
|
||||||
|
condition_object_id = safe_encode(condition['id'])
|
||||||
|
condition_object_type = condition['type']
|
||||||
|
self.omit_conditions[(object_type, object_id)].add((condition_object_type, condition_object_id))
|
||||||
|
|
||||||
|
def name_key(self, props):
|
||||||
|
object_type = props.get('type')
|
||||||
|
object_id = safe_encode(props.get('id', ''))
|
||||||
|
|
||||||
|
if (object_type, object_id) in self.exceptions:
|
||||||
|
values, probs = self.exceptions[(object_type, object_id)]
|
||||||
|
return weighted_choice(values, probs)
|
||||||
|
|
||||||
|
return weighted_choice(self.name_keys, self.name_key_probs)
|
||||||
|
|
||||||
|
def remove_excluded_components(self, components):
|
||||||
|
all_ids = set()
|
||||||
|
for component in components:
|
||||||
|
object_type = component.get('type')
|
||||||
|
object_id = safe_encode(component.get('id', ''))
|
||||||
|
all_ids.add((object_type, object_id))
|
||||||
|
|
||||||
|
for object_type, object_id in list(all_ids):
|
||||||
|
if (object_type, object_id) in self.omit_conditions:
|
||||||
|
conditions = self.omit_conditions[(object_type, object_id)]
|
||||||
|
if all_ids & conditions:
|
||||||
|
all_ids.remove((object_type, object_id))
|
||||||
|
|
||||||
|
if (object_type, object_id) in self.include_probabilities and random.random() > self.include_probabilities[(object_type, object_id)]:
|
||||||
|
all_ids.remove((object_type, object_id))
|
||||||
|
|
||||||
|
if len(all_ids) == len(components):
|
||||||
|
return components
|
||||||
|
|
||||||
|
return [c for c in components if (c.get('type'), safe_encode(c.get('id', ''))) in all_ids]
|
||||||
|
|
||||||
|
boundary_names = BoundaryNames()
|
||||||
Reference in New Issue
Block a user