[names] adding configurable prefix/suffix additions for boundary names

2016-12-27 20:32:08 -05:00
parent 80a9c1b308
commit 165056ccd8
2 changed files with 70 additions and 3 deletions
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -982,7 +982,7 @@ class AddressComponents(object):
                        name = component_value.get(k)
                        if name:
-                            name = boundary_names.name(country, name)
+                            name = boundary_names.name(country, component, name)
                        if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names):
                            name = self.cleaned_name(name, first_comma_delimited_phrase=True)
--- a/scripts/geodata/boundaries/names.py
+++ b/scripts/geodata/boundaries/names.py
@@ -7,6 +7,8 @@ import yaml
 from collections import defaultdict
 from geodata.configs.utils import nested_get, DoesNotExist, alternative_probabilities
 from geodata.encoding import safe_decode
 from geodata.math.floats import isclose
 from geodata.math.sampling import cdf, weighted_choice
 from geodata.encoding import safe_encode
@@ -52,6 +54,45 @@ class BoundaryNames(object):
        self.country_regex_replacements = dict(self.country_regex_replacements)
        self.prefixes = {}
        self.prefix_regexes = {}
        self.suffixes = {}
        self.suffix_regexes = {}
        for country, components in six.iteritems(nested_get(config, ('names', 'prefixes',), default={}) ):
            for component, affixes in six.iteritems(components):
                affix_values, probs = alternative_probabilities(affixes)
                for val in affix_values:
                    if 'prefix' not in val:
                        raise AssertionError(six.u('Invalid prefix value for (country={}, component={}): {} ').format(country, component, val))
                prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values])
                self.prefix_regexes[(country, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U)
                if not isclose(sum(probs), 1.0):
                    affix_values.append(None)
                    probs.append(1.0 - sum(probs))
                affix_probs_cdf = cdf(probs)
                self.prefixes[(country, component)] = affix_values, affix_probs_cdf
        for country, components in six.iteritems(nested_get(config, ('names', 'suffixes',), default={}) ):
            for component, affixes in six.iteritems(components):
                affix_values, probs = alternative_probabilities(affixes)
                for val in affix_values:
                    if 'suffix' not in val:
                        raise AssertionError(six.u('Invalid suffix value for (country={}, component={}): {} ').format(country, component, val))
                suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values])
                self.suffix_regexes[(country, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U)
                if not isclose(sum(probs), 1.0):
                    affix_values.append(None)
                    probs.append(1.0 - sum(probs))
                affix_probs_cdf = cdf(probs)
                self.suffixes[(country, component)] = affix_values, affix_probs_cdf
        self.exceptions = {}
        for props in nested_get(config, ('names', 'exceptions'), default=[]):
@@ -66,6 +107,9 @@ class BoundaryNames(object):
            probs = cdf(probs)
            self.exceptions[(object_type, object_id)] = (keys, probs)
    def _string_as_regex(self, s):
        return safe_decode(s).replace(six.u('.'), six.u('\\.'))
    def name_key_dist(self, props, component):
        object_type = props.get('type')
        object_id = safe_encode(props.get('id', ''))
@@ -81,15 +125,38 @@ class BoundaryNames(object):
        name_keys, probs = self.name_key_dist(props, component)
        return weighted_choice(name_keys, probs)
-    def name(self, country, name):
+    def name(self, country, component, name):
        all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, [])
-        if not all_replacements:
+
        prefixes, prefix_probs = self.prefixes.get((country, component), (None, None))
        suffixes, suffix_probs = self.suffixes.get((country, component), (None, None))
        if not all_replacements and not prefixes and not suffixes:
            return name
        for regex, group, prob in all_replacements:
            match = regex.match(name)
            if match and random.random() < prob:
                name = match.group(group)
        for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0),
                                                              (suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)):
            if affixes is not None:
                regex = regexes[country, component]
                if regex.match(name):
                    continue
                affix = weighted_choice(affixes, affix_probs)
                if affix is not None:
                    whitespace = affix.get('whitespace', True)
                    space_val = six.u(' ') if whitespace else six.u('')
                    affix = affix[key]
                    if direction == 0:
                        return six.u('{}{}{}').format(affix, space_val, safe_decode(name))
                    else:
                        return six.u('{}{}{}').format(safe_decode(name), space_val, affix)
        return name