diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index d8d9cafb..65280cbd 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -982,7 +982,7 @@ class AddressComponents(object): name = component_value.get(k) if name: - name = boundary_names.name(country, name) + name = boundary_names.name(country, component, name) if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names): name = self.cleaned_name(name, first_comma_delimited_phrase=True) diff --git a/scripts/geodata/boundaries/names.py b/scripts/geodata/boundaries/names.py index 67db8cd9..6b560942 100644 --- a/scripts/geodata/boundaries/names.py +++ b/scripts/geodata/boundaries/names.py @@ -7,6 +7,8 @@ import yaml from collections import defaultdict from geodata.configs.utils import nested_get, DoesNotExist, alternative_probabilities +from geodata.encoding import safe_decode +from geodata.math.floats import isclose from geodata.math.sampling import cdf, weighted_choice from geodata.encoding import safe_encode @@ -52,6 +54,45 @@ class BoundaryNames(object): self.country_regex_replacements = dict(self.country_regex_replacements) + self.prefixes = {} + self.prefix_regexes = {} + self.suffixes = {} + self.suffix_regexes = {} + + for country, components in six.iteritems(nested_get(config, ('names', 'prefixes',), default={}) ): + for component, affixes in six.iteritems(components): + affix_values, probs = alternative_probabilities(affixes) + + for val in affix_values: + if 'prefix' not in val: + raise AssertionError(six.u('Invalid prefix value for (country={}, component={}): {} ').format(country, component, val)) + + prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values]) + self.prefix_regexes[(country, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U) + + if not isclose(sum(probs), 1.0): + affix_values.append(None) + probs.append(1.0 - sum(probs)) + affix_probs_cdf = cdf(probs) + self.prefixes[(country, component)] = affix_values, affix_probs_cdf + + for country, components in six.iteritems(nested_get(config, ('names', 'suffixes',), default={}) ): + for component, affixes in six.iteritems(components): + affix_values, probs = alternative_probabilities(affixes) + + for val in affix_values: + if 'suffix' not in val: + raise AssertionError(six.u('Invalid suffix value for (country={}, component={}): {} ').format(country, component, val)) + + suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values]) + self.suffix_regexes[(country, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U) + + if not isclose(sum(probs), 1.0): + affix_values.append(None) + probs.append(1.0 - sum(probs)) + affix_probs_cdf = cdf(probs) + self.suffixes[(country, component)] = affix_values, affix_probs_cdf + self.exceptions = {} for props in nested_get(config, ('names', 'exceptions'), default=[]): @@ -66,6 +107,9 @@ class BoundaryNames(object): probs = cdf(probs) self.exceptions[(object_type, object_id)] = (keys, probs) + def _string_as_regex(self, s): + return safe_decode(s).replace(six.u('.'), six.u('\\.')) + def name_key_dist(self, props, component): object_type = props.get('type') object_id = safe_encode(props.get('id', '')) @@ -81,15 +125,38 @@ class BoundaryNames(object): name_keys, probs = self.name_key_dist(props, component) return weighted_choice(name_keys, probs) - def name(self, country, name): + def name(self, country, component, name): all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, []) - if not all_replacements: + + prefixes, prefix_probs = self.prefixes.get((country, component), (None, None)) + suffixes, suffix_probs = self.suffixes.get((country, component), (None, None)) + + if not all_replacements and not prefixes and not suffixes: return name for regex, group, prob in all_replacements: match = regex.match(name) if match and random.random() < prob: name = match.group(group) + + for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0), + (suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)): + if affixes is not None: + regex = regexes[country, component] + if regex.match(name): + continue + + affix = weighted_choice(affixes, affix_probs) + + if affix is not None: + whitespace = affix.get('whitespace', True) + space_val = six.u(' ') if whitespace else six.u('') + affix = affix[key] + if direction == 0: + return six.u('{}{}{}').format(affix, space_val, safe_decode(name)) + else: + return six.u('{}{}{}').format(safe_decode(name), space_val, affix) + return name