From e6e4b28e434f4e717ee2d0b5e79b6f11d06f0bba Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 28 Dec 2016 13:26:19 -0500 Subject: [PATCH] =?UTF-8?q?[addresses]=20making=20the=20=D0=B3=D0=BE=D1=80?= =?UTF-8?q?=D0=BE=D0=B4/=D0=B3.=20prefix=20apply=20to=20the=20Russian=20la?= =?UTF-8?q?nguage=20rather=20than=20the=20country?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- resources/boundaries/names/global.yaml | 25 +++++++++++++------------ scripts/geodata/addresses/components.py | 5 ++++- scripts/geodata/boundaries/names.py | 24 ++++++++++++------------ 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/resources/boundaries/names/global.yaml b/resources/boundaries/names/global.yaml index 81bee822..2bea2dca 100644 --- a/resources/boundaries/names/global.yaml +++ b/resources/boundaries/names/global.yaml @@ -36,18 +36,19 @@ names: replace_probability: 0.5 prefixes: - ru: - city: - default: - prefix: г. - probability: 0.35 - alternatives: - - alternative: - prefix: г - probability: 0.1 - - alternative: - prefix: город - probability: 0.05 + language: + ru: + city: + default: + prefix: г. + probability: 0.35 + alternatives: + - alternative: + prefix: г + probability: 0.1 + - alternative: + prefix: город + probability: 0.05 # This section overrides place names exceptions: diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 8d7d7e67..a62aa114 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -962,6 +962,8 @@ class AddressComponents(object): include these qualifiers in the training data. 
''' + suffix_lang = None if not language_suffix else language_suffix.lstrip(':') + if osm_components: name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix)) raw_name_key = boundary_names.DEFAULT_NAME_KEY @@ -1018,7 +1020,8 @@ class AddressComponents(object): name = component_value.get(k) if name: - name = boundary_names.name(country, component, name) + name_lang = language if not suffix_lang or not k.endswith(language_suffix) else suffix_lang + name = boundary_names.name(country, name_lang, component, name) if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names): name = self.cleaned_name(name, first_comma_delimited_phrase=True) diff --git a/scripts/geodata/boundaries/names.py b/scripts/geodata/boundaries/names.py index 6b560942..d8fa7326 100644 --- a/scripts/geodata/boundaries/names.py +++ b/scripts/geodata/boundaries/names.py @@ -59,39 +59,39 @@ class BoundaryNames(object): self.suffixes = {} self.suffix_regexes = {} - for country, components in six.iteritems(nested_get(config, ('names', 'prefixes',), default={}) ): + for language, components in six.iteritems(nested_get(config, ('names', 'prefixes', 'language'), default={}) ): for component, affixes in six.iteritems(components): affix_values, probs = alternative_probabilities(affixes) for val in affix_values: if 'prefix' not in val: - raise AssertionError(six.u('Invalid prefix value for (country={}, component={}): {} ').format(country, component, val)) + raise AssertionError(six.u('Invalid prefix value for (language={}, component={}): {} ').format(language, component, val)) prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values]) - self.prefix_regexes[(country, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U) + self.prefix_regexes[(language, component)] = 
re.compile(six.u('^{}').format(prefix_regex), re.I | re.U) if not isclose(sum(probs), 1.0): affix_values.append(None) probs.append(1.0 - sum(probs)) affix_probs_cdf = cdf(probs) - self.prefixes[(country, component)] = affix_values, affix_probs_cdf + self.prefixes[(language, component)] = affix_values, affix_probs_cdf - for country, components in six.iteritems(nested_get(config, ('names', 'suffixes',), default={}) ): + for language, components in six.iteritems(nested_get(config, ('names', 'suffixes', 'language'), default={}) ): for component, affixes in six.iteritems(components): affix_values, probs = alternative_probabilities(affixes) for val in affix_values: if 'suffix' not in val: - raise AssertionError(six.u('Invalid suffix value for (country={}, component={}): {} ').format(country, component, val)) + raise AssertionError(six.u('Invalid suffix value for (language={}, component={}): {} ').format(language, component, val)) suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values]) - self.suffix_regexes[(country, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U) + self.suffix_regexes[(language, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U) if not isclose(sum(probs), 1.0): affix_values.append(None) probs.append(1.0 - sum(probs)) affix_probs_cdf = cdf(probs) - self.suffixes[(country, component)] = affix_values, affix_probs_cdf + self.suffixes[(language, component)] = affix_values, affix_probs_cdf self.exceptions = {} @@ -125,11 +125,11 @@ class BoundaryNames(object): name_keys, probs = self.name_key_dist(props, component) return weighted_choice(name_keys, probs) - def name(self, country, component, name): + def name(self, country, language, component, name): all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, []) - prefixes, prefix_probs = 
self.prefixes.get((country, component), (None, None)) - suffixes, suffix_probs = self.suffixes.get((country, component), (None, None)) + prefixes, prefix_probs = self.prefixes.get((language, component), (None, None)) + suffixes, suffix_probs = self.suffixes.get((language, component), (None, None)) if not all_replacements and not prefixes and not suffixes: return name @@ -142,7 +142,7 @@ class BoundaryNames(object): for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0), (suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)): if affixes is not None: - regex = regexes[country, component] + regex = regexes[language, component] if regex.match(name): continue