From 6f009fb8a68566ca9c3bdff87bc1c785af5803af Mon Sep 17 00:00:00 2001
From: Al
Date: Wed, 28 Dec 2016 04:48:32 -0500
Subject: [PATCH] [addresses] adding pymorphy2 for converting Russian and
 Ukrainian place names (sticking with state and state_district for the
 moment) to the locative case as mentioned in #125

---
 resources/parser/default.yaml           |  7 +++
 scripts/geodata/addresses/components.py | 64 +++++++++++++++++++++++++
 scripts/requirements.txt                |  3 ++
 3 files changed, 74 insertions(+)

diff --git a/resources/parser/default.yaml b/resources/parser/default.yaml
index 22a2a262..419cfe9e 100644
--- a/resources/parser/default.yaml
+++ b/resources/parser/default.yaml
@@ -121,6 +121,13 @@ state:
     full_name_probability: 0.2
     abbreviated_probability: 0.8
 
+# Currently for Russian and Ukrainian, convert some names to the genitive/locative case
+slavic_names:
+    state:
+        locative_probability: 0.4
+    state_district:
+        locative_probability: 0.4
+
 country:
     # If no country is specified, pull the country name from CLDR (authoratative country names translated into different languages)
     cldr_country_probability: 0.5
diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index 65280cbd..59401fc6 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -8,6 +8,11 @@ import re
 import six
 import yaml
 
+# Russian/Ukrainian parsing and inflection
+import pymorphy2
+import pymorphy2_dicts_ru
+import pymorphy2_dicts_uk
+
 from collections import defaultdict, OrderedDict
 from itertools import combinations
 
@@ -161,6 +166,11 @@ class AddressComponents(object):
         'zh_py': 'zh_pinyin'
     }
 
+    slavic_morphology_analyzers = {
+        'ru': pymorphy2.MorphAnalyzer(pymorphy2_dicts_ru.get_path(), lang='ru'),
+        'uk': pymorphy2.MorphAnalyzer(pymorphy2_dicts_uk.get_path(), lang='uk'),
+    }
+
     sub_building_component_class_map = {
         AddressFormatter.ENTRANCE: Entrance,
         AddressFormatter.STAIRCASE: Staircase,
@@ -848,6 +858,57 @@ class AddressComponents(object):
         else:
             return self.japanese_node_admin_level_map.get(val.get('place'), 1000)
 
+    def locative_name(self, name, language):
+        '''
+        Inflect a Russian/Ukrainian place name word by word.
+
+        The name is split on whitespace; every word except the last is
+        inflected with the 'gent' grammeme and the last word with 'loct'.
+        Words that cannot be inflected are kept as they are.
+
+        Returns the words joined by single spaces, or None when there is no
+        morphological analyzer for the given language.
+        '''
+        morph = self.slavic_morphology_analyzers.get(language)
+        if not morph:
+            return None
+        norm = []
+        words = safe_decode(name).split()
+        n = len(words)
+        for i, word in enumerate(words):
+            parsed = morph.parse(word)[0]
+            word_class = {'gent'} if i < n - 1 else {'loct'}
+            inflected = parsed.inflect(word_class)
+            if inflected and inflected.word:
+                norm.append(inflected.word)
+            else:
+                norm.append(word)
+        return six.u(' ').join(norm)
+
+    def add_locatives(self, address_components, language):
+        '''
+        Randomly swap components for their locative_name() form.
+
+        A component is a candidate only when the config lookup of
+        slavic_names.<component>.locative_probability yields a number;
+        all other components are left untouched.
+        '''
+        for component in address_components:
+            locative_probability = nested_get(self.config, ('slavic_names', component, 'locative_probability'))
+            try:
+                locative_probability = float(locative_probability)
+            except (TypeError, ValueError):
+                # No usable probability configured for this component
+                continue
+
+            if random.random() >= locative_probability:
+                continue
+
+            locative = self.locative_name(address_components[component], language)
+            # Keep the original value if no analyzer handled the language
+            if locative is not None:
+                address_components[component] = locative
+
     def abbreviated_state(self, state, country, language):
         abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability')))
 
@@ -1680,6 +1741,9 @@ class AddressComponents(object):
 
         self.drop_invalid_components(address_components, country)
 
+        if language in self.slavic_morphology_analyzers and AddressFormatter.CITY in address_components:
+            self.add_locatives(address_components, language)
+
         if language_suffix and not non_local_language and not language_altered:
             language = language_suffix.lstrip(':').lower()
             if '_' in language:
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
index aa7b734e..633e0387 100644
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -22,6 +22,9 @@ lru-dict==1.1.3
 marisa-trie==0.7.2
 numpy==1.10.4
 pycountry==1.20
+git+https://github.com/kmike/pymorphy2
+pymorphy2-dicts-ru==2.4.394633.4298366
+pymorphy2-dicts-uk==2.4.1.1.1460299261
 pyproj==1.9.5.1
 pystache==0.5.4
 python-Levenshtein==0.12.0