[addresses] adding pymorphy2 for converting Russian and Ukrainian place names (sticking with state and state_district for the moment) to the locative case as mentioned in #125
This commit is contained in:
@@ -121,6 +121,13 @@ state:
|
|||||||
full_name_probability: 0.2
|
full_name_probability: 0.2
|
||||||
abbreviated_probability: 0.8
|
abbreviated_probability: 0.8
|
||||||
|
|
||||||
|
# Currently for Russian and Ukrainian, convert some names to the genitive/locative case
|
||||||
|
slavic_names:
|
||||||
|
state:
|
||||||
|
locative_probability: 0.4
|
||||||
|
state_district:
|
||||||
|
locative_probability: 0.4
|
||||||
|
|
||||||
country:
|
country:
|
||||||
# If no country is specified, pull the country name from CLDR (authoritative country names translated into different languages)
|
# If no country is specified, pull the country name from CLDR (authoritative country names translated into different languages)
|
||||||
cldr_country_probability: 0.5
|
cldr_country_probability: 0.5
|
||||||
|
|||||||
@@ -8,6 +8,11 @@ import re
|
|||||||
import six
|
import six
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
# Russian/Ukrainian parsing and inflection
|
||||||
|
import pymorphy2
|
||||||
|
import pymorphy2_dicts_ru
|
||||||
|
import pymorphy2_dicts_uk
|
||||||
|
|
||||||
from collections import defaultdict, OrderedDict
|
from collections import defaultdict, OrderedDict
|
||||||
from itertools import combinations
|
from itertools import combinations
|
||||||
|
|
||||||
@@ -161,6 +166,11 @@ class AddressComponents(object):
|
|||||||
'zh_py': 'zh_pinyin'
|
'zh_py': 'zh_pinyin'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slavic_morphology_analyzers = {
|
||||||
|
'ru': pymorphy2.MorphAnalyzer(pymorphy2_dicts_ru.get_path(), lang='ru'),
|
||||||
|
'uk': pymorphy2.MorphAnalyzer(pymorphy2_dicts_uk.get_path(), lang='uk'),
|
||||||
|
}
|
||||||
|
|
||||||
sub_building_component_class_map = {
|
sub_building_component_class_map = {
|
||||||
AddressFormatter.ENTRANCE: Entrance,
|
AddressFormatter.ENTRANCE: Entrance,
|
||||||
AddressFormatter.STAIRCASE: Staircase,
|
AddressFormatter.STAIRCASE: Staircase,
|
||||||
@@ -848,6 +858,29 @@ class AddressComponents(object):
|
|||||||
else:
|
else:
|
||||||
return self.japanese_node_admin_level_map.get(val.get('place'), 1000)
|
return self.japanese_node_admin_level_map.get(val.get('place'), 1000)
|
||||||
|
|
||||||
|
def locative_name(self, name, language):
    """Inflect a place name using the morphology analyzer for ``language``.

    The name is decoded and split on whitespace. Every word except the
    last is inflected with the ``gent`` grammeme and the last word with
    ``loct``; only the first parse of each word is considered. A word
    that cannot be inflected is kept unchanged.

    Returns the inflected words joined by single spaces, or ``None``
    when no analyzer is registered for ``language``.
    """
    analyzer = self.slavic_morphology_analyzers.get(language)
    if not analyzer:
        return None

    tokens = safe_decode(name).split()
    last_index = len(tokens) - 1

    def inflect(position, token):
        # Only the final word takes the locative; preceding words take the genitive
        grammemes = {'loct'} if position >= last_index else {'gent'}
        form = analyzer.parse(token)[0].inflect(grammemes)
        if form and form.word:
            return form.word
        # Fall back to the word as given when no inflected form is available
        return token

    return six.u(' ').join([inflect(position, token) for position, token in enumerate(tokens)])
|
||||||
|
|
||||||
|
def add_locatives(self, address_components, language):
    """Randomly replace configured components with their inflected form.

    For each component in ``address_components``, looks up
    ``slavic_names.<component>.locative_probability`` in ``self.config``.
    When a numeric probability is found and a random draw falls below it,
    the component's value is replaced in place by the result of
    ``self.locative_name``.

    Components with no usable probability are skipped. The original
    called ``float()`` on the lookup result before testing it against
    ``None``, which raises for any unconfigured component if the lookup
    yields a non-numeric placeholder. A component is also left untouched
    when ``locative_name`` returns ``None`` (no analyzer for ``language``).
    """
    # Snapshot the keys so assignments below never interact with iteration
    for component in list(address_components):
        configured = nested_get(self.config, ('slavic_names', component, 'locative_probability'))
        try:
            locative_probability = float(configured)
        except (TypeError, ValueError):
            # Nothing numeric configured for this component
            continue

        if random.random() >= locative_probability:
            continue

        locative = self.locative_name(address_components[component], language)
        if locative is not None:
            address_components[component] = locative
|
||||||
|
|
||||||
def abbreviated_state(self, state, country, language):
|
def abbreviated_state(self, state, country, language):
|
||||||
abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability')))
|
abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability')))
|
||||||
|
|
||||||
@@ -1680,6 +1713,9 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
self.drop_invalid_components(address_components, country)
|
self.drop_invalid_components(address_components, country)
|
||||||
|
|
||||||
|
if language in self.slavic_morphology_analyzers and AddressFormatter.CITY in address_components:
|
||||||
|
self.add_locatives(address_components, language)
|
||||||
|
|
||||||
if language_suffix and not non_local_language and not language_altered:
|
if language_suffix and not non_local_language and not language_altered:
|
||||||
language = language_suffix.lstrip(':').lower()
|
language = language_suffix.lstrip(':').lower()
|
||||||
if '_' in language:
|
if '_' in language:
|
||||||
|
|||||||
@@ -22,6 +22,9 @@ lru-dict==1.1.3
|
|||||||
marisa-trie==0.7.2
|
marisa-trie==0.7.2
|
||||||
numpy==1.10.4
|
numpy==1.10.4
|
||||||
pycountry==1.20
|
pycountry==1.20
|
||||||
|
git+https://github.com/kmike/pymorphy2
|
||||||
|
pymorphy2-dicts-ru==2.4.394633.4298366
|
||||||
|
pymorphy2-dicts-uk==2.4.1.1.1460299261
|
||||||
pyproj==1.9.5.1
|
pyproj==1.9.5.1
|
||||||
pystache==0.5.4
|
pystache==0.5.4
|
||||||
python-Levenshtein==0.12.0
|
python-Levenshtein==0.12.0
|
||||||
|
|||||||
Reference in New Issue
Block a user