[languages] Adding canonical string and dictionary type to Python trie, modifying disambiguate_languages accordingly, and adding lists of alternate forms
@@ -15,7 +15,7 @@ from geodata.string_utils import wide_iter, wide_ord
 from geodata.i18n.unicode_paths import DATA_DIR
 from geodata.i18n.normalize import strip_accents
 from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
-from geodata.text.normalize import normalized_tokens
+from geodata.text.normalize import normalized_tokens, normalize_string
 from geodata.text.tokenize import tokenize, token_types
 from geodata.text.phrases import PhraseFilter
 
@@ -53,6 +53,7 @@ class DictionaryPhraseFilter(PhraseFilter):
 
     def __init__(self, *dictionaries):
         self.dictionaries = dictionaries
+        self.canonicals = {}
 
     def serialize(self, s):
         return s
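The new self.canonicals mapping backs the "lists of alternate forms" in the commit title: the first phrase on each dictionary line is the canonical form, and everything after it is stored as that form's alternates. A minimal sketch of the resulting shape, using a hypothetical line from en/street_types.txt:

    # A line like u'street|st|str' yields canonical u'street'
    # with alternate forms [u'st', u'str'] (hypothetical data).
    canonicals = {}

    phrases = u'street|st|str'.split(u'|')
    canonical = phrases[0]
    canonicals[(canonical, 'en', 'street_types')] = phrases[1:]

    assert canonicals[(u'street', 'en', 'street_types')] == [u'st', u'str']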
@@ -66,8 +67,8 @@ class DictionaryPhraseFilter(PhraseFilter):
             for filename in self.dictionaries:
                 is_suffix_dictionary = 'suffixes' in filename
                 is_prefix_dictionary = 'prefixes' in filename
-                is_street_types_dictionary = 'street_types' in filename
-                is_stopword_dictionary = 'stopwords' in filename
+                dictionary_name = filename.split('.', 1)[0]
 
                 path = os.path.join(DICTIONARIES_DIR, lang, filename)
                 if not os.path.exists(path):
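Rather than tracking one boolean per dictionary type, the loader now derives a single dictionary name from the filename; splitting on the first '.' keeps the full base name intact. A quick sketch of what that produces for filenames appearing in this file:

    # filename.split('.', 1)[0] keeps everything before the first dot
    for filename in ('street_types.txt', 'directionals.txt', 'stopwords.txt'):
        print(filename.split('.', 1)[0])
    # street_types
    # directionals
    # stopwords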
@@ -81,24 +82,27 @@ class DictionaryPhraseFilter(PhraseFilter):
                     phrases = safe_decode(line).split(u'|')
                     if not phrases:
                         continue
 
-                    canonical = strip_accents(phrases[0])
+                    canonical = phrases[0]
+                    canonical_normalized = normalize_string(canonical)
+
+                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
 
-                    for phrase in phrases:
+                    for i, phrase in enumerate(phrases):
                         if phrase in POSSIBLE_ROMAN_NUMERALS:
                             continue
 
-                        is_canonical = strip_accents(phrase) == canonical
+                        is_canonical = normalize_string(phrase) == canonical_normalized
 
                         if is_suffix_dictionary:
                             phrase = SUFFIX_KEY + phrase[::-1]
                         elif is_prefix_dictionary:
                             phrase = PREFIX_KEY + phrase
 
-                        if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
-                            kvs[phrase][lang] = (is_canonical, is_stopword_dictionary)
+                        kvs[phrase][(lang, dictionary_name)] = (is_canonical, canonical)
 
-        kvs = [(k, '|'.join([v, str(int(c)), str(int(s))])) for k, vals in kvs.iteritems() for v, (c, s) in vals.iteritems()]
+        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d), (i, c) in vals.iteritems()]
 
         self.trie = BytesTrie(kvs)
         self.configured = True
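With this change, each trie value becomes a four-field, pipe-delimited record (language, dictionary name, canonical flag, and the canonical string itself), where the old format packed only the language and two boolean flags. A minimal round-trip sketch of the new payload; the field values are hypothetical, and since the diff passes the canonical through safe_encode to bytes, an ASCII value keeps plain strings sufficient here:

    # lang | dictionary_name | is_canonical | canonical form
    value = '|'.join(['en', 'street_types', str(int(True)), 'street'])
    assert value == 'en|street_types|1|street'

    lang, dictionary, is_canonical, canonical = value.split('|')
    assert (lang, dictionary, bool(int(is_canonical)), canonical) == \
        ('en', 'street_types', True, 'street')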
@@ -140,15 +144,15 @@ class DictionaryPhraseFilter(PhraseFilter):
 
             suffix_search, suffix_len = self.search_suffix(token)
             if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
-                yield (t, PHRASE, suffix_search)
+                yield (t, PHRASE, map(safe_decode, suffix_search))
                 continue
             prefix_search, prefix_len = self.search_prefix(token)
             if prefix_search and self.trie.get(token[:prefix_len]):
-                yield (t, PHRASE, prefix_search)
+                yield (t, PHRASE, map(safe_decode, prefix_search))
                 continue
         else:
             c = PHRASE
-            yield t, c, data
+            yield t, c, map(safe_decode, data)
 
 STREET_TYPES_DICTIONARIES = ('street_types.txt',
                              'directionals.txt',
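Values fetched from a BytesTrie come back as byte strings, so the filter now decodes every payload to unicode before yielding; under Python 2, which this codebase targets (note the iteritems calls above), map returns a plain list. A small sketch with a stand-in for geodata's safe_decode:

    # Stand-in for geodata.encoding.safe_decode (assumption: UTF-8 bytes).
    def safe_decode(value, encoding='utf-8'):
        return value.decode(encoding) if isinstance(value, bytes) else value

    data = ['en|street_types|1|street', 'fr|stopwords|1|rue']
    assert map(safe_decode, data) == [u'en|street_types|1|street', u'fr|stopwords|1|rue']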
@@ -192,8 +196,9 @@ script_languages = {}
 
 def init_disambiguation():
     global char_scripts, script_languages
-    char_scripts = get_chars_by_script()
-    script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
+    char_scripts[:] = []
+    char_scripts.extend(get_chars_by_script())
+    script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})
 
 UNKNOWN_SCRIPT = 'Unknown'
 COMMON_SCRIPT = 'Common'
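init_disambiguation switches from rebinding the module-level globals to mutating them in place: any module that imported char_scripts or script_languages directly keeps seeing the freshly loaded data instead of a stale empty object. The difference in one small sketch:

    chars = []
    alias = chars          # simulates another module importing the list

    chars = ['Latin']      # rebinding: alias still points at the old, empty list
    assert alias == []

    chars = alias
    chars[:] = []          # in-place clear: same object, so alias tracks it
    chars.extend(['Latin', 'Cyrillic'])
    assert alias == ['Latin', 'Cyrillic']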
@@ -257,24 +262,24 @@ def disambiguate_language(text, languages):
         if c is PHRASE:
             valid = []
             data = [d.split('|') for d in data]
-            potentials = [l for l, c, s in data if l in valid_languages]
+            potentials = [l for l, d, i, c in data if l in valid_languages]
 
-            for lang, canonical, stopword in data:
-                canonical = int(canonical)
-                stopword = int(stopword)
-                if lang not in valid_languages or (stopword and len(potentials) > 1):
+            for lang, dictionary, is_canonical, canonical in data:
+                is_canonical = int(is_canonical)
+                is_stopword = dictionary == 'stopword'
+                if lang not in valid_languages or (is_stopword and len(potentials) > 1):
                     continue
                 is_default = valid_languages[lang]
 
                 lang_valid = is_default or not seen_languages or lang in seen_languages
 
-                if lang_valid and ((canonical and not stopword) or (is_default and len(potentials) == 1)):
+                if lang_valid and ((is_canonical and not is_stopword) or (is_default and len(potentials) == 1)):
                     valid.append(lang)
                 elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
                     return AMBIGUOUS_LANGUAGE
-                elif stopword and canonical and not is_default and lang in seen_languages:
+                elif is_stopword and is_canonical and not is_default and lang in seen_languages:
                     valid.append(lang)
-                elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1:
+                elif not seen_languages and len(potentials) == 1 and len(t[0][0]) > 1:
                     possible_lang = lang if possible_lang is None or possible_lang == lang else None
 
             if seen_languages and valid and not any((l in seen_languages for l in valid)) and \
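On the read side, disambiguate_language splits each decoded payload back into the four fields: the canonical flag arrives as '0'/'1' text and is re-cast with int, while the stopword test becomes a string comparison against the dictionary name rather than a stored bit. A sketch of that unpacking with hypothetical payloads:

    data = [u'en|street_types|1|street', u'en|stopwords|1|the']

    parsed = []
    for lang, dictionary, is_canonical, canonical in [d.split(u'|') for d in data]:
        is_canonical = int(is_canonical)           # flag is stored as u'0'/u'1'
        is_stopword = dictionary == u'stopword'    # same comparison as in the diff
        parsed.append((lang, dictionary, is_canonical, is_stopword))

    assert parsed[0] == (u'en', u'street_types', 1, False)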