[languages] Adding canonical string and dictionary type to Python trie, modifying disambiguate_language accordingly, and adding lists of alternate forms

2016-01-21 02:30:02 -05:00
parent 2e15db06dd
commit 0269d92e3d
1 changed files with 27 additions and 22 deletions
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -15,7 +15,7 @@ from geodata.string_utils import wide_iter, wide_ord
 from geodata.i18n.unicode_paths import DATA_DIR
 from geodata.i18n.normalize import strip_accents
 from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
-from geodata.text.normalize import normalized_tokens
+from geodata.text.normalize import normalized_tokens, normalize_string
 from geodata.text.tokenize import tokenize, token_types
 from geodata.text.phrases import PhraseFilter

@@ -53,6 +53,7 @@ class DictionaryPhraseFilter(PhraseFilter):

    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
+        self.canonicals = {}

    def serialize(self, s):
        return s
@@ -66,8 +67,8 @@ class DictionaryPhraseFilter(PhraseFilter):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename
-                is_street_types_dictionary = 'street_types' in filename
-                is_stopword_dictionary = 'stopwords' in filename
+
+                dictionary_name = filename.split('.', 1)[0]

                path = os.path.join(DICTIONARIES_DIR, lang, filename)
                if not os.path.exists(path):
@@ -81,24 +82,27 @@ class DictionaryPhraseFilter(PhraseFilter):
                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue
-                    canonical = strip_accents(phrases[0])

-                    for phrase in phrases:
+                    canonical = phrases[0]
+                    canonical_normalized = normalize_string(canonical)
+
+                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
+
+                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

-                        is_canonical = strip_accents(phrase) == canonical
+                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

-                        if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
-                            kvs[phrase][lang] = (is_canonical, is_stopword_dictionary)
+                        kvs[phrase][(lang, dictionary_name)] = (is_canonical, canonical)

-        kvs = [(k, '|'.join([v, str(int(c)), str(int(s))])) for k, vals in kvs.iteritems() for v, (c, s) in vals.iteritems()]
+        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d), (i, c) in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True
@@ -140,15 +144,15 @@ class DictionaryPhraseFilter(PhraseFilter):

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
-                    yield (t, PHRASE, suffix_search)
+                    yield (t, PHRASE, map(safe_decode, suffix_search))
                    continue
                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
-                    yield (t, PHRASE, prefix_search)
+                    yield (t, PHRASE, map(safe_decode, prefix_search))
                    continue
            else:
                c = PHRASE
-            yield t, c, data
+            yield t, c, map(safe_decode, data)

 STREET_TYPES_DICTIONARIES = ('street_types.txt',
                             'directionals.txt',
@@ -192,8 +196,9 @@ script_languages = {}

 def init_disambiguation():
    global char_scripts, script_languages
-    char_scripts = get_chars_by_script()
-    script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
+    char_scripts[:] = []
+    char_scripts.extend(get_chars_by_script())
+    script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})

 UNKNOWN_SCRIPT = 'Unknown'
 COMMON_SCRIPT = 'Common'
@@ -257,24 +262,24 @@ def disambiguate_language(text, languages):
        if c is PHRASE:
            valid = []
            data = [d.split('|') for d in data]
-            potentials = [l for l, c, s in data if l in valid_languages]
+            potentials = [l for l, d, i, c in data if l in valid_languages]

-            for lang, canonical, stopword in data:
-                canonical = int(canonical)
-                stopword = int(stopword)
-                if lang not in valid_languages or (stopword and len(potentials) > 1):
+            for lang, dictionary, is_canonical, canonical in data:
+                is_canonical = int(is_canonical)
+                is_stopword = dictionary == 'stopword'
+                if lang not in valid_languages or (is_stopword and len(potentials) > 1):
                    continue
                is_default = valid_languages[lang]

                lang_valid = is_default or not seen_languages or lang in seen_languages

-                if lang_valid and ((canonical and not stopword) or (is_default and len(potentials) == 1)):
+                if lang_valid and ((is_canonical and not is_stopword) or (is_default and len(potentials) == 1)):
                    valid.append(lang)
                elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
                    return AMBIGUOUS_LANGUAGE
-                elif stopword and canonical and not is_default and lang in seen_languages:
+                elif is_stopword and is_canonical and not is_default and lang in seen_languages:
                    valid.append(lang)
-                elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1:
+                elif not seen_languages and len(potentials) == 1 and len(t[0][0]) > 1:
                    possible_lang = lang if possible_lang is None or possible_lang == lang else None

            if seen_languages and valid and not any((l in seen_languages for l in valid)) and \