diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py
index 36878284..5099ff49 100644
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -15,7 +15,7 @@ from geodata.string_utils import wide_iter, wide_ord
 from geodata.i18n.unicode_paths import DATA_DIR
 from geodata.i18n.normalize import strip_accents
 from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
-from geodata.text.normalize import normalized_tokens
+from geodata.text.normalize import normalized_tokens, normalize_string
 from geodata.text.tokenize import tokenize, token_types
 from geodata.text.phrases import PhraseFilter
 
@@ -53,6 +53,7 @@ class DictionaryPhraseFilter(PhraseFilter):
 
     def __init__(self, *dictionaries):
         self.dictionaries = dictionaries
+        self.canonicals = {}
 
     def serialize(self, s):
         return s
@@ -66,8 +67,8 @@ class DictionaryPhraseFilter(PhraseFilter):
             for filename in self.dictionaries:
                 is_suffix_dictionary = 'suffixes' in filename
                 is_prefix_dictionary = 'prefixes' in filename
-                is_street_types_dictionary = 'street_types' in filename
-                is_stopword_dictionary = 'stopwords' in filename
+
+                dictionary_name = filename.split('.', 1)[0]
 
                 path = os.path.join(DICTIONARIES_DIR, lang, filename)
                 if not os.path.exists(path):
@@ -81,24 +82,27 @@ class DictionaryPhraseFilter(PhraseFilter):
                     phrases = safe_decode(line).split(u'|')
                     if not phrases:
                         continue
-                    canonical = strip_accents(phrases[0])
 
-                    for phrase in phrases:
+                    canonical = phrases[0]
+                    canonical_normalized = normalize_string(canonical)
+
+                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
+
+                    for i, phrase in enumerate(phrases):
 
                         if phrase in POSSIBLE_ROMAN_NUMERALS:
                             continue
 
-                        is_canonical = strip_accents(phrase) == canonical
+                        is_canonical = normalize_string(phrase) == canonical_normalized
 
                         if is_suffix_dictionary:
                             phrase = SUFFIX_KEY + phrase[::-1]
                         elif is_prefix_dictionary:
                             phrase = PREFIX_KEY + phrase
 
-                        if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
-                            kvs[phrase][lang] = (is_canonical, is_stopword_dictionary)
+                        kvs[phrase][(lang, dictionary_name)] = (is_canonical, canonical)
 
-        kvs = [(k, '|'.join([v, str(int(c)), str(int(s))])) for k, vals in kvs.iteritems() for v, (c, s) in vals.iteritems()]
+        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d), (i, c) in vals.iteritems()]
 
         self.trie = BytesTrie(kvs)
         self.configured = True
@@ -140,15 +144,15 @@ class DictionaryPhraseFilter(PhraseFilter):
 
                 suffix_search, suffix_len = self.search_suffix(token)
                 if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
-                    yield (t, PHRASE, suffix_search)
+                    yield (t, PHRASE, map(safe_decode, suffix_search))
                     continue
                 prefix_search, prefix_len = self.search_prefix(token)
                 if prefix_search and self.trie.get(token[:prefix_len]):
-                    yield (t, PHRASE, prefix_search)
+                    yield (t, PHRASE, map(safe_decode, prefix_search))
                     continue
             else:
                 c = PHRASE
-            yield t, c, data
+            yield t, c, map(safe_decode, data)
 
 STREET_TYPES_DICTIONARIES = ('street_types.txt',
                              'directionals.txt',
@@ -192,8 +196,9 @@ script_languages = {}
 
 def init_disambiguation():
     global char_scripts, script_languages
-    char_scripts = get_chars_by_script()
-    script_languages = {script: set(langs) for script, langs in get_script_languages().iteritems()}
+    char_scripts[:] = []
+    char_scripts.extend(get_chars_by_script())
+    script_languages.update({script: set(langs) for script, langs in get_script_languages().iteritems()})
 
 UNKNOWN_SCRIPT = 'Unknown'
 COMMON_SCRIPT = 'Common'
@@ -257,24 +262,24 @@ def disambiguate_language(text, languages):
         if c is PHRASE:
             valid = []
             data = [d.split('|') for d in data]
-            potentials = [l for l, c, s in data if l in valid_languages]
+            potentials = [l for l, d, i, c in data if l in valid_languages]
 
-            for lang, canonical, stopword in data:
-                canonical = int(canonical)
-                stopword = int(stopword)
-                if lang not in valid_languages or (stopword and len(potentials) > 1):
+            for lang, dictionary, is_canonical, canonical in data:
+                is_canonical = int(is_canonical)
+                is_stopword = dictionary == 'stopword'
+                if lang not in valid_languages or (is_stopword and len(potentials) > 1):
                     continue
                 is_default = valid_languages[lang]
 
                 lang_valid = is_default or not seen_languages or lang in seen_languages
 
-                if lang_valid and ((canonical and not stopword) or (is_default and len(potentials) == 1)):
+                if lang_valid and ((is_canonical and not is_stopword) or (is_default and len(potentials) == 1)):
                     valid.append(lang)
                 elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
                     return AMBIGUOUS_LANGUAGE
-                elif stopword and canonical and not is_default and lang in seen_languages:
+                elif is_stopword and is_canonical and not is_default and lang in seen_languages:
                     valid.append(lang)
-                elif not seen_languages and len(potentials) == 1 and len(t[0][1]) > 1:
+                elif not seen_languages and len(potentials) == 1 and len(t[0][0]) > 1:
                     possible_lang = lang if possible_lang is None or possible_lang == lang else None
 
             if seen_languages and valid and not any((l in seen_languages for l in valid)) and \