From a419dad63079eab977a4338b3aa6ec933d6349d6 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sun, 23 Aug 2015 00:43:37 -0400
Subject: [PATCH] [languages] Adding canonical back in to language
 disambiguation (for prefixes/suffixes too), using
 non-canonicals/abbreviations in non-default languages if there are no other
 abbreviations found, adding in stopwords dictionaries

---
 resources/dictionaries/en/stopwords.txt       |  3 +++
 scripts/geodata/language_id/disambiguation.py | 27 +++++++++++++------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/resources/dictionaries/en/stopwords.txt b/resources/dictionaries/en/stopwords.txt
index 219a7b8f..80d8e611 100644
--- a/resources/dictionaries/en/stopwords.txt
+++ b/resources/dictionaries/en/stopwords.txt
@@ -1,5 +1,8 @@
 and
 between|betw|btwn|btw|btween|b/t
 by
+of
 near
+the
+to
 via
\ No newline at end of file
diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py
index f0c2bf16..b9eea796 100644
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -62,17 +62,20 @@ class DictionaryPhraseFilter(PhraseFilter):
                     canonical = strip_accents(phrases[0])
 
                     for phrase in phrases:
+
                         if phrase in POSSIBLE_ROMAN_NUMERALS:
                             continue
+
+                        is_canonical = strip_accents(phrase) == canonical
+
                         if is_suffix_dictionary:
                             phrase = SUFFIX_KEY + phrase[::-1]
                         elif is_prefix_dictionary:
                             phrase = PREFIX_KEY + phrase
 
-                        if strip_accents(phrase) == canonical:
-                            kvs[phrase][lang] = None
+                        kvs[phrase][lang] = is_canonical
 
-        kvs = [(k, v) for k, vals in kvs.iteritems() for v in vals.iterkeys()]
+        kvs = [(k, '|'.join([v, str(int(c))])) for k, vals in kvs.iteritems() for v, c in vals.iteritems()]
 
         self.trie = BytesTrie(kvs)
         self.configured = True
@@ -113,7 +116,8 @@ street_types_gazetteer = DictionaryPhraseFilter('street_types.txt',
                                                 'directionals.txt',
                                                 'concatenated_suffixes_separable.txt',
                                                 'concatenated_suffixes_inseparable.txt',
-                                                'concatenated_prefixes_separable.txt')
+                                                'concatenated_prefixes_separable.txt',
+                                                'stopwords.txt',)
 
 
 UNKNOWN_LANGUAGE = 'unk'
@@ -131,8 +135,15 @@ def disambiguate_language(text, languages):
     for c, t, data in street_types_gazetteer.filter(tokens):
 
         if c == token_types.PHRASE:
-            valid = [l for l in data if l in valid_languages]
+            valid = []
+            for d in data:
+                lang, canonical = d.split('|')
+                canonical = int(canonical)
+                if lang not in valid_languages:
+                    continue
 
+                if canonical or not seen_languages:
+                    valid.append(lang)
             if seen_languages and valid and not any((l in seen_languages for l in valid)):
                 return AMBIGUOUS_LANGUAGE
 
@@ -140,9 +151,9 @@ def disambiguate_language(text, languages):
                 current_lang = valid[0]
             else:
                 valid = [l for l in valid if valid_languages.get(l)]
-                if len(valid) == 1:
-                    if current_lang is not None and valid[0] != current_lang:
-                        return AMBIGUOUS_LANGUAGE
+                if len(valid) == 1 and current_lang is not None and valid[0] != current_lang:
+                    return AMBIGUOUS_LANGUAGE
+                elif len(valid) == 1:
                     current_lang = valid[0]
 
             seen_languages.update(valid)