From f04360732c4820aaef43bba9f175e04e6906bbfd Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Jan 2016 03:17:18 -0500 Subject: [PATCH] [languages] Single character cannot be sufficient to disambiguate with multiple languages (Avenue A for example) --- scripts/geodata/language_id/disambiguation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index df927028..6023c82c 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -102,7 +102,7 @@ def disambiguate_language(text, languages): for t, c, l, data in street_types_gazetteer.filter(tokens): if c is PHRASE: valid = [] - data = [d.split('|') for d in data] + data = [safe_decode(d).split(u'|') for d in data] potentials = [l for l, d, i, c in data if l in valid_languages] for lang, dictionary, is_canonical, canonical in data: @@ -114,7 +114,7 @@ def disambiguate_language(text, languages): lang_valid = is_default or not seen_languages or lang in seen_languages - if lang_valid and ((is_canonical and not is_stopword) or (is_default and len(potentials) == 1)): + if lang_valid and len(t[0][0]) > 1 and ((is_canonical and not is_stopword) or (is_default and len(potentials) == 1)): valid.append(lang) elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang: return AMBIGUOUS_LANGUAGE