From b713d102d1bb2ab55afc0ae3296d489392992b42 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Jan 2016 17:43:14 -0500 Subject: [PATCH] [languages] using whole phrase len, not first token, in disambiguation. Using single unambiguous observed default language or unambiguous observed language --- scripts/geodata/language_id/disambiguation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index f36d7c37..be5e5f67 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -104,7 +104,9 @@ def disambiguate_language(text, languages): valid = OrderedDict() data = [safe_decode(d).split(u'|') for d in data] potentials = [l for l, d, i, c in data if l in valid_languages] + potential_defaults = [l for l in potentials if valid_languages[l]] + phrase_len = sum((len(t_i[0]) for t_i in t)) for lang, dictionary, is_canonical, canonical in data: is_canonical = int(is_canonical) is_stopword = dictionary == 'stopword' @@ -114,13 +116,13 @@ def disambiguate_language(text, languages): lang_valid = is_default or not seen_languages or lang in seen_languages - if lang_valid and len(t[0][0]) > 1 and ((is_canonical and not is_stopword) or (is_default and num_defaults == 1)): + if lang_valid and phrase_len > 1 and ((is_canonical and not is_stopword) or (is_default and (len(potentials) == 1 or len(potential_defaults) == 1))): valid[lang] = 1 elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang: return AMBIGUOUS_LANGUAGE elif is_stopword and is_canonical and not is_default and lang in seen_languages: valid[lang] = 1 - elif not seen_languages and len(potentials) == 1 and len(t[0][0]) > 1: + elif not seen_languages and len(potentials) == 1 and phrase_len > 1: possible_lang = lang if possible_lang is None or possible_lang == lang else None if seen_languages and valid and not any((l in seen_languages for l in valid)) and \