From a419dad63079eab977a4338b3aa6ec933d6349d6 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 23 Aug 2015 00:43:37 -0400 Subject: [PATCH] [languages] Adding canonical back in to language disambiguation (for prefixes/suffixes too), using non-canonicals/abbreviations in non-default languages if there are no other abbreviations found, adding in stopwords dictionaries --- resources/dictionaries/en/stopwords.txt | 3 +++ scripts/geodata/language_id/disambiguation.py | 27 +++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/resources/dictionaries/en/stopwords.txt b/resources/dictionaries/en/stopwords.txt index 219a7b8f..80d8e611 100644 --- a/resources/dictionaries/en/stopwords.txt +++ b/resources/dictionaries/en/stopwords.txt @@ -1,5 +1,8 @@ and between|betw|btwn|btw|btween|b/t by +of near +the +to via \ No newline at end of file diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index f0c2bf16..b9eea796 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -62,17 +62,20 @@ class DictionaryPhraseFilter(PhraseFilter): canonical = strip_accents(phrases[0]) for phrase in phrases: + if phrase in POSSIBLE_ROMAN_NUMERALS: continue + + is_canonical = strip_accents(phrase) == canonical + if is_suffix_dictionary: phrase = SUFFIX_KEY + phrase[::-1] elif is_prefix_dictionary: phrase = PREFIX_KEY + phrase - if strip_accents(phrase) == canonical: - kvs[phrase][lang] = None + kvs[phrase][lang] = is_canonical - kvs = [(k, v) for k, vals in kvs.iteritems() for v in vals.iterkeys()] + kvs = [(k, '|'.join([v, str(int(c))])) for k, vals in kvs.iteritems() for v, c in vals.iteritems()] self.trie = BytesTrie(kvs) self.configured = True @@ -113,7 +116,8 @@ street_types_gazetteer = DictionaryPhraseFilter('street_types.txt', 'directionals.txt', 'concatenated_suffixes_separable.txt', 'concatenated_suffixes_inseparable.txt', - 'concatenated_prefixes_separable.txt') + 'concatenated_prefixes_separable.txt', + 'stopwords.txt',) UNKNOWN_LANGUAGE = 'unk' @@ -131,8 +135,15 @@ def disambiguate_language(text, languages): for c, t, data in street_types_gazetteer.filter(tokens): if c == token_types.PHRASE: - valid = [l for l in data if l in valid_languages] + valid = [] + for d in data: + lang, canonical = d.split('|') + canonical = int(canonical) + if lang not in valid_languages: + continue + if canonical or not seen_languages: + valid.append(lang) if seen_languages and valid and not any((l in seen_languages for l in valid)): return AMBIGUOUS_LANGUAGE @@ -140,9 +151,9 @@ def disambiguate_language(text, languages): current_lang = valid[0] else: valid = [l for l in valid if valid_languages.get(l)] - if len(valid) == 1: - if current_lang is not None and valid[0] != current_lang: - return AMBIGUOUS_LANGUAGE + if len(valid) == 1 and current_lang is not None and valid[0] != current_lang: + return AMBIGUOUS_LANGUAGE + elif len(valid) == 1: current_lang = valid[0] seen_languages.update(valid)