[languages] Using stopwords only to account for how ambiguous a phrase is, not for disambiguation
This commit is contained in:
@@ -9,13 +9,11 @@ del
|
|||||||
dela
|
dela
|
||||||
des
|
des
|
||||||
detras
|
detras
|
||||||
e
|
|
||||||
el
|
el
|
||||||
en
|
en
|
||||||
encima
|
encima
|
||||||
enfrente
|
enfrente
|
||||||
entre
|
entre
|
||||||
i
|
|
||||||
junta
|
junta
|
||||||
hasta
|
hasta
|
||||||
la
|
la
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ class DictionaryPhraseFilter(PhraseFilter):
|
|||||||
is_suffix_dictionary = 'suffixes' in filename
|
is_suffix_dictionary = 'suffixes' in filename
|
||||||
is_prefix_dictionary = 'prefixes' in filename
|
is_prefix_dictionary = 'prefixes' in filename
|
||||||
is_street_types_dictionary = 'street_types' in filename
|
is_street_types_dictionary = 'street_types' in filename
|
||||||
|
is_stopword_dictionary = 'stopwords' in filename
|
||||||
|
|
||||||
path = os.path.join(DICTIONARIES_DIR, lang, filename)
|
path = os.path.join(DICTIONARIES_DIR, lang, filename)
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
@@ -76,9 +77,9 @@ class DictionaryPhraseFilter(PhraseFilter):
|
|||||||
phrase = PREFIX_KEY + phrase
|
phrase = PREFIX_KEY + phrase
|
||||||
|
|
||||||
if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
|
if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
|
||||||
kvs[phrase][lang] = is_canonical
|
kvs[phrase][lang] = (is_canonical, is_stopword_dictionary)
|
||||||
|
|
||||||
kvs = [(k, '|'.join([v, str(int(c))])) for k, vals in kvs.iteritems() for v, c in vals.iteritems()]
|
kvs = [(k, '|'.join([v, str(int(c)), str(int(s))])) for k, vals in kvs.iteritems() for v, (c, s) in vals.iteritems()]
|
||||||
|
|
||||||
self.trie = BytesTrie(kvs)
|
self.trie = BytesTrie(kvs)
|
||||||
self.configured = True
|
self.configured = True
|
||||||
@@ -165,15 +166,17 @@ def disambiguate_language(text, languages):
|
|||||||
for c, t, data in street_types_gazetteer.filter(tokens):
|
for c, t, data in street_types_gazetteer.filter(tokens):
|
||||||
if c == token_types.PHRASE:
|
if c == token_types.PHRASE:
|
||||||
valid = []
|
valid = []
|
||||||
|
data = [d.split('|') for d in data]
|
||||||
|
potentials = [l for l, c, s in data if l in valid_languages]
|
||||||
|
|
||||||
for d in data:
|
for lang, canonical, stopword in data:
|
||||||
lang, canonical = d.split('|')
|
|
||||||
canonical = int(canonical)
|
canonical = int(canonical)
|
||||||
if lang not in valid_languages:
|
stopword = int(stopword)
|
||||||
|
if lang not in valid_languages or stopword:
|
||||||
continue
|
continue
|
||||||
is_default = valid_languages[lang]
|
is_default = valid_languages[lang]
|
||||||
|
|
||||||
if canonical or (is_default and not current_lang):
|
if canonical or (is_default and len(potentials) == 1):
|
||||||
valid.append(lang)
|
valid.append(lang)
|
||||||
elif is_default and num_defaults > 1 and current_lang != lang:
|
elif is_default and num_defaults > 1 and current_lang != lang:
|
||||||
return AMBIGUOUS_LANGUAGE
|
return AMBIGUOUS_LANGUAGE
|
||||||
|
|||||||
Reference in New Issue
Block a user