[languages] Adding canonical back into language disambiguation (for prefixes/suffixes too); using non-canonicals/abbreviations in non-default languages if no other abbreviations are found; adding stopwords dictionaries
This commit is contained in:
@@ -1,5 +1,8 @@
|
|||||||
and
|
and
|
||||||
between|betw|btwn|btw|btween|b/t
|
between|betw|btwn|btw|btween|b/t
|
||||||
by
|
by
|
||||||
|
of
|
||||||
near
|
near
|
||||||
|
the
|
||||||
|
to
|
||||||
via
|
via
|
||||||
@@ -62,17 +62,20 @@ class DictionaryPhraseFilter(PhraseFilter):
|
|||||||
canonical = strip_accents(phrases[0])
|
canonical = strip_accents(phrases[0])
|
||||||
|
|
||||||
for phrase in phrases:
|
for phrase in phrases:
|
||||||
|
|
||||||
if phrase in POSSIBLE_ROMAN_NUMERALS:
|
if phrase in POSSIBLE_ROMAN_NUMERALS:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
is_canonical = strip_accents(phrase) == canonical
|
||||||
|
|
||||||
if is_suffix_dictionary:
|
if is_suffix_dictionary:
|
||||||
phrase = SUFFIX_KEY + phrase[::-1]
|
phrase = SUFFIX_KEY + phrase[::-1]
|
||||||
elif is_prefix_dictionary:
|
elif is_prefix_dictionary:
|
||||||
phrase = PREFIX_KEY + phrase
|
phrase = PREFIX_KEY + phrase
|
||||||
|
|
||||||
if strip_accents(phrase) == canonical:
|
kvs[phrase][lang] = is_canonical
|
||||||
kvs[phrase][lang] = None
|
|
||||||
|
|
||||||
kvs = [(k, v) for k, vals in kvs.iteritems() for v in vals.iterkeys()]
|
kvs = [(k, '|'.join([v, str(int(c))])) for k, vals in kvs.iteritems() for v, c in vals.iteritems()]
|
||||||
|
|
||||||
self.trie = BytesTrie(kvs)
|
self.trie = BytesTrie(kvs)
|
||||||
self.configured = True
|
self.configured = True
|
||||||
@@ -113,7 +116,8 @@ street_types_gazetteer = DictionaryPhraseFilter('street_types.txt',
|
|||||||
'directionals.txt',
|
'directionals.txt',
|
||||||
'concatenated_suffixes_separable.txt',
|
'concatenated_suffixes_separable.txt',
|
||||||
'concatenated_suffixes_inseparable.txt',
|
'concatenated_suffixes_inseparable.txt',
|
||||||
'concatenated_prefixes_separable.txt')
|
'concatenated_prefixes_separable.txt',
|
||||||
|
'stopwords.txt',)
|
||||||
|
|
||||||
|
|
||||||
UNKNOWN_LANGUAGE = 'unk'
|
UNKNOWN_LANGUAGE = 'unk'
|
||||||
@@ -131,8 +135,15 @@ def disambiguate_language(text, languages):
|
|||||||
for c, t, data in street_types_gazetteer.filter(tokens):
|
for c, t, data in street_types_gazetteer.filter(tokens):
|
||||||
|
|
||||||
if c == token_types.PHRASE:
|
if c == token_types.PHRASE:
|
||||||
valid = [l for l in data if l in valid_languages]
|
valid = []
|
||||||
|
for d in data:
|
||||||
|
lang, canonical = d.split('|')
|
||||||
|
canonical = int(canonical)
|
||||||
|
if lang not in valid_languages:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if canonical or not seen_languages:
|
||||||
|
valid.append(lang)
|
||||||
if seen_languages and valid and not any((l in seen_languages for l in valid)):
|
if seen_languages and valid and not any((l in seen_languages for l in valid)):
|
||||||
return AMBIGUOUS_LANGUAGE
|
return AMBIGUOUS_LANGUAGE
|
||||||
|
|
||||||
@@ -140,9 +151,9 @@ def disambiguate_language(text, languages):
|
|||||||
current_lang = valid[0]
|
current_lang = valid[0]
|
||||||
else:
|
else:
|
||||||
valid = [l for l in valid if valid_languages.get(l)]
|
valid = [l for l in valid if valid_languages.get(l)]
|
||||||
if len(valid) == 1:
|
if len(valid) == 1 and current_lang is not None and valid[0] != current_lang:
|
||||||
if current_lang is not None and valid[0] != current_lang:
|
return AMBIGUOUS_LANGUAGE
|
||||||
return AMBIGUOUS_LANGUAGE
|
elif len(valid) == 1:
|
||||||
current_lang = valid[0]
|
current_lang = valid[0]
|
||||||
|
|
||||||
seen_languages.update(valid)
|
seen_languages.update(valid)
|
||||||
|
|||||||
Reference in New Issue
Block a user