[fix] token_types.PHRASE

This commit is contained in:
Al
2016-04-28 17:21:58 -04:00
parent 5d34500b63
commit 9088ba6df6
2 changed files with 3 additions and 2 deletions

View File

@@ -63,7 +63,7 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
i = 0
for t, c, length, data in gazetteer.filter(norm_tokens):
-if c is PHRASE:
+if c == token_types.PHRASE:
valid = []
data = [d.split('|') for d in data]

View File

@@ -14,6 +14,7 @@ from geodata.string_utils import wide_iter, wide_ord
from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize
+from geodata.text.token_types import token_types
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
@@ -100,7 +101,7 @@ def disambiguate_language(text, languages):
seen_languages = set()
for t, c, l, data in street_types_gazetteer.filter(tokens):
-if c is PHRASE:
+if c == token_types.PHRASE:
valid = OrderedDict()
data = [safe_decode(d).split(u'|') for d in data]
potentials = set([l for l, d, i, c in data if l in valid_languages])