[fix] token_types.PHRASE
This commit is contained in:
@@ -63,7 +63,7 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
|
|||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
for t, c, length, data in gazetteer.filter(norm_tokens):
|
for t, c, length, data in gazetteer.filter(norm_tokens):
|
||||||
if c is PHRASE:
|
if c == token_types.PHRASE:
|
||||||
valid = []
|
valid = []
|
||||||
data = [d.split('|') for d in data]
|
data = [d.split('|') for d in data]
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from geodata.string_utils import wide_iter, wide_ord
|
|||||||
from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
|
from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
|
||||||
from geodata.text.normalize import normalized_tokens, normalize_string
|
from geodata.text.normalize import normalized_tokens, normalize_string
|
||||||
from geodata.text.tokenize import tokenize
|
from geodata.text.tokenize import tokenize
|
||||||
|
from geodata.text.token_types import token_types
|
||||||
|
|
||||||
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
|
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
|
||||||
|
|
||||||
@@ -100,7 +101,7 @@ def disambiguate_language(text, languages):
|
|||||||
seen_languages = set()
|
seen_languages = set()
|
||||||
|
|
||||||
for t, c, l, data in street_types_gazetteer.filter(tokens):
|
for t, c, l, data in street_types_gazetteer.filter(tokens):
|
||||||
if c is PHRASE:
|
if c == token_types.PHRASE:
|
||||||
valid = OrderedDict()
|
valid = OrderedDict()
|
||||||
data = [safe_decode(d).split(u'|') for d in data]
|
data = [safe_decode(d).split(u'|') for d in data]
|
||||||
potentials = set([l for l, d, i, c in data if l in valid_languages])
|
potentials = set([l for l, d, i, c in data if l in valid_languages])
|
||||||
|
|||||||
Reference in New Issue
Block a user