[fix] token_types.PHRASE

2016-04-28 17:21:58 -04:00
parent e21b793b03
commit 7b42e52c6a
2 changed files with 3 additions and 2 deletions
--- a/scripts/geodata/address_expansions/abbreviations.py
+++ b/scripts/geodata/address_expansions/abbreviations.py
@@ -63,7 +63,7 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
    i = 0

    for t, c, length, data in gazetteer.filter(norm_tokens):
-        if c is PHRASE:
+        if c == token_types.PHRASE:
            valid = []
            data = [d.split('|') for d in data]

--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -14,6 +14,7 @@ from geodata.string_utils import wide_iter, wide_ord
 from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
 from geodata.text.normalize import normalized_tokens, normalize_string
 from geodata.text.tokenize import tokenize
+from geodata.text.token_types import token_types

 WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])

@@ -100,7 +101,7 @@ def disambiguate_language(text, languages):
    seen_languages = set()

    for t, c, l, data in street_types_gazetteer.filter(tokens):
-        if c is PHRASE:
+        if c == token_types.PHRASE:
            valid = OrderedDict()
            data = [safe_decode(d).split(u'|') for d in data]
            potentials = set([l for l, d, i, c in data if l in valid_languages])