diff --git a/scripts/geodata/address_expansions/abbreviations.py b/scripts/geodata/address_expansions/abbreviations.py index bab1cef4..01b3d4ae 100644 --- a/scripts/geodata/address_expansions/abbreviations.py +++ b/scripts/geodata/address_expansions/abbreviations.py @@ -63,7 +63,7 @@ def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2): i = 0 for t, c, length, data in gazetteer.filter(norm_tokens): - if c is PHRASE: + if c == token_types.PHRASE: valid = [] data = [d.split('|') for d in data] diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index 1b015117..d988b517 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -14,6 +14,7 @@ from geodata.string_utils import wide_iter, wide_ord from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages from geodata.text.normalize import normalized_tokens, normalize_string from geodata.text.tokenize import tokenize +from geodata.text.token_types import token_types WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt']) @@ -100,7 +101,7 @@ def disambiguate_language(text, languages): seen_languages = set() for t, c, l, data in street_types_gazetteer.filter(tokens): - if c is PHRASE: + if c == token_types.PHRASE: valid = OrderedDict() data = [safe_decode(d).split(u'|') for d in data] potentials = set([l for l, d, i, c in data if l in valid_languages])