diff --git a/scripts/geodata/address_expansions/gazetteers.py b/scripts/geodata/address_expansions/gazetteers.py
index 33ef5fe1..3da015b0 100644
--- a/scripts/geodata/address_expansions/gazetteers.py
+++ b/scripts/geodata/address_expansions/gazetteers.py
@@ -9,6 +9,7 @@ from geodata.i18n.unicode_paths import DATA_DIR
 from geodata.text.normalize import normalized_tokens, normalize_string
 from geodata.text.tokenize import tokenize, token_types
 from geodata.text.phrases import PhraseFilter
+from geodata.enum import EnumValue
 
 from marisa_trie import BytesTrie
 
@@ -24,8 +25,6 @@ POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii',
                                'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
                                'm', 'mm', 'mmm', 'mmmm'])
 
-PHRASE = 'PHRASE'
-
 
 class DictionaryPhraseFilter(PhraseFilter):
 
@@ -37,14 +36,14 @@ class DictionaryPhraseFilter(PhraseFilter):
 
         for language in address_phrase_dictionaries.languages:
             for dictionary_name in self.dictionaries:
-                is_suffix_dictionary = 'suffixes' in filename
-                is_prefix_dictionary = 'prefixes' in filename
+                is_suffix_dictionary = 'suffixes' in dictionary_name
+                is_prefix_dictionary = 'prefixes' in dictionary_name
 
                 for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
                     canonical = phrases[0]
                     canonical_normalized = normalize_string(canonical)
 
-                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
+                    self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]
 
                     for i, phrase in enumerate(phrases):
 
@@ -58,7 +57,7 @@ class DictionaryPhraseFilter(PhraseFilter):
                         elif is_prefix_dictionary:
                             phrase = PREFIX_KEY + phrase
 
-                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical
+                        kvs[phrase][(language, dictionary_name, canonical)] = is_canonical
 
         kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]
 
@@ -107,14 +106,14 @@ class DictionaryPhraseFilter(PhraseFilter):
                 suffix_search, suffix_len = self.search_suffix(token)
                 if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
-                    yield ([(t, c)], PHRASE, suffix_len, map(safe_decode, suffix_search))
+                    yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
                     continue
                 prefix_search, prefix_len = self.search_prefix(token)
                 if prefix_search and self.trie.get(token[:prefix_len]):
-                    yield ([(t, c)], PHRASE, prefix_len, map(safe_decode, prefix_search))
+                    yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
                     continue
             else:
-                c = PHRASE
+                c = token_types.PHRASE
 
             yield t, c, len(t), map(safe_decode, data)
 
 STREET_TYPES_DICTIONARIES = ('street_types',
@@ -155,6 +154,7 @@ UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement',
                                   'level_types_numbered',
                                   'level_types_standalone',
                                   'level_types_sub_basement',
+                                  'number',
                                   'post_office',
                                   'unit_types_numbered',
                                   'unit_types_standalone',
diff --git a/scripts/geodata/text/token_types.py b/scripts/geodata/text/token_types.py
index 021b7918..e56afab8 100644
--- a/scripts/geodata/text/token_types.py
+++ b/scripts/geodata/text/token_types.py
@@ -52,6 +52,9 @@ class token_types(Enum):
     WHITESPACE = EnumValue(300)
     NEWLINE = EnumValue(301)
 
+    # Phrase, special application-level type not returned by the tokenizer
+    PHRASE = EnumValue(999)
+
     WORD_TOKEN_TYPES = set([
         WORD,
         ABBREVIATION,
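
For context on the main change: the ad-hoc module-level `PHRASE = 'PHRASE'` string in gazetteers.py is replaced by a shared `token_types.PHRASE` enum value, so gazetteer output uses the same token-class namespace as everything the tokenizer emits. The sketch below is a minimal, self-contained illustration of that design choice only; `EnumValueStub` and the tiny `token_types` class are hypothetical stand-ins, not the real `geodata.enum` / `geodata.text.token_types` modules.

```python
# Minimal sketch: why a shared enum member beats a module-local string constant.
# All names here are stand-ins for illustration, not the real geodata modules.


class EnumValueStub(object):
    """Stand-in for geodata's EnumValue: a distinct object wrapping an int."""
    def __init__(self, value):
        self.value = value


class token_types(object):
    WORD = EnumValueStub(1)
    NUMERIC = EnumValueStub(50)
    # Application-level class added by this diff; 999 keeps it well clear of
    # the ranges the tokenizer itself produces.
    PHRASE = EnumValueStub(999)


def describe(token, token_class):
    # Consumers branch on one namespace for both tokenizer-produced classes
    # and the gazetteer's synthetic PHRASE class, instead of comparing
    # against a stray 'PHRASE' string defined in gazetteers.py.
    if token_class is token_types.PHRASE:
        return '{} (gazetteer phrase)'.format(token)
    return '{} (ordinary token)'.format(token)


print(describe('rd', token_types.PHRASE))    # rd (gazetteer phrase)
print(describe('123', token_types.NUMERIC))  # 123 (ordinary token)
```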