diff --git a/scripts/geodata/address_expansions/gazetteers.py b/scripts/geodata/address_expansions/gazetteers.py
new file mode 100644
index 00000000..b47a486f
--- /dev/null
+++ b/scripts/geodata/address_expansions/gazetteers.py
@@ -0,0 +1,195 @@
+import os
+import sys
+
+from collections import defaultdict, OrderedDict
+
+from geodata.encoding import safe_decode, safe_encode
+from geodata.i18n.unicode_paths import DATA_DIR
+from geodata.text.normalize import normalized_tokens, normalize_string
+from geodata.text.tokenize import tokenize, token_types
+from geodata.text.phrases import PhraseFilter
+
+from marisa_trie import BytesTrie
+
+
+DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
+
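+# Control-character markers that namespace the trie keys: prefix phrases are
+# stored as PREFIX_KEY + phrase and suffix phrases as SUFFIX_KEY + reversed
+# phrase, so a single BytesTrie can serve whole-phrase, prefix and suffix lookups.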
+PREFIX_KEY = u'\x02'
+SUFFIX_KEY = u'\x03'
+
+POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
+                               'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
+                               'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
+                               'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
+                               'm', 'mm', 'mmm', 'mmmm'])
+
+PHRASE = 'PHRASE'
+
+
+class DictionaryPhraseFilter(PhraseFilter):
+
+    def __init__(self, *dictionaries):
+        self.dictionaries = dictionaries
+        self.canonicals = {}
+
+    def serialize(self, s):
+        return s
+
+    def deserialize(self, s):
+        return s
+
+    def configure(self, base_dir=DICTIONARIES_DIR):
+        kvs = defaultdict(OrderedDict)
+        for lang in os.listdir(base_dir):
+            for filename in self.dictionaries:
+                is_suffix_dictionary = 'suffixes' in filename
+                is_prefix_dictionary = 'prefixes' in filename
+
+                dictionary_name = filename.split('.', 1)[0]
+
+                path = os.path.join(base_dir, lang, filename)
+                if not os.path.exists(path):
+                    continue
+
+                for line in open(path):
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    phrases = safe_decode(line).split(u'|')
+                    if not phrases:
+                        continue
+
+                    canonical = phrases[0]
+                    canonical_normalized = normalize_string(canonical)
+
+                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
+
+                    for i, phrase in enumerate(phrases):
+
+                        if phrase in POSSIBLE_ROMAN_NUMERALS:
+                            continue
+
+                        is_canonical = normalize_string(phrase) == canonical_normalized
+
+                        if is_suffix_dictionary:
+                            phrase = SUFFIX_KEY + phrase[::-1]
+                        elif is_prefix_dictionary:
+                            phrase = PREFIX_KEY + phrase
+
+                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical
+
+        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]
+
+        self.trie = BytesTrie(kvs)
+        self.configured = True
+
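+    # Consume characters of s for as long as the trie still has keys with
+    # that prefix; returns (values, match_length) for the longest match,
+    # or (None, 0) when nothing matches.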
+    def search_substring(self, s):
+        if len(s) == 0:
+            return None, 0
+
+        for i in xrange(len(s) + 1):
+            if not self.trie.has_keys_with_prefix(s[:i]):
+                i -= 1
+                break
+        if i > 0:
+            return (self.trie.get(s[:i]), i)
+        else:
+            return None, 0
+
+    def search_suffix(self, token):
+        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
+        if suffix_len > 0:
+            suffix_len -= len(SUFFIX_KEY)
+        return suffix_search, suffix_len
+
+    def search_prefix(self, token):
+        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
+        if prefix_len > 0:
+            prefix_len -= len(PREFIX_KEY)
+        return prefix_search, prefix_len
+
+    def basic_filter(self, tokens):
+        return super(DictionaryPhraseFilter, self).filter(tokens)
+
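+    # Extends the base phrase filter: tokens that did not match a full phrase
+    # are additionally checked for dictionary prefixes and suffixes; matches
+    # are yielded as (tokens, PHRASE, match_length, data) 4-tuples.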
+    def filter(self, tokens):
+        for p, t, data in self.basic_filter(tokens):
+            if not p:
+                t, c = t
+                token = t
+                token_len = len(token)
+
+                suffix_search, suffix_len = self.search_suffix(token)
+                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
+                    yield ([(t, c)], PHRASE, suffix_len, map(safe_decode, suffix_search))
+                    continue
+                prefix_search, prefix_len = self.search_prefix(token)
+                if prefix_search and self.trie.get(token[:prefix_len]):
+                    yield ([(t, c)], PHRASE, prefix_len, map(safe_decode, prefix_search))
+                    continue
+            else:
+                c = PHRASE
+            yield t, c, len(t), map(safe_decode, data)
+
+STREET_TYPES_DICTIONARIES = ('street_types.txt',
+                             'directionals.txt',
+                             'concatenated_suffixes_separable.txt',
+                             'concatenated_suffixes_inseparable.txt',
+                             'concatenated_prefixes_separable.txt',
+                             'organizations.txt',
+                             'people.txt',
+                             'personal_suffixes.txt',
+                             'personal_titles.txt',
+                             'qualifiers.txt',
+                             'stopwords.txt',)
+
+GIVEN_NAME_DICTIONARY = 'given_names.txt'
+SURNAME_DICTIONARY = 'surnames.txt'
+
+NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
+                     SURNAME_DICTIONARY,)
+
+
+NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees.txt',
+                                                              'building_types.txt',
+                                                              'company_types.txt',
+                                                              'place_names.txt',
+                                                              'qualifiers.txt',
+                                                              'synonyms.txt',
+                                                              'toponyms.txt',
+                                                              )
+
+
+UNIT_ABBREVIATION_DICTIONARIES = ('level_types.txt',
+                                  'post_office.txt',
+                                  'unit_types.txt',
+                                  )
+
+
+ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
+    NAME_ABBREVIATION_DICTIONARIES + \
+    UNIT_ABBREVIATION_DICTIONARIES + \
+    ('no_number.txt', 'nulls.txt',)
+
+
+_gazetteers = []
+
+
+def create_gazetteer(*dictionaries):
+    g = DictionaryPhraseFilter(*dictionaries)
+    _gazetteers.append(g)
+    return g
+
+
+street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
+names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES)
+unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
+street_and_unit_types_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + UNIT_ABBREVIATION_DICTIONARIES))
+abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
+given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
+
+
+def init_gazetteers():
+    for g in _gazetteers:
+        g.configure()
diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py
index 026709d2..581ab28f 100644
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -3,21 +3,17 @@ import sys
 
 from collections import defaultdict, OrderedDict
 
-from marisa_trie import BytesTrie
-
 this_dir = os.path.realpath(os.path.dirname(__file__))
 
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 
+from geodata.address_expansions.gazetteers import *
 from geodata.encoding import safe_decode, safe_encode
 from geodata.string_utils import wide_iter, wide_ord
-from geodata.i18n.unicode_paths import DATA_DIR
-from geodata.i18n.normalize import strip_accents
 from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
 from geodata.text.normalize import normalized_tokens, normalize_string
-from geodata.text.tokenize import tokenize, token_types
-from geodata.text.phrases import PhraseFilter
+from geodata.text.tokenize import tokenize
 
 WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
 
@@ -35,184 +31,6 @@ WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
     'pt': set(['pt', 'br']),
 }
 
-DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
-
-PREFIX_KEY = u'\x02'
-SUFFIX_KEY = u'\x03'
-
-POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
-                               'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
-                               'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
-                               'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
-                               'm', 'mm', 'mmm', 'mmmm'])
-
-PHRASE = 'PHRASE'
-
-
-class DictionaryPhraseFilter(PhraseFilter):
-
-    def __init__(self, *dictionaries):
-        self.dictionaries = dictionaries
-        self.canonicals = {}
-
-    def serialize(self, s):
-        return s
-
-    def deserialize(self, s):
-        return s
-
-    def configure(self, base_dir=DICTIONARIES_DIR):
-        kvs = defaultdict(OrderedDict)
-        for lang in os.listdir(DICTIONARIES_DIR):
-            for filename in self.dictionaries:
-                is_suffix_dictionary = 'suffixes' in filename
-                is_prefix_dictionary = 'prefixes' in filename
-
-                dictionary_name = filename.split('.', 1)[0]
-
-                path = os.path.join(DICTIONARIES_DIR, lang, filename)
-                if not os.path.exists(path):
-                    continue
-
-                for line in open(path):
-                    line = line.strip()
-                    if not line:
-                        continue
-
-                    phrases = safe_decode(line).split(u'|')
-                    if not phrases:
-                        continue
-
-                    canonical = phrases[0]
-                    canonical_normalized = normalize_string(canonical)
-
-                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]
-
-                    for i, phrase in enumerate(phrases):
-
-                        if phrase in POSSIBLE_ROMAN_NUMERALS:
-                            continue
-
-                        is_canonical = normalize_string(phrase) == canonical_normalized
-
-                        if is_suffix_dictionary:
-                            phrase = SUFFIX_KEY + phrase[::-1]
-                        elif is_prefix_dictionary:
-                            phrase = PREFIX_KEY + phrase
-
-                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical
-
-        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]
-
-        self.trie = BytesTrie(kvs)
-        self.configured = True
-
-    def search_substring(self, s):
-        if len(s) == 0:
-            return None, 0
-
-        for i in xrange(len(s) + 1):
-            if not self.trie.has_keys_with_prefix(s[:i]):
-                i -= 1
-                break
-        if i > 0:
-            return (self.trie.get(s[:i]), i)
-        else:
-            return None, 0
-
-    def search_suffix(self, token):
-        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
-        if suffix_len > 0:
-            suffix_len -= len(SUFFIX_KEY)
-        return suffix_search, suffix_len
-
-    def search_prefix(self, token):
-        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
-        if prefix_len > 0:
-            prefix_len -= len(PREFIX_KEY)
-        return prefix_search, prefix_len
-
-    def basic_filter(self, tokens):
-        return super(DictionaryPhraseFilter, self).filter(tokens)
-
-    def filter(self, tokens):
-        for p, t, data in self.basic_filter(tokens):
-            if not p:
-                t, c = t
-                token = t
-                token_len = len(token)
-
-                suffix_search, suffix_len = self.search_suffix(token)
-                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
-                    yield ([(t, c)], PHRASE, suffix_len, map(safe_decode, suffix_search))
-                    continue
-                prefix_search, prefix_len = self.search_prefix(token)
-                if prefix_search and self.trie.get(token[:prefix_len]):
-                    yield ([(t, c)], PHRASE, prefix_len, map(safe_decode, prefix_search))
-                    continue
-            else:
-                c = PHRASE
-            yield t, c, len(t), map(safe_decode, data)
-
-STREET_TYPES_DICTIONARIES = ('street_types.txt',
-                             'directionals.txt',
-                             'concatenated_suffixes_separable.txt',
-                             'concatenated_suffixes_inseparable.txt',
-                             'concatenated_prefixes_separable.txt',
-                             'organizations.txt',
-                             'people.txt',
-                             'personal_suffixes.txt',
-                             'personal_titles.txt',
-                             'qualifiers.txt',
-                             'stopwords.txt',)
-
-GIVEN_NAME_DICTIONARY = 'given_names.txt'
-SURNAME_DICTIONARY = 'surnames.txt'
-
-NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
-                     SURNAME_DICTIONARY,)
-
-
-NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees.txt',
-                                                              'building_types.txt',
-                                                              'company_types.txt',
-                                                              'place_names.txt',
-                                                              'qualifiers.txt',
-                                                              'synonyms.txt',
-                                                              'toponyms.txt',
-                                                              )
-
-
-UNIT_ABBREVIATION_DICTIONARIES = ('level_types.txt',
-                                  'post_office.txt',
-                                  'unit_types.txt',
-                                  )
-
-
-ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
-    NAME_ABBREVIATION_DICTIONARIES + \
-    UNIT_ABBREVIATION_DICTIONARIES + \
-    ('no_number.txt', 'nulls.txt',)
-
-
-gazetteers = []
-
-
-def create_gazetteer(*dictionaries):
-    g = DictionaryPhraseFilter(*dictionaries)
-    gazetteers.append(g)
-    return g
-
-
-street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
-names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES)
-unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
-street_and_unit_types_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + UNIT_ABBREVIATION_DICTIONARIES))
-abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
-given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
-
-
 char_scripts = []
 
 script_languages = {}
diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index 6f09d4c5..56ddc532 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -54,6 +54,7 @@
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 
+from geodata.address_expansions.gazetteers import *
 from geodata.coordinates.conversion import *
 from geodata.countries.country_names import *
 from geodata.geonames.db import GeoNamesDB
@@ -1591,6 +1592,7 @@ if __name__ == '__main__':
     init_country_names()
     init_languages()
     init_disambiguation()
+    init_gazetteers()
 
     language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir)
     osm_rtree = None
@@ -1610,8 +1612,6 @@
     if args.geonames_db:
         geonames = GeoNamesDB(args.geonames_db)
 
-    street_types_gazetteer.configure()
-
     # Can parallelize
     if args.streets_file:
         build_ways_training_data(language_rtree, args.streets_file, args.out_dir)
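
For reference, a minimal usage sketch of the new module (illustrative only, not part of the patch; it assumes the geodata package and its dictionary data are importable, and that normalized_tokens() yields the (token, token_class) pairs the filter expects):

    from geodata.address_expansions.gazetteers import (
        street_types_gazetteer, init_gazetteers)
    from geodata.text.normalize import normalized_tokens

    # Build the BytesTrie for every gazetteer registered via create_gazetteer()
    init_gazetteers()

    tokens = normalized_tokens(u'123 Main St')
    for t, c, length, data in street_types_gazetteer.filter(tokens):
        print c, length, data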