diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index ce12df21..6f09d4c5 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -131,6 +131,16 @@ def num_deps(c):
     return len(c.dependencies)
 
 
+RANDOM_VALUE_REPLACEMENTS = {
+    # Outer key: address component label
+    AddressFormatter.COUNTRY: {
+        # Inner key: exact component value -> (replacement, probability of replacing it)
+        'GB': ('UK', 0.3),
+        'United Kingdom': ('UK', 0.3),
+    }
+}
+
+
 OSM_ADDRESS_COMPONENTS_SORTED = sorted(OSM_ADDRESS_COMPONENTS, key=num_deps)
 
 OSM_ADDRESS_COMPONENT_COMBINATIONS = []
@@ -309,6 +319,155 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
     return country, name_language
 
 
+ALL_LANGUAGES = 'all'
+# Capitalization classes, as returned by token_capitalization
+LOWER, UPPER, TITLE, MIXED = range(4)
+
+
+def token_capitalization(s):
+    '''
+    Classify the casing of s as TITLE, LOWER, UPPER or MIXED; title case is
+    tested first, so a lone capital letter such as "A" is reported as TITLE
+    '''
+    for is_match, capitalization in ((s.istitle, TITLE), (s.islower, LOWER), (s.isupper, UPPER)):
+        if is_match():
+            return capitalization
+    return MIXED
+
+
+def recase_abbreviation(expansion, tokens):
+    '''Recase expansion to mirror the casing of the (token, token type) pairs in tokens.'''
+    words = expansion.split()
+    num_tokens, num_words = len(tokens), len(words)
+    if num_tokens > num_words and not any(token_capitalization(tok) == LOWER for tok, _ in tokens):
+        return expansion.upper()
+    if num_tokens != num_words:
+        return u' '.join([word.title() for word in words])
+    recased = []
+    for (tok, _), word in zip(tokens, words):
+        cap = token_capitalization(tok)
+        if cap == LOWER:
+            word = word.lower()
+        elif cap == UPPER:
+            word = word.upper()
+        elif cap != TITLE and tok.lower() == word.lower():
+            word = tok
+        else:
+            word = word.title()
+        recased.append(word)
+    return u' '.join(recased)
+
+
+def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2):
+    '''
+    Abbreviations
+    -------------
+
+    OSM discourages abbreviations, but to make our training data map better
+    to real-world input, we can safely replace the canonical phrase with an
+    abbreviated version and retain the meaning of the words
+
+    Each canonical gazetteer phrase listed for language (or ALL_LANGUAGES) is
+    rewritten with probability abbreviate_prob, at most once per phrase, and
+    concatenated affixes are split off by a space with probability
+    separate_prob. Returns the re-joined string, stripped of outer whitespace.
+    '''
+    tokens = tokenize(s)
+    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
+
+    abbreviated = []
+
+    i = 0
+
+    for t, c, length, data in gazetteer.filter(norm_tokens):
+        if c is PHRASE:
+            data = [d.split('|') for d in data]
+
+            for lang, dictionary, is_canonical, canonical in data:
+                if lang not in (language, ALL_LANGUAGES):
+                    continue
+
+                is_canonical = int(is_canonical)
+                is_prefix = dictionary.startswith('concatenated_prefixes')
+                is_suffix = dictionary.startswith('concatenated_suffixes')
+
+                if not is_canonical or random.random() > abbreviate_prob:
+                    continue
+
+                if not is_prefix and not is_suffix:
+                    abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
+                    token = random.choice(abbreviations) if abbreviations else canonical
+                    token = recase_abbreviation(token, tokens[i:i + len(t)])
+                    abbreviated.append(token)
+                    if t[-1][1] != token_types.IDEOGRAPHIC_CHAR:
+                        abbreviated.append(u' ')
+                elif is_prefix:
+                    token = tokens[i][0]
+                    prefix, token = token[:length], token[length:]
+                    abbreviated.append(prefix)
+                    if random.random() < separate_prob:
+                        abbreviated.append(u' ')
+                    if token.islower():
+                        abbreviated.append(token.title())
+                    else:
+                        abbreviated.append(token)
+                    abbreviated.append(u' ')
+                elif is_suffix:
+                    token = tokens[i][0]
+                    token, suffix = token[:-length], token[-length:]
+
+                    concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])
+
+                    separated_abbreviations = []
+                    phrase = gazetteer.trie.get(suffix.rstrip('.'))
+                    suffix_data = [safe_decode(d).split(u'|') for d in (phrase or [])]
+                    # Distinct loop names: reusing c here would clobber the outer token class
+                    for suffix_lang, suffix_dictionary, _, suffix_canonical in suffix_data:
+                        if suffix_lang == lang and suffix_canonical == canonical:
+                            # Default to [] so a missing canonicals entry cannot break extend()
+                            separated_abbreviations.extend(
+                                gazetteer.canonicals.get((canonical, lang, suffix_dictionary), []))
+
+                    separate = random.random() < separate_prob
+
+                    if concatenated_abbreviations and not separate:
+                        abbreviation = random.choice(concatenated_abbreviations)
+                    elif separated_abbreviations:
+                        abbreviation = random.choice(separated_abbreviations)
+                    else:
+                        abbreviation = canonical
+
+                    abbreviated.append(token)
+                    if separate:
+                        abbreviated.append(u' ')
+                    if suffix.isupper():
+                        abbreviated.append(abbreviation.upper())
+                    elif separate:
+                        abbreviated.append(abbreviation.title())
+                    else:
+                        abbreviated.append(abbreviation)
+                    abbreviated.append(u' ')
+
+                # The branches above are exhaustive, so the phrase has been emitted;
+                # stop here so a second matching entry cannot emit it again
+                break
+            else:
+                # No entry was applied: copy the original tokens through unchanged
+                for j, (t_i, c_i) in enumerate(t):
+                    abbreviated.append(tokens[i + j][0])
+                    if c_i != token_types.IDEOGRAPHIC_CHAR:
+                        abbreviated.append(u' ')
+            i += len(t)
+
+        else:
+            abbreviated.append(tokens[i][0])
+            if (c != token_types.IDEOGRAPHIC_CHAR):
+                abbreviated.append(u' ')
+            i += 1
+
+    return u''.join(abbreviated).strip()
+
+
 def build_ways_training_data(language_rtree, infile, out_dir):
     '''
     Creates a training set for language classification using most OSM ways
@@ -334,7 +493,7 @@ def build_ways_training_data(language_rtree, infile, out_dir):
                 if k in languages:
                     writer.writerow((k, country, tsv_string(s)))
             if i % 1000 == 0 and i > 0:
-                print 'did', i, 'ways'
+                print('did {} ways'.format(i))
             i += 1
     f.close()
 
@@ -362,124 +521,111 @@ def osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude):
     return ret
 
 
-def osm_pick_random_name_key(suffix=''):
-    name_key = ''.join(('name', suffix))
-    raw_name_key = 'name'
-    short_name_key = ''.join(('short_name', suffix))
-    raw_short_name_key = 'short_name'
-    alt_name_key = ''.join(('alt_name', suffix))
-    raw_alt_name_key = 'alt_name'
-    official_name_key = ''.join(('official_name', suffix))
-    raw_official_name_key = 'official_name'
-
-    # Choose which name to use with given probabilities
-    r = random.random()
-    if r < 0.7:
-        # 70% of the time use the name tag
-        key = name_key
-        raw_key = raw_name_key
-    elif r < 0.8:
-        # 10% of the time use the short name
-        key = short_name_key
-        raw_key = raw_short_name_key
-    elif r < 0.9:
-        # 10% of the time use the official name
-        key = official_name_key
-        raw_key = raw_official_name_key
-    else:
-        # 10% of the time use the official name
-        key = alt_name_key
-        raw_key = raw_alt_name_key
-
-    return key, raw_key
-
-
-def build_address_format_training_data(admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, infile, out_dir, tag_components=True):
-    '''
-    Creates formatted address training data for supervised sequence labeling (or potentially 
-    for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.
-
-    Example:
-
-    cs  cz  Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country
-
-    The field structure is similar to other training data created by this script i.e.
-    {language, country, data}. The data field here is a sequence of labeled tokens similar
-    to what we might see in part-of-speech tagging.
-
-
-    This format uses a special character "|" to denote possible breaks in the input (comma, newline).
-
-    Note that for the address parser, we'd like it to be robust to many different types
-    of input, so we may selectively eleminate components
-
-    This information can potentially be used downstream by the sequence model as these
-    breaks may be present at prediction time.
-
-    Example:
-
-    sr      rs      Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic
-
-    This may be useful in learning word representations, statistical phrases, morphology
-    or other models requiring only the sequence of words.
-    '''
-    i = 0
-
-    formatter = AddressFormatter()
-    osm_address_components.configure()
-
-    if tag_components:
-        formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w')
-        writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
-    else:
-        formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w')
-        writer = csv.writer(formatted_file, 'tsv_no_quote')
-
-    remove_keys = OSM_IGNORE_KEYS
-
+class OSMAddressFormatter(object):
     alpha3_codes = {c.alpha2: c.alpha3 for c in pycountry.countries}
 
-    for node_id, value, deps in parse_osm(infile):
-        try:
-            latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
-        except Exception:
-            continue
-
-        country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
-        if not (country and candidate_languages):
-            continue
-
-        for key in remove_keys:
-            _ = value.pop(key, None)
+    def __init__(self, admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, splitter=None):
+        self.admin_rtree = admin_rtree
+        self.language_rtree = language_rtree
+        self.neighborhoods_rtree = neighborhoods_rtree
+        self.quattroshapes_rtree = quattroshapes_rtree
+        self.geonames = geonames
+        self.formatter = AddressFormatter(splitter=splitter)  # splitter is passed through unchanged
+        osm_address_components.configure()  # module-level call, repeated for every instance created
 
+    def pick_language(self, value, candidate_languages, pick_namespaced_language_prob=0.6):
         language = None
 
-        more_than_one_official_language = len(candidate_languages) > 1
+        if len(candidate_languages) == 1:
+            language = candidate_languages[0]['lang']
+        else:
+            street = value.get('addr:street', None)
 
-        if tag_components:
-            if len(candidate_languages) == 1:
-                language = candidate_languages[0]['lang']
+            namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value]
+            # No namespaced street: guess from the text; otherwise maybe promote one namespaced language
+            if street is not None and not namespaced:
+                language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
+            elif namespaced and random.random() < pick_namespaced_language_prob:
+                language = random.choice(namespaced)
+                lang_suffix = ':{}'.format(language)
+                for k in list(value):  # snapshot the keys: the loop body adds entries to value
+                    if k.startswith('addr:') and k.endswith(lang_suffix):
+                        value[k[:-len(lang_suffix)]] = value[k]  # drop the exact suffix (rstrip strips a char set)
             else:
-                street = value.get('addr:street', None)
+                language = UNKNOWN_LANGUAGE
 
-                namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value]
+        return language
 
-                if street is not None and not namespaced:
-                    language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
-                elif namespaced and random.random() < 0.6:
-                    language = random.choice(namespaced)
-                    lang_suffix = ':{}'.format(language)
-                    for k in value:
-                        if k.startswith('addr:') and k.endswith(lang_suffix):
-                            value[k.rstrip(lang_suffix)] = value[k]
-                else:
-                    language = UNKNOWN_LANGUAGE
+    def pick_random_name_key(self, suffix=''):
+        name_key = ''.join(('name', suffix))
+        raw_name_key = 'name'
+        short_name_key = ''.join(('short_name', suffix))
+        raw_short_name_key = 'short_name'
+        alt_name_key = ''.join(('alt_name', suffix))
+        raw_alt_name_key = 'alt_name'
+        official_name_key = ''.join(('official_name', suffix))
+        raw_official_name_key = 'official_name'
 
-        address_components = {k: v for k, v in value.iteritems() if k in formatter.aliases}
-        formatter.replace_aliases(address_components)
+        # Pick the (suffixed key, unsuffixed fallback key) pair with the probabilities below
+        r = random.random()
+        if r < 0.7:
+            # 70% of the time use the name tag
+            key = name_key
+            raw_key = raw_name_key
+        elif r < 0.8:
+            # 10% of the time use the short name
+            key = short_name_key
+            raw_key = raw_short_name_key
+        elif r < 0.9:
+            # 10% of the time use the official name
+            key = official_name_key
+            raw_key = raw_official_name_key
+        else:
+            # 10% of the time use the alt name
+            key = alt_name_key
+            raw_key = raw_alt_name_key
 
-        address_country = address_components.get(AddressFormatter.COUNTRY)
+        return key, raw_key
 
+    def normalize_address_components(self, value):
+        components = dict((k, v) for k, v in value.iteritems() if k in self.formatter.aliases)  # known keys only
+        self.formatter.replace_aliases(components)  # presumably rewrites the keys in place -- confirm
+        return components
+
+    def abbreviated_street(self, street, language, abbreviate_prob=0.3, separate_prob=0.2):
+        '''
+        Street abbreviations
+        --------------------
+
+        Probabilistically abbreviate phrases in street via osm_abbreviate and
+        street_and_unit_types_gazetteer, forwarding language and both
+        probabilities unchanged. Random abbreviations help bridge the gap
+        between OSM addresses and user input and capture non-standard
+        abbreviations/surface forms which may be missing or sparse in OSM.
+        '''
+        return osm_abbreviate(street_and_unit_types_gazetteer, street, language,
+                              abbreviate_prob=abbreviate_prob, separate_prob=separate_prob)
+
+    def abbreviated_venue_name(self, name, language, abbreviate_prob=0.2, separate_prob=0.0):
+        '''
+        Venue abbreviations
+        -------------------
+
+        Probabilistically abbreviate phrases in name via osm_abbreviate and
+        names_gazetteer (not the street/unit type one), forwarding language
+        and both probabilities unchanged. Random abbreviations help bridge the
+        gap between OSM names and user input and capture non-standard
+        abbreviations/surface forms which may be missing or sparse in OSM.
+        '''
+        return osm_abbreviate(names_gazetteer, name, language,
+                              abbreviate_prob=abbreviate_prob, separate_prob=separate_prob)
+
+    def country_name(self, address_components, country_code, language,
+                     use_country_code_prob=0.3,
+                     local_language_name_prob=0.6,
+                     random_language_name_prob=0.1,
+                     alpha_3_iso_code_prob=0.1,
+                     ):
         '''
         Country names
         -------------
@@ -506,14 +652,16 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
 
         non_local_language = None
 
-        if random.random() < 0.3:
+        address_country = address_components.get(AddressFormatter.COUNTRY)
+
+        if random.random() < use_country_code_prob:
             # 30% of the time: add Quattroshapes country
-            address_country = country.upper()
+            address_country = country_code.upper()
 
         r = random.random()
 
         # 1. 60% of the time: use the country name in the current language or the country's local language
-        if address_country and r < 0.6:
+        if address_country and r < local_language_name_prob:
             localized = None
             if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
                 localized = language_country_names.get(language, {}).get(address_country.upper())
@@ -522,25 +670,29 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
                 localized = country_localized_display_name(address_country.lower())
 
             if localized:
-                address_components[AddressFormatter.COUNTRY] = localized
+                address_country = localized
         # 2. 10% of the time: country's name in a language samples from the distribution of languages on the Internet
-        elif address_country and r < 0.7:
+        elif address_country and r < local_language_name_prob + random_language_name_prob:
             non_local_language = sample_random_language()
             lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper())
             if lang_country:
-                address_components[AddressFormatter.COUNTRY] = lang_country
+                address_country = lang_country
         # 3. 10% of the time: use the country's alpha-3 ISO code
-        elif address_country and r < 0.8:
-            iso_code_alpha3 = alpha3_codes.get(address_country)
+        elif address_country and r < local_language_name_prob + random_language_name_prob + alpha_3_iso_code_prob:
+            iso_code_alpha3 = self.alpha3_codes.get(address_country)
             if iso_code_alpha3:
-                address_components[AddressFormatter.COUNTRY] = iso_code_alpha3
+                address_country = iso_code_alpha3
         # 4. Implicit: the rest of the time keep the alpha-2 country code
 
+        return address_country, non_local_language
+
+    def venue_names(self, value):
         '''
         Venue names
         -----------
 
         Some venues have multiple names listed in OSM, grab them all
+        With a certain probability, add None to the list so we drop the name
         '''
 
         venue_names = []
@@ -548,7 +700,9 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
             venue_name = value.get(key)
             if venue_name:
                 venue_names.append(venue_name)
+        return venue_names
 
+    def state_name(self, address_components, country, language, non_local_language=None, state_full_name_prob=0.3):
         '''
         States
         ------
@@ -559,14 +713,38 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
         '''
         address_state = address_components.get(AddressFormatter.STATE)
 
-        if address_state and not non_local_language:
+        if address_state and country and not non_local_language:
             state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language)
 
-            if state_full_name and random.random() < 0.3:
-                address_components[AddressFormatter.STATE] = state_full_name
+            if state_full_name and random.random() < state_full_name_prob:
+                address_state = state_full_name
         elif address_state and non_local_language:
             _ = address_components.pop(AddressFormatter.STATE, None)
+            address_state = None
+        return address_state
 
+    def tag_suffix(self, language, non_local_language, more_than_one_official_language=False):
+        '''Return the ':<lang>' suffix for namespaced OSM name tags, or '' for plain tags.'''
+        if non_local_language is not None:
+            return ':{}'.format(non_local_language)
+        # language is only used when the flag is set and it is neither ambiguous nor unknown
+        if not more_than_one_official_language or language in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
+            return ''
+        return ':{}'.format(language)
+
+    def add_osm_boundaries(self, address_components,
+                           country, language,
+                           latitude, longitude,
+                           osm_suffix='',
+                           non_local_language=None,
+                           random_key=True,
+                           alpha_3_iso_code_prob=0.1,
+                           alpha_2_iso_code_prob=0.2,
+                           simple_country_key_prob=0.4,
+                           replace_with_non_local_prob=0.4,
+                           join_state_district_prob=0.5,
+                           expand_state_prob=0.7
+                           ):
         '''
         OSM boundaries
         --------------
@@ -583,14 +761,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
         include these qualifiers in the training data.
         '''
 
-        osm_components = osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude)
-
-        if non_local_language is not None:
-            osm_suffix = ':{}'.format(non_local_language)
-        elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
-            osm_suffix = ':{}'.format(language)
-        else:
-            osm_suffix = ''
+        osm_components = osm_reverse_geocoded_components(self.admin_rtree, country, latitude, longitude)
 
         name_key = ''.join(('name', osm_suffix))
         raw_name_key = 'name'
@@ -608,24 +779,29 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
             for component, components_values in osm_components.iteritems():
                 seen = set()
 
-                key, raw_key = osm_pick_random_name_key(suffix=osm_suffix)
+                if random_key:
+                    key, raw_key = self.pick_random_name_key(suffix=osm_suffix)
+                else:
+                    key, raw_key = name_key, raw_name_key
 
                 for component_value in components_values:
                     r = random.random()
                     name = None
 
-                    if iso_code3_key in component_value and r < 0.1:
-                        name = component_value[iso_code3_key]
-                    elif iso_code_key in component_value and r < 0.3:
-                        name = component_value[iso_code_key]
-                    elif language == 'en' and not non_local_language and r < 0.7:
-                        # Particularly to address the US (prefer United States,
-                        # not United States of America) but may capture variations
-                        # in other English-speaking countries as well.
-                        if simple_name_key in component_value:
-                            name = component_value[simple_name_key]
-                        elif international_name_key in component_value:
-                            name = component_value[international_name_key]
+                    if component == AddressFormatter.COUNTRY:
+                        if iso_code3_key in component_value and r < alpha_3_iso_code_prob:
+                            name = component_value[iso_code3_key]
+                        elif iso_code_key in component_value and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob:
+                            name = component_value[iso_code_key]
+                        elif language == 'en' and not non_local_language and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob + simple_country_key_prob:
+                            # Particularly to address the US (prefer United States,
+                            # not United States of America) but may capture variations
+                            # in other English-speaking countries as well.
+                            if simple_name_key in component_value:
+                                name = component_value[simple_name_key]
+                            elif international_name_key in component_value:
+                                name = component_value[international_name_key]
+
                     if not name:
                         name = component_value.get(key, component_value.get(raw_key))
 
@@ -640,17 +816,22 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
                         seen.add((component, name))
 
             for component, vals in poly_components.iteritems():
-                if component not in address_components or (non_local_language and random.random() < 0.4):
-                    if component == AddressFormatter.STATE_DISTRICT and random.random() < 0.5:
+                if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
+                    if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob:
                         num = random.randrange(1, len(vals) + 1)
                         val = u', '.join(vals[:num])
                     else:
                         val = random.choice(vals)
 
-                    if component == AddressFormatter.STATE and random.random() < 0.7:
-                        val = STATE_EXPANSIONS.get(address_country, {}).get(val, val)
+                    if component == AddressFormatter.STATE and random.random() < expand_state_prob:
+                        val = STATE_EXPANSIONS.get(country.upper(), {}).get(val, val)
                     address_components[component] = val
 
+    def quattroshapes_city(self, address_components,
+                           latitude, longitude,
+                           language, non_local_language=None,
+                           qs_add_city_prob=0.2,
+                           abbreviated_name_prob=0.1):
         '''
         Quattroshapes/GeoNames cities
         -----------------------------
@@ -661,13 +842,15 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
         reliably use local names, which we'll want for consistency
         '''
 
-        if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < 0.2):
+        city = None
+
+        if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < qs_add_city_prob):
             lang = non_local_language or language
-            quattroshapes_cities = quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True)
+            quattroshapes_cities = self.quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True)
             for result in quattroshapes_cities:
-                if result.get(quattroshapes_rtree.LEVEL) == quattroshapes_rtree.LOCALITY and quattroshapes_rtree.GEONAMES_ID in result:
-                    geonames_id = int(result[quattroshapes_rtree.GEONAMES_ID].split(',')[0])
-                    names = geonames.get_alternate_names(geonames_id)
+                if result.get(self.quattroshapes_rtree.LEVEL) == self.quattroshapes_rtree.LOCALITY and self.quattroshapes_rtree.GEONAMES_ID in result:
+                    geonames_id = int(result[self.quattroshapes_rtree.GEONAMES_ID].split(',')[0])
+                    names = self.geonames.get_alternate_names(geonames_id)
 
                     if not names or lang not in names:
                         continue
@@ -676,20 +859,27 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
                     if 'abbr' not in names or non_local_language:
                         # Use the common city name in the target language
                         city = names[lang][0][0]
-                    elif random.random() < 0.1:
+                    elif random.random() < abbreviated_name_prob:
                         # Use an abbreviation: NYC, BK, SF, etc.
                         city = random.choice(names['abbr'])[0]
 
                     if not city or not city.strip():
                         continue
-                    address_components[AddressFormatter.CITY] = city
+                    return city
                     break
             else:
                 if non_local_language and AddressFormatter.CITY in address_components and (
-                        AddressFormatter.CITY_DISTRICT in osm_components or
-                        AddressFormatter.SUBURB in osm_components):
+                        AddressFormatter.CITY_DISTRICT in address_components or
+                        AddressFormatter.SUBURB in address_components):
                     address_components.pop(AddressFormatter.CITY)
 
+        return city
+
+    def add_neighborhoods(self, address_components,
+                          latitude, longitude,
+                          osm_suffix='',
+                          add_prefix_prob=0.5,
+                          add_neighborhood_prob=0.5):
         '''
         Neighborhoods
         -------------
@@ -702,13 +892,17 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
         on the whole of better quality).
         '''
 
-        neighborhoods = neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
+        neighborhoods = self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
         neighborhood_levels = defaultdict(list)
+
+        name_key = ''.join(('name', osm_suffix))
+        raw_name_key = 'name'
+
         for neighborhood in neighborhoods:
             place_type = neighborhood.get('place')
             polygon_type = neighborhood.get('polygon_type')
 
-            key, raw_key = osm_pick_random_name_key(suffix=osm_suffix)
+            key, raw_key = self.pick_random_name_key(suffix=osm_suffix)
             name = neighborhood.get(key, neighborhood.get(raw_key))
 
             if not name:
@@ -716,7 +910,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
 
                 name_prefix = neighborhood.get('name:prefix')
 
-                if name_prefix and random.random() < 0.5:
+                if name_prefix and random.random() < add_prefix_prob:
                     name = u' '.join([name_prefix, name])
 
             if not name:
@@ -737,9 +931,10 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
             neighborhood_levels[neighborhood_level].append(name)
 
         for component, neighborhoods in neighborhood_levels.iteritems():
-            if component not in address_components and random.random() < 0.5:
+            if component not in address_components and random.random() < add_neighborhood_prob:
                 address_components[component] = neighborhoods[0]
 
+    def normalize_names(self, address_components, replacement_prob=0.6):
         '''
         Name normalization
         ------------------
@@ -751,9 +946,22 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
             if not name:
                 continue
             replacement = replace_name_prefixes(replace_name_suffixes(name))
-            if replacement != name and random.random() < 0.6:
+            if replacement != name and random.random() < replacement_prob:
                 address_components[component] = replacement
 
+    def replace_names(self, address_components):
+        '''
+        Name replacements
+        -----------------
+        Randomly swap selected values for an alias (e.g. GB => UK),
+        using the per-value probabilities in RANDOM_VALUE_REPLACEMENTS.
+        '''
+        for key, current in address_components.iteritems():
+            alias, chance = RANDOM_VALUE_REPLACEMENTS.get(key, {}).get(current, (None, 0.0))
+            if alias is not None and random.random() < chance:
+                address_components[key] = alias
+
+    def prune_duplicate_names(self, address_components):
         '''
         Name deduping
         -------------
@@ -764,7 +972,8 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
 
         name_components = defaultdict(list)
 
-        for component in (AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB):
+        for component in (AddressFormatter.CITY, AddressFormatter.STATE_DISTRICT,
+                          AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB):
             name = address_components.get(component)
             if name:
                 name_components[name].append(component)
@@ -774,7 +983,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
                 for component in components[1:]:
                     address_components.pop(component, None)
 
-
+    def cleanup_house_number(self, address_components):
         '''
         House number cleanup
         --------------------
@@ -796,8 +1005,161 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
             else:
                 address_components.pop(AddressFormatter.HOUSE_NUMBER, None)
 
+    def expanded_address_components(self, value):
+        '''
+        Full address components for one OSM record (value is modified in place).
+        Returns (address_components, country, language), or three Nones when
+        the coordinates are unusable or no country/languages are found.
+        '''
+        try:
+            latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
+        except Exception:
+            return None, None, None
+
+        country, candidate_languages, language_props = country_and_languages(self.language_rtree, latitude, longitude)
+        if not (country and candidate_languages):
+            return None, None, None
+
+        for key in OSM_IGNORE_KEYS:
+            _ = value.pop(key, None)
+
+        language = None
+        more_than_one_official_language = len(candidate_languages) > 1
+        language = self.pick_language(value, candidate_languages)
+
+        address_components = self.normalize_address_components(value)
+
+        address_country, non_local_language = self.country_name(address_components, country, language)
+        if address_country:
+            address_components[AddressFormatter.COUNTRY] = address_country
+
+        address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, state_full_name_prob=1.0)
+        if address_state:
+            address_components[AddressFormatter.STATE] = address_state
+
+        osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
+
+        self.add_osm_boundaries(address_components, country, language, latitude, longitude,
+                                non_local_language=non_local_language,
+                                osm_suffix=osm_suffix)
+
+        city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
+        if city:
+            address_components[AddressFormatter.CITY] = city
+
+        self.add_neighborhoods(address_components, latitude, longitude,
+                               osm_suffix=osm_suffix)
+
+        street = address_components.get(AddressFormatter.ROAD)
+        if street:
+            address_components[AddressFormatter.ROAD] = self.abbreviated_street(street, language)
+
+        self.normalize_names(address_components)
+        self.replace_names(address_components)
+        self.prune_duplicate_names(address_components)
+        self.cleanup_house_number(address_components)
+
+        return address_components, country, language
+
+    def limited_address_components(self, value):
+        '''
+        Address components for language classification. Name, house number,
+        postal and ignored OSM keys are popped from value (in place) first.
+        Returns (address_components, country, language), or three Nones when
+        the coordinates are unusable or no country/languages are found.
+        '''
+        try:
+            latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
+        except Exception:
+            return None, None, None
+
+        country, candidate_languages, language_props = country_and_languages(self.language_rtree, latitude, longitude)
+        if not (country and candidate_languages):
+            return None, None, None
+
+        remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS
+        for key in remove_keys:
+            value.pop(key, None)
+
+        more_than_one_official_language = len(candidate_languages) > 1
+        language = self.pick_language(value, candidate_languages)
+
+        address_components = self.normalize_address_components(value)
+
+        address_country, non_local_language = self.country_name(address_components, country, language,
+                                                                use_country_code_prob=0.0,
+                                                                local_language_name_prob=1.0,
+                                                                random_language_name_prob=0.0,
+                                                                alpha_3_iso_code_prob=0.0)
+        if address_country:
+            address_components[AddressFormatter.COUNTRY] = address_country
+
+        # Pass country and language positionally, matching the state_name call in expanded_address_components
+        address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, state_full_name_prob=1.0)
+        if address_state:
+            address_components[AddressFormatter.STATE] = address_state
+
+        street = address_components.get(AddressFormatter.ROAD)
+        if street:
+            address_components[AddressFormatter.ROAD] = self.abbreviated_street(street, language)
+
+        osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
+
+        self.add_osm_boundaries(address_components, country, language, latitude, longitude,
+                                osm_suffix=osm_suffix,
+                                non_local_language=non_local_language,
+                                random_key=False,
+                                alpha_3_iso_code_prob=0.0,
+                                alpha_2_iso_code_prob=0.0,
+                                replace_with_non_local_prob=0.0,
+                                expand_state_prob=1.0)
+
+        city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
+        if city:
+            address_components[AddressFormatter.CITY] = city
+
+        self.add_neighborhoods(address_components, latitude, longitude,
+                               osm_suffix=osm_suffix)
+        self.normalize_names(address_components)
+        self.prune_duplicate_names(address_components)
+
+        return address_components, country, language
+
+    def formatted_addresses(self, value, dropout_prob=0.5, tag_components=True):
+        '''
+        Formatted addresses
+        -------------------
+
+        Produces one or more formatted addresses (tagged/untagged)
+        from the given dictionary of OSM tags and values.
+
+        Here we also apply component dropout meaning we produce several
+        different addresses with various components removed at random.
+        That way the parser will have many examples of queries that are
+        just city/state or just house_number/street. The selected
+        components still have to make sense i.e. a lone house_number will
+        not be used without a street name. The dependencies are listed
+        above, see: OSM_ADDRESS_COMPONENTS.
+
+        If there is more than one venue name (say name and alt_name), addresses
+        using both names and the selected components are returned, as a
+        (formatted_addresses, country, language) tuple (three Nones if empty).
+        '''
+
+        venue_names = self.venue_names(value) or []
+
+        address_components, country, language = self.expanded_address_components(value)
+
+        if not address_components:
+            return None, None, None
+
+        for venue_name in venue_names:
+            abbreviated_venue = self.abbreviated_venue_name(venue_name, language)
+            if abbreviated_venue != venue_name and abbreviated_venue not in set(venue_names):
+                venue_names.append(abbreviated_venue)
+
         # Version with all components
-        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)
+        formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)
 
         if tag_components:
             formatted_addresses = []
@@ -807,7 +1169,7 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
 
             address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES}
             if not address_components:
-                continue
+                return None, None, None
 
             current_components = address_components.keys()
             random.shuffle(current_components)
@@ -815,36 +1177,159 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
             component_set = component_bitset(address_components.keys())
 
             for component in current_components:
-                if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < 0.5:
+                if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < dropout_prob:
                     address_components.pop(component)
                     component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component]
                     if not address_components:
-                        break
+                        return formatted_addresses, country, language
 
                     # Since venue names are 1-per-record, we must use them all
                     for venue_name in (venue_names or [None]):
                         if venue_name and AddressFormatter.HOUSE in address_components:
                             address_components[AddressFormatter.HOUSE] = venue_name
-                        formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
-                        if formatted_address not in seen:
+                        formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
+                        if formatted_address and formatted_address not in seen:
                             formatted_addresses.append(formatted_address)
                             seen.add(formatted_address)
 
+            return formatted_addresses, country, language
+        else:
+            formatted_addresses = []
+            seen = set()
+            # Since venue names are 1-per-record, we must use them all
+            for venue_name in (venue_names or [None]):
+                if venue_name:
+                    address_components[AddressFormatter.HOUSE] = venue_name
+                formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False)
+                if formatted_address and formatted_address not in seen:
+                    formatted_addresses.append(formatted_address)
+                    seen.add(formatted_address)
+            return formatted_addresses, country, language
+
+    def formatted_address_limited(self, value, admin_dropout_prob=0.7):
+        '''
+        Format one untagged address for language classification, dropping
+        each admin component with probability admin_dropout_prob. Always
+        returns a (formatted_address, country, language) 3-tuple, which is
+        (None, None, None) when there is nothing left to format.
+        '''
+        address_components, country, language = self.limited_address_components(value)
+        if not address_components:
+            return None, None, None
+
+        address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES}
+        if not address_components:
+            # Callers unpack three values, so a bare [] would raise ValueError
+            return None, None, None
+
+        for component in (AddressFormatter.COUNTRY, AddressFormatter.STATE,
+                          AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY,
+                          AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB):
+            if random.random() < admin_dropout_prob:
+                address_components.pop(component, None)
+
+        if not address_components:
+            return None, None, None
+
+        formatted_address = self.formatter.format_address(country, address_components, tag_components=False, minimal_only=False)
+
+        return formatted_address, country, language
+
+    def build_training_data(self, infile, out_dir, tag_components=True):
+        '''
+        Creates formatted address training data for supervised sequence labeling (or potentially
+        for unsupervised learning e.g. for word vectors) using addr:* tags in OSM.
+
+        Example:
+
+        cs  cz  Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country
+
+        The field structure is similar to other training data created by this script i.e.
+        {language, country, data}. The data field here is a sequence of labeled tokens similar
+        to what we might see in part-of-speech tagging.
+
+
+        This format uses a special character "|" to denote possible breaks in the input (comma, newline).
+
+        Note that for the address parser, we'd like it to be robust to many different types
+        of input, so we may selectively eliminate components
+
+        This information can potentially be used downstream by the sequence model as these
+        breaks may be present at prediction time.
+
+        Example (tag_components=False, each row is the address text alone):
+
+        Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic
+
+        This may be useful in learning word representations, statistical phrases, morphology
+        or other models requiring only the sequence of words.
+        '''
+        i = 0
+
+        filename = ADDRESS_FORMAT_DATA_TAGGED_FILENAME if tag_components else ADDRESS_FORMAT_DATA_FILENAME
+        out_file = open(os.path.join(out_dir, filename), 'w')
+        writer = csv.writer(out_file, 'tsv_no_quote')
+
+        for node_id, value, deps in parse_osm(infile):
+            formatted_addresses, country, language = self.formatted_addresses(value, tag_components=tag_components)
+            if not formatted_addresses:
+                continue
+
             for formatted_address in formatted_addresses:
                 if formatted_address and formatted_address.strip():
                     formatted_address = tsv_string(formatted_address)
                     if not formatted_address or not formatted_address.strip():
                         continue
-                    row = (language, country, formatted_address)
+
+                    if tag_components:
+                        row = (language, country, formatted_address)
+                    else:
+                        # writerow() iterates its argument, so wrap the lone string
+                        row = (formatted_address,)
 
                     writer.writerow(row)
-        elif formatted_address and formatted_address.strip():
-            formatted_address = tsv_string(formatted_address)
-            writer.writerow([formatted_address])
 
-        i += 1
-        if i % 1000 == 0 and i > 0:
-            print 'did', i, 'formatted addresses'
+            i += 1
+            if i % 1000 == 0 and i > 0:
+                print('did {} formatted addresses'.format(i))
+
+        out_file.close()
+
+    def build_limited_training_data(self, infile, out_dir):
+        '''
+        Creates a special kind of formatted address training data from OSM's addr:* tags,
+        designed for use in language classification. These records are similar
+        to the untagged formatted records but include the language and country
+        (suitable for concatenation with the rest of the language training data),
+        and remove several fields like country which usually do not contain helpful
+        information for classifying the language.
+
+        Example:
+
+        nb      no      Olaf Ryes Plass Oslo
+        '''
+        i = 0
+
+        with open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w') as f:
+            writer = csv.writer(f, 'tsv_no_quote')
+
+            for node_id, value, deps in parse_osm(infile):
+                formatted_address, country, language = self.formatted_address_limited(value)
+                if not formatted_address:
+                    continue
+
+                if formatted_address.strip():
+                    formatted_address = tsv_string(formatted_address.strip())
+                    if not formatted_address or not formatted_address.strip():
+                        continue
+
+                    row = (language, country, formatted_address)
+                    writer.writerow(row)
+
+                i += 1
+                if i % 1000 == 0 and i > 0:
+                    print('did {} formatted addresses'.format(i))
+
 
 NAME_KEYS = (
     'name',
@@ -875,93 +1360,6 @@ POSTAL_KEYS = (
 )
 
 
-def build_address_format_training_data_limited(language_rtree, infile, out_dir):
-    '''
-    Creates a special kind of formatted address training data from OSM's addr:* tags
-    but are designed for use in language classification. These records are similar 
-    to the untagged formatted records but include the language and country
-    (suitable for concatenation with the rest of the language training data),
-    and remove several fields like country which usually do not contain helpful
-    information for classifying the language.
-
-    Example:
-
-    nb      no      Olaf Ryes Plass Oslo
-    '''
-    i = 0
-
-    # Simple whitespace splitter is all that's necessary
-    formatter = AddressFormatter(splitter=u' ')
-
-    f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w')
-    writer = csv.writer(f, 'tsv_no_quote')
-
-    remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + COUNTRY_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS
-
-    country_keys_set = set(COUNTRY_KEYS)
-
-    for key, value, deps in parse_osm(infile):
-        try:
-            latitude, longitude = latlon_to_decimal(value['lat'], value['lon'])
-        except Exception:
-            continue
-
-        have_country = False
-
-        for k in remove_keys:
-            if k in country_keys_set and k in value:
-                have_country = True
-            _ = value.pop(k, None)
-
-        if have_country and random.random() < 0.4:
-            have_country = False
-        elif not have_country and random.random() < 0.2:
-            have_country = True
-
-        if not value:
-            continue
-
-        country, name_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street')
-        if not name_language:
-            continue
-
-        if have_country:
-            value['addr:country'] = u''
-
-        single_language = len(name_language) == 1
-
-        for lang, val in name_language.iteritems():
-            if lang not in languages:
-                continue
-
-            if have_country:
-                localized = language_country_names.get(lang, {}).get(country.upper())
-
-                if localized:
-                    value['addr:country:{}'.format(lang)] = localized
-
-            address_dict = value.copy()
-            for k in address_dict.keys():
-                namespaced_val = u'{}:{}'.format(k, lang)
-                if namespaced_val in address_dict:
-                    address_dict[k] = address_dict[namespaced_val]
-                elif not single_language:
-                    address_dict.pop(k)
-
-            if not address_dict:
-                continue
-
-            formatted_address_untagged = formatter.format_address(country, address_dict, minimal_only=False, tag_components=False)
-            if formatted_address_untagged is not None:
-                formatted_address_untagged = tsv_string(formatted_address_untagged)
-
-                writer.writerow((lang, country, formatted_address_untagged))
-
-        i += 1
-        if i % 1000 == 0 and i > 0:
-            print 'did', i, 'formatted addresses'
-
-
 def build_toponym_training_data(language_rtree, infile, out_dir):
     '''
     Data set of toponyms by language and country which should assist
@@ -1054,7 +1452,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
                     continue
                 writer.writerow((k, country, tsv_string(s)))
             if i % 1000 == 0 and i > 0:
-                print 'did', i, 'toponyms'
+                print('did {} toponyms'.format(i))
             i += 1
 
     f.close()
@@ -1086,7 +1484,7 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
                 if k in languages:
                     writer.writerow((k, country, tsv_string(s)))
             if i % 1000 == 0 and i > 0:
-                print 'did', i, 'streets'
+                print('did {} streets'.format(i))
             i += 1
 
     f.close()
@@ -1124,7 +1522,7 @@ def build_venue_training_data(language_rtree, infile, out_dir):
                 if k in languages:
                     writer.writerow((k, country, safe_encode(venue_type), tsv_string(s)))
             if i % 1000 == 0 and i > 0:
-                print 'did', i, 'venues'
+                print('did {} venues'.format(i))
             i += 1
 
     f.close()
@@ -1192,6 +1590,7 @@ if __name__ == '__main__':
 
     init_country_names()
     init_languages()
+    init_disambiguation()
 
     language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir)
     osm_rtree = None
@@ -1221,7 +1620,7 @@ if __name__ == '__main__':
 
     if args.address_file and not args.format_only and not args.limited_addresses:
         build_address_training_data(language_rtree, args.address_file, args.out_dir)
-    elif args.address_file and not args.limited_addresses:
+    elif args.address_file:
         if osm_rtree is None:
             parser.error('--rtree-dir required for formatted addresses')
         elif neighborhoods_rtree is None:
@@ -1232,8 +1631,10 @@ if __name__ == '__main__':
             parser.error('--geonames-db required for formatted addresses')
 
     if args.address_file and args.format_only:
-        build_address_format_training_data(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, args.address_file, args.out_dir, tag_components=not args.untagged)
+        osm_formatter = OSMAddressFormatter(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
+        osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
     if args.address_file and args.limited_addresses:
-        build_address_format_training_data_limited(language_rtree, args.address_file, args.out_dir)
+        osm_formatter = OSMAddressFormatter(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
+        osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
     if args.venues_file:
         build_venue_training_data(language_rtree, args.venues_file, args.out_dir)