diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 223243be..a7fb5c02 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -8,7 +8,7 @@ streets, venues and toponyms. Note: the combined size of all the files created by this script exceeds 100GB so if training these models, it is wise to use a server-grade machine with -plenty of disk space. The following commands can be used in parallel to create +plenty of disk space. The following commands can be used in parallel to create all the training sets: Ways: @@ -49,8 +49,7 @@ from itertools import ifilter, chain, combinations this_dir = os.path.realpath(os.path.dirname(__file__)) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) -sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python'))) - +from geodata.address_expansions.abbreviations import abbreviate from geodata.address_expansions.gazetteers import * from geodata.coordinates.conversion import * from geodata.countries.country_names import * @@ -58,11 +57,12 @@ from geodata.geonames.db import GeoNamesDB from geodata.language_id.disambiguation import * from geodata.language_id.sample import sample_random_language from geodata.states.state_abbreviations import STATE_ABBREVIATIONS, STATE_EXPANSIONS -from geodata.language_id.polygon_lookup import country_and_languages from geodata.i18n.languages import * from geodata.address_formatting.formatter import AddressFormatter from geodata.names.normalization import replace_name_prefixes, replace_name_suffixes +from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder from geodata.osm.extract import * +from geodata.osm.formatter import OSMAddressFormatter from geodata.polygons.language_polys import * from geodata.polygons.reverse_geocode import * from geodata.i18n.unicode_paths import DATA_DIR @@ -70,8 +70,6 @@ from geodata.i18n.unicode_paths import DATA_DIR from geodata.csv_utils import * from geodata.file_utils import * -this_dir = os.path.realpath(os.path.dirname(__file__)) - # Input files PLANET_ADDRESSES_INPUT_FILE = 'planet-addresses.osm' PLANET_WAYS_INPUT_FILE = 'planet-ways.osm' @@ -87,143 +85,6 @@ ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv' TOPONYM_LANGUAGE_DATA_FILENAME = 'toponyms_by_language.tsv' -class AddressComponent(object): - ''' - Declare an address component and its dependencies e.g. - a house_numer cannot be used in the absence of a road name. 
- ''' - ANY = 'any' - - def __init__(self, name, dependencies=tuple(), method=ANY): - self.name = name - self.dependencies = dependencies - - def __hash__(self): - return hash(self.name) - - def __cmp__(self, other): - return cmp(self.name, other.name) - - -OSM_ADDRESS_COMPONENTS = OrderedDict.fromkeys([ - AddressComponent(AddressFormatter.HOUSE), - AddressComponent(AddressFormatter.ROAD, dependencies=(AddressFormatter.HOUSE, - AddressFormatter.HOUSE_NUMBER, - AddressFormatter.SUBURB, - AddressFormatter.CITY, - AddressFormatter.POSTCODE)), - AddressComponent(AddressFormatter.HOUSE_NUMBER, dependencies=(AddressFormatter.ROAD,)), - AddressComponent(AddressFormatter.SUBURB, dependencies=(AddressFormatter.CITY, AddressFormatter.STATE, - AddressFormatter.POSTCODE)), - AddressComponent(AddressFormatter.CITY_DISTRICT, dependencies=(AddressFormatter.CITY,)), - AddressComponent(AddressFormatter.CITY), - AddressComponent(AddressFormatter.STATE_DISTRICT, dependencies=(AddressFormatter.STATE, AddressFormatter.POSTCODE)), - AddressComponent(AddressFormatter.STATE, dependencies=(AddressFormatter.SUBURB, AddressFormatter.CITY, - AddressFormatter.POSTCODE, AddressFormatter.COUNTRY)), - AddressComponent(AddressFormatter.POSTCODE), - AddressComponent(AddressFormatter.COUNTRY), -]) - - -def num_deps(c): - return len(c.dependencies) - - -RANDOM_VALUE_REPLACEMENTS = { - # Key: address component - AddressFormatter.COUNTRY: { - # value: (replacement, probability) - 'GB': ('UK', 0.3), - 'United Kingdom': ('UK', 0.3), - } -} - - -OSM_ADDRESS_COMPONENTS_SORTED = sorted(OSM_ADDRESS_COMPONENTS, key=num_deps) - -OSM_ADDRESS_COMPONENT_COMBINATIONS = [] - -''' -The following statements create a bitset of address components -for quickly checking testing whether or not a candidate set of -address components can be considered a full geographic string -suitable for formatting (i.e. would be a valid geocoder query). -For instance, a house number by itself is not sufficient -to be considered a valid address for this purpose unless it -has a road name as well. Using bitsets we can easily answer -questions like "Is house/house_number/road/city valid?" 
-''' -OSM_ADDRESS_COMPONENT_VALUES = { - c.name: 1 << i - for i, c in enumerate(OSM_ADDRESS_COMPONENTS.keys()) -} - -OSM_ADDRESS_COMPONENTS_VALID = set() - - -def component_bitset(components): - return reduce(operator.or_, [OSM_ADDRESS_COMPONENT_VALUES[c] for c in components]) - - -for i in xrange(1, len(OSM_ADDRESS_COMPONENTS.keys())): - for perm in combinations(OSM_ADDRESS_COMPONENTS.keys(), i): - perm_set = set([p.name for p in perm]) - valid = all((not p.dependencies or any(d in perm_set for d in p.dependencies) for p in perm)) - if valid: - components = [c.name for c in perm] - OSM_ADDRESS_COMPONENT_COMBINATIONS.append(tuple(components)) - OSM_ADDRESS_COMPONENTS_VALID.add(component_bitset(components)) - - -class OSMField(object): - def __init__(self, name, c_constant, alternates=None): - self.name = name - self.c_constant = c_constant - self.alternates = alternates - -osm_fields = [ - # Field if alternate_names present, default field name if not, C header constant - OSMField('addr:housename', 'OSM_HOUSE_NAME'), - OSMField('addr:housenumber', 'OSM_HOUSE_NUMBER'), - OSMField('addr:block', 'OSM_BLOCK'), - OSMField('addr:street', 'OSM_STREET_ADDRESS'), - OSMField('addr:place', 'OSM_PLACE'), - OSMField('addr:city', 'OSM_CITY', alternates=['addr:locality', 'addr:municipality', 'addr:hamlet']), - OSMField('addr:suburb', 'OSM_SUBURB'), - OSMField('addr:neighborhood', 'OSM_NEIGHBORHOOD', alternates=['addr:neighbourhood']), - OSMField('addr:district', 'OSM_DISTRICT'), - OSMField('addr:subdistrict', 'OSM_SUBDISTRICT'), - OSMField('addr:ward', 'OSM_WARD'), - OSMField('addr:state', 'OSM_STATE'), - OSMField('addr:province', 'OSM_PROVINCE'), - OSMField('addr:postcode', 'OSM_POSTAL_CODE', alternates=['addr:postal_code']), - OSMField('addr:country', 'OSM_COUNTRY'), -] - - -BOUNDARY_COMPONENTS = ( - AddressFormatter.SUBURB, - AddressFormatter.CITY_DISTRICT, - AddressFormatter.CITY, - AddressFormatter.STATE_DISTRICT, - AddressFormatter.STATE -) - - -def write_osm_json(filename, out_filename): - out = open(out_filename, 'w') - writer = csv.writer(out, 'tsv_no_quote') - for key, attrs, deps in parse_osm(filename): - writer.writerow((key, json.dumps(attrs))) - out.close() - - -def read_osm_json(filename): - reader = csv.reader(open(filename), delimiter='\t') - for key, attrs in reader: - yield key, json.loads(attrs) - - def normalize_osm_name_tag(tag, script=False): norm = tag.rsplit(':', 1)[-1] if not script: @@ -244,7 +105,7 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): except Exception: return None, None - country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude) + country, candidate_languages, language_props = language_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages): return None, None @@ -319,165 +180,8 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): ALL_LANGUAGES = 'all' -LOWER, UPPER, TITLE, MIXED = range(4) - -def token_capitalization(s): - if s.istitle(): - return TITLE - elif s.islower(): - return LOWER - elif s.isupper(): - return UPPER - else: - return MIXED - - -def recase_abbreviation(expansion, tokens): - expansion_tokens = expansion.split() - if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)): - return expansion.upper() - elif len(tokens) == len(expansion_tokens): - strings = [] - for (t, c), e in zip(tokens, expansion_tokens): - cap = token_capitalization(t) - if cap == LOWER: - 
strings.append(e.lower()) - elif cap == UPPER: - strings.append(e.upper()) - elif cap == TITLE: - strings.append(e.title()) - elif t.lower() == e.lower(): - strings.append(t) - else: - strings.append(e.title()) - return u' '.join(strings) - else: - return u' '.join([t.title() for t in expansion_tokens]) - - -def osm_abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2): - ''' - Abbreviations - ------------- - - OSM discourages abbreviations, but to make our training data map better - to real-world input, we can safely replace the canonical phrase with an - abbreviated version and retain the meaning of the words - ''' - raw_tokens = tokenize_raw(s) - s_utf8 = safe_encode(s) - tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens] - norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens] - - n = len(tokens) - - abbreviated = [] - - i = 0 - - for t, c, length, data in gazetteer.filter(norm_tokens): - if c is PHRASE: - valid = [] - data = [d.split('|') for d in data] - - added = False - - if random.random() > abbreviate_prob: - for j, (t_i, c_i) in enumerate(t): - abbreviated.append(tokens[i + j][0]) - if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]): - abbreviated.append(u' ') - i += len(t) - continue - - for lang, dictionary, is_canonical, canonical in data: - if lang not in (language, 'all'): - continue - - is_canonical = int(is_canonical) - is_stopword = dictionary == 'stopword' - is_prefix = dictionary.startswith('concatenated_prefixes') - is_suffix = dictionary.startswith('concatenated_suffixes') - is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length - - suffix = None - prefix = None - - if not is_canonical: - continue - - if not is_prefix and not is_suffix: - abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary)) - token = random.choice(abbreviations) if abbreviations else canonical - token = recase_abbreviation(token, tokens[i:i + len(t)]) - abbreviated.append(token) - if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]): - abbreviated.append(u' ') - break - elif is_prefix: - token = tokens[i][0] - prefix, token = token[:length], token[length:] - abbreviated.append(prefix) - if random.random() < separate_prob: - abbreviated.append(u' ') - if token.islower(): - abbreviated.append(token.title()) - else: - abbreviated.append(token) - abbreviated.append(u' ') - break - elif is_suffix: - token = tokens[i][0] - - token, suffix = token[:-length], token[-length:] - - concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), []) - - separated_abbreviations = [] - phrase = gazetteer.trie.get(suffix.rstrip('.')) - suffix_data = [safe_decode(d).split(u'|') for d in (phrase or [])] - for l, d, _, c in suffix_data: - if l == lang and c == canonical: - separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d))) - - separate = random.random() < separate_prob - - if concatenated_abbreviations and not separate: - abbreviation = random.choice(concatenated_abbreviations) - elif separated_abbreviations: - abbreviation = random.choice(separated_abbreviations) - else: - abbreviation = canonical - - abbreviated.append(token) - if separate: - abbreviated.append(u' ') - if suffix.isupper(): - abbreviated.append(abbreviation.upper()) - elif separate: - abbreviated.append(abbreviation.title()) - else: - abbreviated.append(abbreviation) - 
abbreviated.append(u' ') - break - else: - for j, (t_i, c_i) in enumerate(t): - abbreviated.append(tokens[i + j][0]) - if i + j < n - 1 and raw_tokens[i + j + 1][0] > sum(raw_tokens[i + j][:2]): - abbreviated.append(u' ') - i += len(t) - - else: - abbreviated.append(tokens[i][0]) - if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]): - abbreviated.append(u' ') - i += 1 - - return u''.join(abbreviated).strip() - - -def build_ways_training_data(language_rtree, infile, out_dir): +def build_ways_training_data(language_rtree, infile, out_dir, abbreviate_streets=True): ''' Creates a training set for language classification using most OSM ways (streets) under a fairly lengthy osmfilter definition which attempts to @@ -502,7 +206,9 @@ def build_ways_training_data(language_rtree, infile, out_dir): for s in v.split(';'): if lang in languages: writer.writerow((lang, country, tsv_string(s))) - abbrev = osm_abbreviate(street_types_gazetteer, s, lang) + if not abbreviate_streets: + continue + abbrev = abbreviate(street_and_synonyms_gazetteer, s, lang) if abbrev != s: writer.writerow((lang, country, tsv_string(abbrev))) if i % 1000 == 0 and i > 0: @@ -510,871 +216,6 @@ def build_ways_training_data(language_rtree, infile, out_dir): i += 1 f.close() -OSM_IGNORE_KEYS = ( - 'house', -) - - -def strip_keys(value, ignore_keys): - for key in ignore_keys: - value.pop(key, None) - - -def osm_reverse_geocoded_components(admin_rtree, country, latitude, longitude): - ret = defaultdict(list) - for props in admin_rtree.point_in_poly(latitude, longitude, return_all=True): - name = props.get('name') - if not name: - continue - - for k, v in props.iteritems(): - normalized_key = osm_address_components.get_component(country, k, v) - if normalized_key: - ret[normalized_key].append(props) - return ret - - -class OSMAddressFormatter(object): - alpha3_codes = {c.alpha2: c.alpha3 for c in pycountry.countries} - - rare_components = { - AddressFormatter.SUBURB, - AddressFormatter.CITY_DISTRICT, - AddressFormatter.STATE_DISTRICT, - AddressFormatter.STATE, - } - - state_important = { - 'US', - 'CA', - } - - def __init__(self, admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames, splitter=None): - self.admin_rtree = admin_rtree - self.language_rtree = language_rtree - self.neighborhoods_rtree = neighborhoods_rtree - self.quattroshapes_rtree = quattroshapes_rtree - self.geonames = geonames - self.formatter = AddressFormatter(splitter=splitter) - osm_address_components.configure() - - def pick_language(self, value, candidate_languages, pick_namespaced_language_prob=0.6): - language = None - - if len(candidate_languages) == 1: - language = candidate_languages[0]['lang'] - else: - street = value.get('addr:street', None) - - namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in value] - - if street is not None and not namespaced: - language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages]) - elif namespaced and random.random() < pick_namespaced_language_prob: - language = random.choice(namespaced) - lang_suffix = ':{}'.format(language) - for k in value: - if k.startswith('addr:') and k.endswith(lang_suffix): - value[k.rstrip(lang_suffix)] = value[k] - else: - language = UNKNOWN_LANGUAGE - - return language - - def pick_random_name_key(self, suffix=''): - name_key = ''.join(('name', suffix)) - raw_name_key = 'name' - short_name_key = ''.join(('short_name', suffix)) - raw_short_name_key = 'short_name' - alt_name_key = 
''.join(('alt_name', suffix)) - raw_alt_name_key = 'alt_name' - official_name_key = ''.join(('official_name', suffix)) - raw_official_name_key = 'official_name' - - # Choose which name to use with given probabilities - r = random.random() - if r < 0.7: - # 70% of the time use the name tag - key = name_key - raw_key = raw_name_key - elif r < 0.8: - # 10% of the time use the short name - key = short_name_key - raw_key = raw_short_name_key - elif r < 0.9: - # 10% of the time use the official name - key = official_name_key - raw_key = raw_official_name_key - else: - # 10% of the time use the official name - key = alt_name_key - raw_key = raw_alt_name_key - - return key, raw_key - - def normalize_address_components(self, value): - address_components = {k: v for k, v in value.iteritems() if k in self.formatter.aliases} - self.formatter.replace_aliases(address_components) - return address_components - - def abbreviated_street(self, street, language, abbreviate_prob=0.3, separate_prob=0.2): - ''' - Street abbreviations - -------------------- - - Use street and unit type dictionaries to probabilistically abbreviate - phrases. Because the abbreviation is picked at random, this should - help bridge the gap between OSM addresses and user input, in addition - to capturing some non-standard abbreviations/surface forms which may be - missing or sparse in OSM. - ''' - return osm_abbreviate(street_and_unit_types_gazetteer, street, language, - abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) - - def abbreviated_venue_name(self, name, language, abbreviate_prob=0.2, separate_prob=0.0): - ''' - Venue abbreviations - ------------------- - - Use street and unit type dictionaries to probabilistically abbreviate - phrases. Because the abbreviation is picked at random, this should - help bridge the gap between OSM addresses and user input, in addition - to capturing some non-standard abbreviations/surface forms which may be - missing or sparse in OSM. - ''' - return osm_abbreviate(names_gazetteer, name, language, - abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) - - def country_name(self, address_components, country_code, language, - use_country_code_prob=0.3, - local_language_name_prob=0.6, - random_language_name_prob=0.1, - alpha_3_iso_code_prob=0.1, - ): - ''' - Country names - ------------- - - In OSM, addr:country is almost always an ISO-3166 alpha-2 country code. - However, we'd like to expand these to include natural language forms - of the country names we might be likely to encounter in a geocoder or - handwritten address. - - These splits are somewhat arbitrary but could potentially be fit to data - from OpenVenues or other sources on the usage of country name forms. - - If the address includes a country, the selection procedure proceeds as follows: - - 1. With probability a, select the country name in the language of the address - (determined above), or with the localized country name if the language is - undtermined or ambiguous. - - 2. With probability b(1-a), sample a language from the distribution of - languages on the Internet and use the country's name in that language. - - 3. This is implicit, but with probability (1-b)(1-a), keep the country code - ''' - - non_local_language = None - - address_country = address_components.get(AddressFormatter.COUNTRY) - - if random.random() < use_country_code_prob: - # 30% of the time: add Quattroshapes country - address_country = country_code.upper() - - r = random.random() - - # 1. 
60% of the time: use the country name in the current language or the country's local language - if address_country and r < local_language_name_prob: - localized = None - if language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): - localized = language_country_names.get(language, {}).get(address_country.upper()) - - if not localized: - localized = country_localized_display_name(address_country.lower()) - - if localized: - address_country = localized - # 2. 10% of the time: country's name in a language samples from the distribution of languages on the Internet - elif address_country and r < local_language_name_prob + random_language_name_prob: - non_local_language = sample_random_language() - lang_country = language_country_names.get(non_local_language, {}).get(address_country.upper()) - if lang_country: - address_country = lang_country - # 3. 10% of the time: use the country's alpha-3 ISO code - elif address_country and r < local_language_name_prob + random_language_name_prob + alpha_3_iso_code_prob: - iso_code_alpha3 = self.alpha3_codes.get(address_country) - if iso_code_alpha3: - address_country = iso_code_alpha3 - # 4. Implicit: the rest of the time keep the alpha-2 country code - - return address_country, non_local_language - - def venue_names(self, value): - ''' - Venue names - ----------- - - Some venues have multiple names listed in OSM, grab them all - With a certain probability, add None to the list so we drop the name - ''' - - venue_names = [] - for key in ('name', 'alt_name', 'loc_name', 'int_name', 'old_name'): - venue_name = value.get(key) - if venue_name: - venue_names.append(venue_name) - return venue_names - - def state_name(self, address_components, country, language, non_local_language=None, state_full_name_prob=0.4): - ''' - States - ------ - - Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name - whereas we'd like to include both forms, so wtih some probability, replace the abbreviated - name with the unabbreviated one e.g. CA => California - ''' - address_state = address_components.get(AddressFormatter.STATE) - - if address_state and country and not non_local_language: - state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language) - - if state_full_name and random.random() < state_full_name_prob: - address_state = state_full_name - elif address_state and non_local_language: - _ = address_components.pop(AddressFormatter.STATE, None) - address_state = None - return address_state - - def tag_suffix(self, language, non_local_language, more_than_one_official_language=False): - if non_local_language is not None: - osm_suffix = ':{}'.format(non_local_language) - elif more_than_one_official_language and language not in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE): - osm_suffix = ':{}'.format(language) - else: - osm_suffix = '' - return osm_suffix - - def add_osm_boundaries(self, address_components, - country, language, - latitude, longitude, - osm_suffix='', - non_local_language=None, - random_key=True, - alpha_3_iso_code_prob=0.1, - alpha_2_iso_code_prob=0.2, - simple_country_key_prob=0.4, - replace_with_non_local_prob=0.4, - join_state_district_prob=0.5, - expand_state_prob=0.7 - ): - ''' - OSM boundaries - -------------- - - For many addresses, the city, district, region, etc. are all implicitly - generated by the reverse geocoder e.g. 
we do not need an addr:city tag - to identify that 40.74, -74.00 is in New York City as well as its parent - geographies (New York county, New York state, etc.) - - Where possible we augment the addr:* tags with some of the reverse-geocoded - relations from OSM. - - Since addresses found on the web may have the same properties, we - include these qualifiers in the training data. - ''' - - osm_components = osm_reverse_geocoded_components(self.admin_rtree, country, latitude, longitude) - - name_key = ''.join(('name', osm_suffix)) - raw_name_key = 'name' - simple_name_key = 'name:simple' - international_name_key = 'int_name' - - iso_code_key = 'ISO3166-1:alpha2' - iso_code3_key = 'ISO3166-1:alpha3' - - if osm_components: - poly_components = defaultdict(list) - - existing_city_name = address_components.get(AddressFormatter.CITY) - - for component, components_values in osm_components.iteritems(): - seen = set() - - if random_key: - key, raw_key = self.pick_random_name_key(suffix=osm_suffix) - else: - key, raw_key = name_key, raw_name_key - - for component_value in components_values: - r = random.random() - name = None - - if component == AddressFormatter.COUNTRY: - if iso_code3_key in component_value and r < alpha_3_iso_code_prob: - name = component_value[iso_code3_key] - elif iso_code_key in component_value and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob: - name = component_value[iso_code_key] - elif language == 'en' and not non_local_language and r < alpha_3_iso_code_prob + alpha_2_iso_code_prob + simple_country_key_prob: - # Particularly to address the US (prefer United States, - # not United States of America) but may capture variations - # in other English-speaking countries as well. - if simple_name_key in component_value: - name = component_value[simple_name_key] - elif international_name_key in component_value: - name = component_value[international_name_key] - - if not name: - name = component_value.get(key, component_value.get(raw_key)) - - if not name or (component != AddressFormatter.CITY and name == existing_city_name): - name = component_value.get(name_key, component_value.get(raw_name_key)) - - if not name or (component != AddressFormatter.CITY and name == existing_city_name): - continue - - if (component, name) not in seen: - poly_components[component].append(name) - seen.add((component, name)) - - for component, vals in poly_components.iteritems(): - if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob): - if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob: - num = random.randrange(1, len(vals) + 1) - val = u', '.join(vals[:num]) - else: - val = random.choice(vals) - - if component == AddressFormatter.STATE and random.random() < expand_state_prob: - val = STATE_EXPANSIONS.get(country.upper(), {}).get(val, val) - address_components[component] = val - - def quattroshapes_city(self, address_components, - latitude, longitude, - language, non_local_language=None, - qs_add_city_prob=0.2, - abbreviated_name_prob=0.1): - ''' - Quattroshapes/GeoNames cities - ----------------------------- - - Quattroshapes isn't great for everything, but it has decent city boundaries - in places where OSM sometimes does not (or at least in places where we aren't - currently able to create valid polygons). 
While Quattroshapes itself doesn't - reliably use local names, which we'll want for consistency - ''' - - city = None - - if non_local_language or (AddressFormatter.CITY not in address_components and random.random() < qs_add_city_prob): - lang = non_local_language or language - quattroshapes_cities = self.quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True) - for result in quattroshapes_cities: - if result.get(self.quattroshapes_rtree.LEVEL) == self.quattroshapes_rtree.LOCALITY and self.quattroshapes_rtree.GEONAMES_ID in result: - geonames_id = int(result[self.quattroshapes_rtree.GEONAMES_ID].split(',')[0]) - names = self.geonames.get_alternate_names(geonames_id) - - if not names or lang not in names: - continue - - city = None - if 'abbr' not in names or non_local_language: - # Use the common city name in the target language - city = names[lang][0][0] - elif random.random() < abbreviated_name_prob: - # Use an abbreviation: NYC, BK, SF, etc. - city = random.choice(names['abbr'])[0] - - if not city or not city.strip(): - continue - return city - break - else: - if non_local_language and AddressFormatter.CITY in address_components and ( - AddressFormatter.CITY_DISTRICT in address_components or - AddressFormatter.SUBURB in address_components): - address_components.pop(AddressFormatter.CITY) - - return city - - def add_neighborhoods(self, address_components, - latitude, longitude, - osm_suffix='', - add_prefix_prob=0.5, - add_neighborhood_prob=0.5): - ''' - Neighborhoods - ------------- - - In some cities, neighborhoods may be included in a free-text address. - - OSM includes many neighborhoods but only as points, rather than the polygons - needed to perform reverse-geocoding. We use a hybrid index containing - Quattroshapes/Zetashapes polygons matched fuzzily with OSM names (which are - on the whole of better quality). - ''' - - neighborhoods = self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True) - neighborhood_levels = defaultdict(list) - - name_key = ''.join(('name', osm_suffix)) - raw_name_key = 'name' - - for neighborhood in neighborhoods: - place_type = neighborhood.get('place') - polygon_type = neighborhood.get('polygon_type') - - key, raw_key = self.pick_random_name_key(suffix=osm_suffix) - name = neighborhood.get(key, neighborhood.get(raw_key)) - - if not name: - name = neighborhood.get(name_key, neighborhood.get(raw_name_key)) - - name_prefix = neighborhood.get('name:prefix') - - if name_prefix and random.random() < add_prefix_prob: - name = u' '.join([name_prefix, name]) - - if not name: - continue - - neighborhood_level = AddressFormatter.SUBURB - - if place_type == 'borough' or polygon_type == 'local_admin': - neighborhood_level = AddressFormatter.CITY_DISTRICT - - # Optimization so we don't use e.g. Brooklyn multiple times - city_name = address_components.get(AddressFormatter.CITY) - if name == city_name: - name = neighborhood.get(name_key, neighborhood.get(raw_name_key)) - if not name or name == city_name: - continue - - neighborhood_levels[neighborhood_level].append(name) - - for component, neighborhoods in neighborhood_levels.iteritems(): - if component not in address_components and random.random() < add_neighborhood_prob: - address_components[component] = neighborhoods[0] - - def normalize_names(self, address_components, replacement_prob=0.6): - ''' - Name normalization - ------------------ - - Probabilistically strip standard prefixes/suffixes e.g. 
"London Borough of" - ''' - for component in BOUNDARY_COMPONENTS: - name = address_components.get(component) - if not name: - continue - replacement = replace_name_prefixes(replace_name_suffixes(name)) - if replacement != name and random.random() < replacement_prob: - address_components[component] = replacement - - def replace_names(self, address_components): - ''' - Name replacements - ----------------- - - Make a few special replacements (like UK instead of GB) - ''' - for component, value in address_components.iteritems(): - replacement, prob = RANDOM_VALUE_REPLACEMENTS.get(component, {}).get(value, (None, 0.0)) - if replacement is not None and random.random() < prob: - address_components[component] = replacement - - def prune_duplicate_names(self, address_components): - ''' - Name deduping - ------------- - - For some cases like "Antwerpen, Antwerpen, Antwerpen" - that are very unlikely to occur in real life. - ''' - - name_components = defaultdict(list) - - for component in (AddressFormatter.CITY, AddressFormatter.STATE_DISTRICT, - AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB): - name = address_components.get(component) - if name: - name_components[name].append(component) - - for name, components in name_components.iteritems(): - if len(components) > 1: - for component in components[1:]: - address_components.pop(component, None) - - def cleanup_house_number(self, address_components): - ''' - House number cleanup - -------------------- - - For some OSM nodes, particularly in Uruguay, we get house numbers - that are actually a comma-separated list. - - If there's one comma in the house number, allow it as it might - be legitimate, but if there are 2 or more, just take the first one. - ''' - - house_number = address_components.get(AddressFormatter.HOUSE_NUMBER) - if ';' in house_number: - house_number = house_number.replace(';', ',') - address_components[AddressFormatter.HOUSE_NUMBER] = house_number - if house_number and house_number.count(',') >= 2: - house_numbers = house_number.split(',') - random.shuffle(house_numbers) - for num in house_numbers: - num = num.strip() - if num: - address_components[AddressFormatter.HOUSE_NUMBER] = num - break - else: - address_components.pop(AddressFormatter.HOUSE_NUMBER, None) - - def expanded_address_components(self, value): - try: - latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) - except Exception: - return None, None, None - - country, candidate_languages, language_props = country_and_languages(self.language_rtree, latitude, longitude) - if not (country and candidate_languages): - return None, None, None - - for key in OSM_IGNORE_KEYS: - _ = value.pop(key, None) - - language = None - - more_than_one_official_language = len(candidate_languages) > 1 - - language = self.pick_language(value, candidate_languages) - - address_components = self.normalize_address_components(value) - - address_country, non_local_language = self.country_name(address_components, country, language) - if address_country: - address_components[AddressFormatter.COUNTRY] = address_country - - address_state = self.state_name(address_components, country, language, non_local_language=non_local_language) - if address_state: - address_components[AddressFormatter.STATE] = address_state - - osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language) - - self.add_osm_boundaries(address_components, country, language, latitude, longitude, - non_local_language=non_local_language, - osm_suffix=osm_suffix) - - city = 
self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language) - if city: - address_components[AddressFormatter.CITY] = city - - self.add_neighborhoods(address_components, latitude, longitude, - osm_suffix=osm_suffix) - - street = address_components.get(AddressFormatter.ROAD) - if street: - address_components[AddressFormatter.ROAD] = self.abbreviated_street(street, language) - - self.normalize_names(address_components) - - self.replace_names(address_components) - - self.prune_duplicate_names(address_components) - - self.cleanup_house_number(address_components) - - return address_components, country, language - - def limited_address_components(self, value): - try: - latitude, longitude = latlon_to_decimal(value['lat'], value['lon']) - except Exception: - return None, None, None - - country, candidate_languages, language_props = country_and_languages(self.language_rtree, latitude, longitude) - if not (country and candidate_languages): - return None, None, None - - remove_keys = NAME_KEYS + HOUSE_NUMBER_KEYS + POSTAL_KEYS + OSM_IGNORE_KEYS - - for key in remove_keys: - _ = value.pop(key, None) - - language = None - - more_than_one_official_language = len(candidate_languages) > 1 - - language = self.pick_language(value, candidate_languages) - - address_components = self.normalize_address_components(value) - - address_country, non_local_language = self.country_name(address_components, country, language, - use_country_code_prob=0.0, - local_language_name_prob=1.0, - random_language_name_prob=0.0, - alpha_3_iso_code_prob=0.0) - if address_country: - address_components[AddressFormatter.COUNTRY] = address_country - - address_state = self.state_name(address_components, country, language, non_local_language=non_local_language, state_full_name_prob=1.0) - if address_state: - address_components[AddressFormatter.STATE] = address_state - - street = address_components.get(AddressFormatter.ROAD) - if street: - address_components[AddressFormatter.ROAD] = self.abbreviated_street(street, language) - - osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language) - - self.add_osm_boundaries(address_components, country, language, latitude, longitude, - osm_suffix=osm_suffix, - non_local_language=non_local_language, - random_key=False, - alpha_3_iso_code_prob=0.0, - alpha_2_iso_code_prob=0.0, - replace_with_non_local_prob=0.0, - expand_state_prob=1.0) - - city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language) - - if city: - address_components[AddressFormatter.CITY] = city - - self.add_neighborhoods(address_components, latitude, longitude, - osm_suffix=osm_suffix) - - self.normalize_names(address_components) - - self.prune_duplicate_names(address_components) - - return address_components, country, language - - def formatted_addresses(self, value, dropout_prob=0.5, rare_component_dropout_prob=0.6, tag_components=True): - ''' - Formatted addresses - ------------------- - - Produces one or more formatted addresses (tagged/untagged) - from the given dictionary of OSM tags and values. - - Here we also apply component dropout meaning we produce several - different addresses with various components removed at random. - That way the parser will have many examples of queries that are - just city/state or just house_number/street. The selected - components still have to make sense i.e. a lone house_number will - not be used without a street name. 
The dependencies are listed - above, see: OSM_ADDRESS_COMPONENTS. - - If there is more than one venue name (say name and alt_name), - addresses using both names and the selected components are - returned. - ''' - - venue_names = self.venue_names(value) or [] - - address_components, country, language = self.expanded_address_components(value) - - if not address_components: - return None, None, None - - for venue_name in venue_names: - abbreviated_venue = self.abbreviated_venue_name(venue_name, language) - if abbreviated_venue != venue_name and abbreviated_venue not in set(venue_names): - venue_names.append(abbreviated_venue) - - # Version with all components - formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components) - - if tag_components: - formatted_addresses = [] - formatted_addresses.append(formatted_address) - - seen = set([formatted_address]) - - address_components = {k: v for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES} - if not address_components: - return [] - - current_components = [] - current_components_rare = [] - - state_important = country.upper() in self.state_important - - current_components = [k for k in address_components.keys() if k not in self.rare_components] - current_components_rare = [k for k in address_components.keys() if k in self.rare_components] - random.shuffle(current_components) - random.shuffle(current_components_rare) - - current_components = current_components_rare + current_components - component_set = component_bitset(address_components.keys()) - - for component in current_components: - prob = rare_component_dropout_prob if component in self.rare_components else dropout_prob - - if component not in self.rare_components or (component == AddressFormatter.STATE and state_important): - prob = dropout_prob - else: - prob = rare_component_dropout_prob - - if component_set ^ OSM_ADDRESS_COMPONENT_VALUES[component] in OSM_ADDRESS_COMPONENTS_VALID and random.random() < prob: - address_components.pop(component) - component_set ^= OSM_ADDRESS_COMPONENT_VALUES[component] - if not address_components: - return [] - - # Since venue names are 1-per-record, we must use them all - for venue_name in (venue_names or [None]): - if venue_name and AddressFormatter.HOUSE in address_components: - address_components[AddressFormatter.HOUSE] = venue_name - formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False) - if formatted_address and formatted_address not in seen: - formatted_addresses.append(formatted_address) - seen.add(formatted_address) - - return formatted_addresses, country, language - else: - formatted_addresses = [] - seen = set() - # Since venue names are 1-per-record, we must use them all - for venue_name in (venue_names or [None]): - if venue_name: - address_components[AddressFormatter.HOUSE] = venue_name - formatted_address = self.formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=False) - if formatted_address and formatted_address not in seen: - formatted_addresses.append(formatted_address) - seen.add(formatted_address) - return formatted_addresses, country, language - - def formatted_address_limited(self, value, admin_dropout_prob=0.7): - address_components, country, language = self.limited_address_components(value) - - if not address_components: - return None, None, None - - formatted_addresses = [] - - address_components = {k: v 
for k, v in address_components.iteritems() if k in OSM_ADDRESS_COMPONENT_VALUES} - if not address_components: - return [] - - current_components = address_components.keys() - random.shuffle(current_components) - - for component in (AddressFormatter.COUNTRY, AddressFormatter.STATE, - AddressFormatter.STATE_DISTRICT, AddressFormatter.CITY, - AddressFormatter.CITY_DISTRICT, AddressFormatter.SUBURB): - if random.random() < admin_dropout_prob: - _ = address_components.pop(component, None) - - if not address_components: - return None, None, None - - # Version with all components - formatted_address = self.formatter.format_address(country, address_components, tag_components=False, minimal_only=False) - - return formatted_address, country, language - - def build_training_data(self, infile, out_dir, tag_components=True): - ''' - Creates formatted address training data for supervised sequence labeling (or potentially - for unsupervised learning e.g. for word vectors) using addr:* tags in OSM. - - Example: - - cs cz Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country - - The field structure is similar to other training data created by this script i.e. - {language, country, data}. The data field here is a sequence of labeled tokens similar - to what we might see in part-of-speech tagging. - - - This format uses a special character "|" to denote possible breaks in the input (comma, newline). - - Note that for the address parser, we'd like it to be robust to many different types - of input, so we may selectively eleminate components - - This information can potentially be used downstream by the sequence model as these - breaks may be present at prediction time. - - Example: - - sr rs Crkva Svetog Arhangela Mihaila | Vukov put BB | 15303 Trsic - - This may be useful in learning word representations, statistical phrases, morphology - or other models requiring only the sequence of words. - ''' - i = 0 - - if tag_components: - formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w') - writer = csv.writer(formatted_tagged_file, 'tsv_no_quote') - else: - formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w') - writer = csv.writer(formatted_file, 'tsv_no_quote') - - for node_id, value, deps in parse_osm(infile): - formatted_addresses, country, language = self.formatted_addresses(value, tag_components=tag_components) - if not formatted_addresses: - continue - - for formatted_address in formatted_addresses: - if formatted_address and formatted_address.strip(): - formatted_address = tsv_string(formatted_address) - if not formatted_address or not formatted_address.strip(): - continue - - if tag_components: - row = (language, country, formatted_address) - else: - row = formatted_address - - writer.writerow(row) - - i += 1 - if i % 1000 == 0 and i > 0: - print('did {} formatted addresses'.format(i)) - - def build_limited_training_data(self, infile, out_dir): - ''' - Creates a special kind of formatted address training data from OSM's addr:* tags - but are designed for use in language classification. These records are similar - to the untagged formatted records but include the language and country - (suitable for concatenation with the rest of the language training data), - and remove several fields like country which usually do not contain helpful - information for classifying the language. 
- - Example: - - nb no Olaf Ryes Plass Oslo - ''' - i = 0 - - f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w') - writer = csv.writer(f, 'tsv_no_quote') - - for node_id, value, deps in parse_osm(infile): - formatted_address, country, language = self.formatted_address_limited(value) - if not formatted_address: - continue - - if formatted_address.strip(): - formatted_address = tsv_string(formatted_address.strip()) - if not formatted_address or not formatted_address.strip(): - continue - - row = (language, country, formatted_address) - writer.writerow(row) - - i += 1 - if i % 1000 == 0 and i > 0: - print('did {} formatted addresses'.format(i)) - NAME_KEYS = ( 'name', @@ -1431,7 +272,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir): except Exception: continue - country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude) + country, candidate_languages, language_props = language_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages): continue @@ -1579,6 +420,11 @@ if __name__ == '__main__': parser.add_argument('-s', '--streets-file', help='Path to planet-ways.osm') + parser.add_argument('--unabbreviated', + action='store_true', + default=False, + help='Use unabbreviated street names for token counts') + parser.add_argument('-a', '--address-file', help='Path to planet-addresses.osm') @@ -1588,7 +434,7 @@ if __name__ == '__main__': parser.add_argument('-b', '--borders-file', help='Path to planet-borders.osm') - parser.add_argument('-f', '--format-only', + parser.add_argument('-f', '--format', action='store_true', default=False, help='Save formatted addresses (slow)') @@ -1658,7 +504,7 @@ if __name__ == '__main__': # Can parallelize if args.streets_file: - build_ways_training_data(language_rtree, args.streets_file, args.out_dir) + build_ways_training_data(language_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated) if args.borders_file: build_toponym_training_data(language_rtree, args.borders_file, args.out_dir)
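
Reviewer note (appended, not part of the patch): with osm_abbreviate/recase_abbreviation deleted, street abbreviation now goes through the shared abbreviate() helper, and build_ways_training_data gains an abbreviate_streets switch wired to the new --unabbreviated flag. The sketch below is a minimal illustration of that call path, assuming street_and_synonyms_gazetteer is exported by the gazetteers module (as the wildcard import above suggests) and using a made-up street string:

    # Sketch only: mirrors the call added in build_ways_training_data above.
    from geodata.address_expansions.abbreviations import abbreviate
    from geodata.address_expansions.gazetteers import street_and_synonyms_gazetteer

    street = u'Malcolm X Boulevard'   # illustrative input
    language = 'en'

    # abbreviate() probabilistically replaces canonical phrases with dictionary
    # abbreviations (e.g. Boulevard -> Blvd); when nothing is rewritten the
    # result equals the input, which the training-data writer skips.
    abbrev = abbreviate(street_and_synonyms_gazetteer, street, language)
    if abbrev != street:
        print(abbrev)

At the command line the same toggle is exposed by this diff as --unabbreviated (and the formatted-address switch is now -f/--format rather than --format-only), e.g. python osm_address_training_data.py -s planet-ways.osm --unabbreviated, plus whatever output-directory option the script already exposes.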
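
Likewise, country_and_languages moves from geodata.language_id.polygon_lookup onto the language rtree itself. A hedged sketch of the updated lookup, assuming the method returns the same (country, candidate_languages, language_props) triple the old free function did (that is how both call sites in this diff unpack it):

    # Sketch only: language_rtree is the rtree the script already builds in
    # __main__; latitude/longitude come from latlon_to_decimal as elsewhere.
    country, candidate_languages, language_props = \
        language_rtree.country_and_languages(latitude, longitude)

    if country and candidate_languages:
        # Each candidate dict carries at least 'lang' and 'default' keys (see the
        # removed pick_language code), so a default-language pick looks like:
        default_langs = [l['lang'] for l in candidate_languages if l.get('default')]
        language = default_langs[0] if len(default_langs) == 1 else None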