Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions


@@ -0,0 +1,233 @@
import random
import re
import six
from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex
LOWER, UPPER, TITLE, MIXED = range(4)
def token_capitalization(s):
if s.istitle():
return TITLE
elif s.islower():
return LOWER
elif s.isupper():
return UPPER
else:
return MIXED
expansion_token_regex = re.compile('([^ \-\.]+)([\.\- ]+|$)')
def recase_abbreviation(expansion, tokens, space_token=six.u(' ')):
expansion_tokens = expansion_token_regex.findall(expansion)
if len(tokens) > len(expansion_tokens) and all((token_capitalization(t) != LOWER for t, c in tokens)):
expansion_tokenized = tokenize(expansion)
is_acronym = len(expansion_tokenized) == 1 and expansion_tokenized[0][1] == token_types.ACRONYM
if len(expansion) <= 3 or is_acronym:
return expansion.upper()
else:
return expansion.title()
elif len(tokens) == len(expansion_tokens):
strings = []
for (t, c), (e, suf) in zip(tokens, expansion_tokens):
cap = token_capitalization(t)
if suf == six.u(' '):
suf = space_token
if cap == LOWER:
strings.append(six.u('').join((e.lower(), suf)))
elif cap == UPPER:
strings.append(six.u('').join((e.upper(), suf)))
elif cap == TITLE:
strings.append(six.u('').join((e.title(), suf)))
elif t.lower() == e.lower():
strings.append(t)
else:
strings.append(six.u('').join((e.title(), suf)))
return six.u('').join(strings)
else:
strings = []
for e, suf in expansion_tokens:
strings.append(e.title())
if suf == six.u(' '):
strings.append(space_token)
else:
strings.append(suf)
return six.u('').join(strings)
def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
'''
Abbreviations
-------------
OSM discourages abbreviations, but to make our training data map better
to real-world input, we can safely replace the canonical phrase with an
abbreviated version and retain the meaning of the words
'''
raw_tokens = tokenize_raw(s)
s_utf8 = safe_encode(s)
tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
n = len(tokens)
abbreviated = []
i = 0
def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
data = [d.split(six.b('|')) for d in data]
# local copy
abbreviated = []
n = len(t)
# Append the original tokens with whitespace if there is any
if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
for j, (t_i, c_i) in enumerate(t):
abbreviated.append(tokens[i + j][0])
if j < n - 1:
abbreviated.append(space_token)
return abbreviated
for lang, dictionary, is_canonical, canonical in data:
if lang not in (language, 'all'):
continue
is_canonical = int(is_canonical)
is_stopword = dictionary == 'stopword'
is_prefix = dictionary.startswith('concatenated_prefixes')
is_suffix = dictionary.startswith('concatenated_suffixes')
is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length
suffix = None
prefix = None
if not is_canonical:
continue
if not is_prefix and not is_suffix:
abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
# TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
# would require an audit of the dictionaries though so abbreviations are listed from
# left-to-right by frequency of usage
token = random.choice(abbreviations) if abbreviations else canonical
token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
abbreviated.append(token)
break
elif is_prefix:
token = tokens[i][0]
prefix, token = token[:length], token[length:]
abbreviated.append(prefix)
if random.random() < separate_prob:
sub_tokens = tokenize(token)
if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
token = six.u('').join((t for t, c in sub_tokens[1:]))
abbreviated.append(space_token)
if token.islower():
abbreviated.append(token.title())
else:
abbreviated.append(token)
abbreviated.append(space_token)
break
elif is_suffix:
token = tokens[i][0]
token, suffix = token[:-length], token[-length:]
concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])
separated_abbreviations = []
phrase = gazetteer.trie.get(suffix.rstrip('.'))
suffix_data = [safe_decode(d).split(six.u('|')) for d in (phrase or [])]
for l, d, _, c in suffix_data:
if l == lang and c == canonical:
separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d)))
separate = random.random() < separate_prob
if concatenated_abbreviations and not separate:
abbreviation = random.choice(concatenated_abbreviations)
elif separated_abbreviations:
abbreviation = random.choice(separated_abbreviations)
else:
abbreviation = canonical
if separate:
sub_tokens = tokenize(token)
if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
token = six.u('').join((t for t, c in sub_tokens[:-1]))
abbreviated.append(token)
if separate:
abbreviated.append(space_token)
if suffix.isupper():
abbreviated.append(abbreviation.upper())
elif separate:
abbreviated.append(abbreviation.title())
else:
abbreviated.append(abbreviation)
break
else:
for j, (t_i, c_i) in enumerate(t):
abbreviated.append(tokens[i + j][0])
if j < n - 1:
abbreviated.append(space_token)
return abbreviated
for t, c, length, data in gazetteer.filter(norm_tokens):
if c == token_types.PHRASE:
abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
abbreviated.extend(abbrev_tokens)
if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
abbreviated.append(six.u(' '))
i += len(t)
else:
token = tokens[i][0]
if not non_breaking_dash_regex.search(token):
abbreviated.append(token)
else:
sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]
sub_token_abbreviated = []
sub_i = 0
sub_n = len(sub_tokens)
for t, c, length, data in gazetteer.filter(sub_tokens_norm):
if c == token_types.PHRASE:
abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
sub_token_abbreviated.extend(abbrev_tokens)
sub_i += len(t)
if sub_i < sub_n:
if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
sub_token_abbreviated.append(six.u('.'))
sub_token_abbreviated.append(six.u('-'))
else:
sub_token_abbreviated.append(sub_tokens[sub_i][0])
sub_i += 1
if sub_i < sub_n:
sub_token_abbreviated.append(six.u('-'))
abbreviated.append(six.u('').join(sub_token_abbreviated))
if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
abbreviated.append(six.u(' '))
i += 1
return six.u('').join(abbreviated).strip()
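
A minimal usage sketch of abbreviate() follows (not part of this commit); the import path and the presence of "Street"/"St" in the English street_types dictionary are assumptions, and the output is random by design.

# Hypothetical usage sketch; requires the geodata package and its dictionary resources.
from geodata.address_expansions.gazetteers import street_types_gazetteer
from geodata.address_expansions.abbreviations import abbreviate  # module name assumed
# With abbreviate_prob=0.3 the canonical phrase is usually kept; otherwise a
# dictionary abbreviation such as "St" may be substituted and recased.
print(abbreviate(street_types_gazetteer, u'West Main Street', 'en'))
# e.g. u'West Main St' or u'W Main Street', depending on the random draws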


@@ -0,0 +1,254 @@
import os
import sys
from collections import defaultdict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode
ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'dictionaries')
ADDRESS_HEADER_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_rule.h')
ADDRESS_DATA_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_data.c')
address_language_index_template = u'{{{language}, {index}, {length}}}'
address_expansion_rule_template = u'{{{phrase}, {num_dictionaries}, {{{dictionaries}}}, {canonical_index}}}'
address_expansion_rule_header_template = u'''
#ifndef ADDRESS_EXPANSION_RULE_H
#define ADDRESS_EXPANSION_RULE_H
#include <stdlib.h>
#include <stdint.h>
#include "constants.h"
#include "gazetteers.h"
#define MAX_DICTIONARY_TYPES {max_dictionary_types}
typedef struct address_expansion_rule {{
char *phrase;
uint32_t num_dictionaries;
dictionary_type_t dictionaries[MAX_DICTIONARY_TYPES];
int32_t canonical_index;
}} address_expansion_rule_t;
typedef struct address_language_index {{
char language[MAX_LANGUAGE_LEN];
uint32_t index;
size_t len;
}} address_language_index_t;
#endif
'''
address_expansion_data_file_template = u'''
char *canonical_strings[] = {{
{canonical_strings}
}};
address_expansion_rule_t expansion_rules[] = {{
{expansion_rules}
}};
address_language_index_t expansion_languages[] = {{
{address_languages}
}};
'''
gazetteer_types = {
'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
'building_types': 'DICTIONARY_BUILDING_TYPE',
'categories': 'DICTIONARY_CATEGORY',
'categories_plural': 'DICTIONARY_CATEGORY_PLURAL',
'chains': 'DICTIONARY_CHAIN',
'company_types': 'DICTIONARY_COMPANY_TYPE',
'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
'cross_streets': 'DICTIONARY_CROSS_STREET',
'directionals': 'DICTIONARY_DIRECTIONAL',
'elisions': 'DICTIONARY_ELISION',
'entrances': 'DICTIONARY_ENTRANCE',
'given_names': 'DICTIONARY_GIVEN_NAME',
'house_numbers': 'DICTIONARY_HOUSE_NUMBER',
'level_types_basement': 'DICTIONARY_LEVEL_BASEMENT',
'level_types_mezzanine': 'DICTIONARY_LEVEL_MEZZANINE',
'level_types_numbered': 'DICTIONARY_LEVEL_NUMBERED',
'level_types_standalone': 'DICTIONARY_LEVEL_STANDALONE',
'level_types_sub_basement': 'DICTIONARY_LEVEL_SUB_BASEMENT',
'near': 'DICTIONARY_NEAR',
'no_number': 'DICTIONARY_NO_NUMBER',
'number': 'DICTIONARY_NUMBER',
'nulls': 'DICTIONARY_NULL',
'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
'people': 'DICTIONARY_NAMED_PERSON',
'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
'place_names': 'DICTIONARY_PLACE_NAME',
'post_office': 'DICTIONARY_POST_OFFICE',
'postcodes': 'DICTIONARY_POSTAL_CODE',
'qualifiers': 'DICTIONARY_QUALIFIER',
'staircases': 'DICTIONARY_STAIRCASE',
'stopwords': 'DICTIONARY_STOPWORD',
'street_names': 'DICTIONARY_STREET_NAME',
'street_types': 'DICTIONARY_STREET_TYPE',
'surnames': 'DICTIONARY_SURNAME',
'synonyms': 'DICTIONARY_SYNONYM',
'toponyms': 'DICTIONARY_TOPONYM',
'unit_directions': 'DICTIONARY_UNIT_DIRECTION',
'unit_types_numbered': 'DICTIONARY_UNIT_NUMBERED',
'unit_types_standalone': 'DICTIONARY_UNIT_STANDALONE',
}
class InvalidAddressFileException(Exception):
pass
def read_dictionary_file(path):
for i, line in enumerate(open(path)):
line = safe_decode(line.rstrip())
if not line.strip():
continue
if u'}' in line:
raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
phrases = line.split(u'|')
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
yield phrases
def quote_string(s):
return u'"{}"'.format(safe_decode(s).replace('\\', '\\\\').replace('"', '\\"'))
class AddressPhraseDictionaries(object):
def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
self.base_dir = base_dir
self.languages = []
self.language_dictionaries = defaultdict(list)
self.phrases = defaultdict(list)
for language in os.listdir(base_dir):
language_dir = os.path.join(base_dir, language)
if not os.path.isdir(language_dir):
continue
self.languages.append(language)
for filename in os.listdir(language_dir):
if not filename.endswith('.txt'):
raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
dictionary_name = filename.split('.')[0].lower()
if dictionary_name not in gazetteer_types:
raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
self.language_dictionaries[language].append(dictionary_name)
path = os.path.join(language_dir, filename)
for i, line in enumerate(open(path)):
line = safe_decode(line.rstrip())
if not line.strip():
continue
if u'}' in line:
raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
phrases = line.split(u'|')
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
self.phrases[(language, dictionary_name)].append(phrases)
self.language_dictionaries = dict(self.language_dictionaries)
self.phrases = dict(self.phrases)
address_phrase_dictionaries = AddressPhraseDictionaries()
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
address_languages = []
expansion_rules = []
canonical_strings = []
max_dictionary_types = 0
for language in address_phrase_dictionaries.languages:
num_language_rules = 0
language_index = len(expansion_rules)
language_canonical_dictionaries = defaultdict(list)
canonical_indices = {}
for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
dictionary_type = gazetteer_types[dictionary_name]
for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
canonical = phrases[0]
if len(phrases) > 1:
canonical_index = canonical_indices.get(canonical, None)
if canonical_index is None:
canonical_index = len(canonical_strings)
canonical_strings.append(quote_string(canonical))
canonical_indices[canonical] = canonical_index
else:
canonical_index = -1
for i, p in enumerate(phrases):
language_canonical_dictionaries[p, canonical_index if i > 0 else -1].append(dictionary_type)
for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
rule_template = address_expansion_rule_template.format(phrase=quote_string(phrase),
num_dictionaries=str(len(dictionary_types)),
dictionaries=', '.join(dictionary_types),
canonical_index=canonical_index)
expansion_rules.append(rule_template)
num_language_rules += 1
address_languages.append(address_language_index_template.format(language=quote_string(language),
index=language_index,
length=num_language_rules))
header = address_expansion_rule_header_template.format(
max_dictionary_types=str(max_dictionary_types)
)
out = open(header_file, 'w')
out.write(safe_encode(header))
out.close()
data_file = address_expansion_data_file_template.format(
canonical_strings=u''',
'''.join(canonical_strings),
expansion_rules=u''',
'''.join(expansion_rules),
address_languages=u''',
'''.join(address_languages),
)
out = open(output_file, 'w')
out.write(safe_encode(data_file))
out.close()
if __name__ == '__main__':
if len(sys.argv) > 1:
input_dir = sys.argv[1]
else:
input_dir = ADDRESS_EXPANSIONS_DIR
create_address_expansion_rules_file(base_dir=input_dir, output_file=ADDRESS_DATA_FILE)
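
For illustration, rendering the two templates above by hand with a single assumed dictionary entry produces fragments of the following shape (the phrase and dictionary name are hypothetical):

# Hypothetical sketch of the generated C fragments.
rule = address_expansion_rule_template.format(
    phrase=quote_string(u'street'),              # assumed entry
    num_dictionaries=str(1),
    dictionaries='DICTIONARY_STREET_TYPE',
    canonical_index=-1)
# rule == u'{"street", 1, {DICTIONARY_STREET_TYPE}, -1}'
lang = address_language_index_template.format(
    language=quote_string('en'), index=0, length=1)
# lang == u'{"en", 0, 1}'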


@@ -0,0 +1,56 @@
import random
import re
import six
from itertools import izip
from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex
def canonicals_for_language(data, language):
canonicals = set()
for d in data:
lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
if language is None or lang == language:
canonicals.add(canonical)
return canonicals
def equivalent(s1, s2, gazetteer, language):
'''
Address/place equivalence
-------------------------
Tests whether two strings refer to the same phrase once known
abbreviations are treated as interchangeable with their canonical
expansions, e.g. "Market Street" vs. "Market St"
'''
tokens_s1 = normalized_tokens(s1)
tokens_s2 = normalized_tokens(s2)
abbreviated_s1 = list(abbreviations_gazetteer.filter(tokens_s1))
abbreviated_s2 = list(abbreviations_gazetteer.filter(tokens_s2))
if len(abbreviated_s1) != len(abbreviated_s2):
return False
for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
if c1 != token_types.PHRASE and c2 != token_types.PHRASE:
if t1 != t2:
return False
elif c1 == token_types.PHRASE and c2 == token_types.PHRASE:
canonicals_s1 = canonicals_for_language(d1, language)
canonicals_s2 = canonicals_for_language(d2, language)
if not canonicals_s1 & canonicals_s2:
return False
else:
return False
return True
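
A usage sketch for equivalent(); the module path and the expected results assume that the English dictionaries list "ave" as an abbreviation of "avenue".

# Hypothetical usage sketch; requires the geodata dictionary resources.
from geodata.address_expansions.gazetteers import abbreviations_gazetteer
from geodata.address_expansions.equivalence import equivalent  # module name assumed

equivalent(u'Fifth Avenue', u'Fifth Ave', abbreviations_gazetteer, 'en')     # True (assumed)
equivalent(u'Fifth Avenue', u'Sixth Avenue', abbreviations_gazetteer, 'en')  # False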


@@ -0,0 +1,260 @@
import os
import six
from collections import defaultdict, OrderedDict
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.encoding import safe_decode, safe_encode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize, token_types
from geodata.text.phrases import PhraseFilter
from geodata.enum import EnumValue
from marisa_trie import BytesTrie
DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
PREFIX_KEY = u'\x02'
SUFFIX_KEY = u'\x03'
POSSIBLE_ROMAN_NUMERALS = set(['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix',
'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix',
'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc',
'c', 'cc', 'ccc', 'cd', 'd', 'dc', 'dcc', 'dccc', 'cm',
'm', 'mm', 'mmm', 'mmmm'])
class DictionaryPhraseFilter(PhraseFilter):
serialize = safe_encode
deserialize = safe_decode
def __init__(self, *dictionaries):
self.dictionaries = dictionaries
self.canonicals = {}
kvs = defaultdict(OrderedDict)
for language in address_phrase_dictionaries.languages:
for dictionary_name in self.dictionaries:
is_suffix_dictionary = 'suffixes' in dictionary_name
is_prefix_dictionary = 'prefixes' in dictionary_name
for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
canonical = phrases[0]
canonical_normalized = normalize_string(canonical)
self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]
for i, phrase in enumerate(phrases):
if phrase in POSSIBLE_ROMAN_NUMERALS:
continue
is_canonical = normalize_string(phrase) == canonical_normalized
if is_suffix_dictionary:
phrase = SUFFIX_KEY + phrase[::-1]
elif is_prefix_dictionary:
phrase = PREFIX_KEY + phrase
kvs[phrase][(language, dictionary_name, canonical)] = is_canonical
kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]
self.trie = BytesTrie(kvs)
def serialize(self, s):
return s
def deserialize(self, s):
return s
def search_substring(self, s):
if len(s) == 0:
return None, 0
for i in xrange(len(s) + 1):
if not self.trie.has_keys_with_prefix(s[:i]):
i -= 1
break
if i > 0:
return (self.trie.get(s[:i]), i)
else:
return None, 0
def search_suffix(self, token):
suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
if suffix_len > 0:
suffix_len -= len(SUFFIX_KEY)
return suffix_search, suffix_len
def search_prefix(self, token):
prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
if prefix_len > 0:
prefix_len -= len(PREFIX_KEY)
return prefix_search, prefix_len
def basic_filter(self, tokens):
return super(DictionaryPhraseFilter, self).filter(tokens)
def filter(self, tokens):
for p, t, data in self.basic_filter(tokens):
if not p:
t, c = t
token = t
token_len = len(token)
suffix_search, suffix_len = self.search_suffix(token)
if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
continue
prefix_search, prefix_len = self.search_prefix(token)
if prefix_search and self.trie.get(token[:prefix_len]):
yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
continue
else:
c = token_types.PHRASE
yield t, c, len(t), map(safe_decode, data)
def gen_phrases(self, s, canonical_only=False, languages=None):
tokens = tokenize(s)
norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]
if not languages:
languages = None
elif not hasattr(languages, '__iter__'):
languages = [languages]
if not hasattr(languages, '__contains__'):
languages = set(languages)
for t, c, length, data in self.filter(norm_tokens):
if c == token_types.PHRASE:
if not canonical_only and languages is None:
yield six.u(' ').join([t_i for t_i, c_i in t])
else:
phrase = None
for d in data:
lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages or lang == 'all'):
phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t])
yield phrase
def string_contains_phrases(self, s, canonical_only=False, languages=None):
phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
try:
phrases.next()
return True
except StopIteration:
return False
def extract_phrases(self, s, canonical_only=False, languages=None):
return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))
STREET_TYPES_ONLY_DICTIONARIES = ('street_types',
'directionals',
'concatenated_suffixes_separable',
'concatenated_suffixes_inseparable',
'people',
'personal_suffixes',
'personal_titles',
)
STREET_TYPES_DICTIONARIES = STREET_TYPES_ONLY_DICTIONARIES + ('concatenated_prefixes_separable',
'organizations',
'qualifiers',
'stopwords',
)
GIVEN_NAME_DICTIONARY = 'given_names'
SURNAME_DICTIONARY = 'surnames'
CHAIN_DICTIONARY = 'chains'
SYNONYM_DICTIONARY = 'synonyms'
PERSONAL_NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
SURNAME_DICTIONARY,)
NAME_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',
'building_types',
'company_types',
'place_names',
'qualifiers',
'synonyms',
'toponyms',
)
QUALIFIERS_DICTIONARY = 'qualifiers'
HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number')
POSTCODE_DICTIONARIES = ('postcode',)
TOPONYMS_DICTIONARY = 'toponyms'
TOPONYM_ABBREVIATION_DICTIONARIES = ('qualifiers',
'directionals',
'personal_titles',
'synonyms',
)
UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement',
'level_types_mezzanine',
'level_types_numbered',
'level_types_standalone',
'level_types_sub_basement',
'number',
'post_office',
'unit_types_numbered',
'unit_types_standalone',
)
VENUE_NAME_DICTIONARIES = ('academic_degrees',
'building_types',
'chains',
'company_types',
'directionals',
'given_names',
'organizations',
'people',
'personal_suffixes',
'personal_titles',
'place_names',
'stopwords',
'surnames',
)
ALL_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + \
NAME_DICTIONARIES + \
UNIT_ABBREVIATION_DICTIONARIES + \
('no_number', 'nulls',)
_gazetteers = []
def create_gazetteer(*dictionaries):
g = DictionaryPhraseFilter(*dictionaries)
_gazetteers.append(g)
return g
street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
street_types_only_gazetteer = create_gazetteer(*STREET_TYPES_ONLY_DICTIONARIES)
qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY)
names_gazetteer = create_gazetteer(*NAME_DICTIONARIES)
chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, )))
abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES)
toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES)
toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY)
given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY)
venue_names_gazetteer = create_gazetteer(*VENUE_NAME_DICTIONARIES)
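
A sketch of phrase extraction with the gazetteers defined above; the concrete results depend entirely on the bundled dictionaries.

# Hypothetical usage sketch.
street_types_gazetteer.string_contains_phrases(u'Market Street', languages='en')
# True, assuming "street" appears in the English street_types dictionary
phrases = abbreviations_gazetteer.extract_phrases(u'123 W Market St', languages='en')
# a set of the lower-cased phrases recognized in the string, e.g. set([u'w', u'st'])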


@@ -0,0 +1,29 @@
import six
from collections import defaultdict
class Aliases(object):
def __init__(self, aliases):
self.aliases = aliases
self.priorities = {k: i for i, k in enumerate(aliases)}
def key_priority(self, key):
return self.priorities.get(key, len(self.priorities))
def get(self, key, default=None):
return self.aliases.get(key, default)
def replace(self, components):
replacements = defaultdict(list)
values = {}
for k in list(components):
new_key = self.aliases.get(k)
if new_key and new_key not in components:
value = components.pop(k)
values[k] = value
replacements[new_key].append(k)
for key, source_keys in six.iteritems(replacements):
source_keys.sort(key=self.key_priority)
value = values[source_keys[0]]
components[key] = value
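
A small sketch of how Aliases.replace rewrites a component dictionary; the alias keys here mirror the table used by AddressFormatter later in this commit.

# Hypothetical usage sketch.
from collections import OrderedDict
aliases = Aliases(OrderedDict([('street', 'road'), ('street_name', 'road')]))
components = {'street_name': u'Calle Mayor', 'house_number': u'7'}
aliases.replace(components)
# components is now {'road': u'Calle Mayor', 'house_number': u'7'}.
# If both 'street' and 'street_name' were present, 'street' would win,
# since it appears earlier in the alias ordering.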


@@ -0,0 +1,924 @@
# -*- coding: utf-8 -*-
import copy
import os
import pystache
import random
import re
import six
import subprocess
import yaml
from collections import OrderedDict, defaultdict
from itertools import ifilter
from geodata.address_formatting.aliases import Aliases
from geodata.configs.utils import nested_get, recursive_merge
from geodata.math.floats import isclose
from geodata.math.sampling import weighted_choice, cdf
from geodata.text.tokenize import tokenize, tokenize_raw, token_types
from geodata.encoding import safe_decode
FORMATTER_GIT_REPO = 'https://github.com/OpenCageData/address-formatting'
this_dir = os.path.realpath(os.path.dirname(__file__))
FORMATTER_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'formatting', 'global.yaml')
class AddressFormatter(object):
'''
Approximate Python port of lokku's Geo::Address::Formatter
Usage:
address_formatter = AddressFormatter()
components = {
'house': u'Anticafé',
'house_number': '2',
'road': u'Calle de la Unión',
'postcode': '28013',
'city': u'Madrid',
}
country = 'es'
language = 'es'
address_formatter.format_address(components, country, language)
'''
whitespace_component_regex = re.compile('[\r\n]+[\s\r\n]*')
splitter = ' | '
separator_tag = 'SEP'
field_separator_tag = 'FSEP'
CATEGORY = 'category'
NEAR = 'near'
ATTENTION = 'attention'
CARE_OF = 'care_of'
HOUSE = 'house'
HOUSE_NUMBER = 'house_number'
PO_BOX = 'po_box'
ROAD = 'road'
BUILDING = 'building'
ENTRANCE = 'entrance'
STAIRCASE = 'staircase'
LEVEL = 'level'
UNIT = 'unit'
INTERSECTION = 'intersection'
SUBDIVISION = 'subdivision'
METRO_STATION = 'metro_station'
SUBURB = 'suburb'
CITY_DISTRICT = 'city_district'
CITY = 'city'
ISLAND = 'island'
STATE = 'state'
STATE_DISTRICT = 'state_district'
POSTCODE = 'postcode'
COUNTRY_REGION = 'country_region'
COUNTRY = 'country'
WORLD_REGION = 'world_region'
component_order = {k: i for i, k in enumerate([
CATEGORY,
NEAR,
ATTENTION,
CARE_OF,
HOUSE,
PO_BOX,
HOUSE_NUMBER,
BUILDING,
ENTRANCE,
STAIRCASE,
LEVEL,
UNIT,
ROAD,
INTERSECTION,
SUBDIVISION,
METRO_STATION,
SUBURB,
CITY,
CITY_DISTRICT,
ISLAND,
STATE,
STATE_DISTRICT,
POSTCODE,
COUNTRY_REGION,
COUNTRY,
WORLD_REGION,
])}
BOUNDARY_COMPONENTS_ORDERED = [
SUBDIVISION,
METRO_STATION,
SUBURB,
CITY_DISTRICT,
CITY,
ISLAND,
STATE_DISTRICT,
STATE,
COUNTRY_REGION,
COUNTRY,
WORLD_REGION,
]
BOUNDARY_COMPONENTS = set(BOUNDARY_COMPONENTS_ORDERED)
SUB_BUILDING_COMPONENTS = {
ENTRANCE,
STAIRCASE,
LEVEL,
UNIT,
}
STREET_COMPONENTS = {
HOUSE_NUMBER,
ROAD,
}
ADDRESS_LEVEL_COMPONENTS = STREET_COMPONENTS | SUB_BUILDING_COMPONENTS
NAME_COMPONENTS = {
ATTENTION,
CARE_OF,
HOUSE,
}
address_formatter_fields = set(component_order)
aliases = Aliases(
OrderedDict([
('street', ROAD),
('street_name', ROAD),
('hamlet', CITY),
('village', CITY),
('neighborhood', SUBURB),
('neighbourhood', SUBURB),
('city_district', CITY_DISTRICT),
('county', STATE_DISTRICT),
('state_code', STATE),
('country_name', COUNTRY),
('continent', WORLD_REGION),
('postal_code', POSTCODE),
('post_code', POSTCODE),
])
)
category_template = '{{{category}}} {{{near}}} {{{place}}}'
chain_template = '{{{house}}} {{{near}}} {{{place}}}'
intersection_template = '{{{road1}}} {{{intersection}}} {{{road2}}} {{{place}}}'
template_address_parts = [HOUSE, HOUSE_NUMBER, ROAD]
template_admin_parts = [CITY, STATE, COUNTRY]
template_address_parts_re = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in template_address_parts]))
template_admin_parts_re = re.compile('|'.join(['\{{{key}\}}'.format(key=key) for key in template_admin_parts]))
MINIMAL_COMPONENT_KEYS = [
(ROAD, HOUSE_NUMBER),
(ROAD, HOUSE),
(ROAD, POSTCODE)
]
FIRST, BEFORE, AFTER, LAST = range(4)
def __init__(self, scratch_dir='/tmp', splitter=None):
if splitter is not None:
self.splitter = splitter
self.formatter_repo_path = os.path.join(scratch_dir, 'address-formatting')
self.clone_repo()
self.load_config()
self.load_country_formats()
self.language_code_replacements = self.config['language_code_replacements']
self.setup_insertion_probabilities()
self.setup_no_name_templates()
self.setup_place_only_templates()
self.template_cache = {}
self.parsed_cache = {}
def clone_repo(self):
subprocess.check_call(['rm', '-rf', self.formatter_repo_path])
subprocess.check_call(['git', 'clone', FORMATTER_GIT_REPO, self.formatter_repo_path])
def load_country_formats(self):
config = yaml.load(open(os.path.join(self.formatter_repo_path,
'conf', 'countries', 'worldwide.yaml')))
self.country_aliases = {}
self.house_number_ordering = {}
for key in list(config):
country = key
language = None
if '_' in key:
country, language = country.split('_', 1)
value = config[key]
if hasattr(value, 'items'):
address_template = value.get('address_template')
if not address_template and 'use_country' in value:
# Temporary fix for Norway territories (NO unquoted is a boolean) and recursive references
if value['use_country'] in (country, False):
continue
self.country_aliases[country] = value['use_country']
address_template = config[value['use_country']]['address_template']
if address_template:
value['address_template'] = self.add_postprocessing_tags(address_template, country, language=language)
post_format_replacements = value.get('postformat_replace')
if post_format_replacements:
value['postformat_replace'] = [[pattern, replacement.replace('$', '\\')] for pattern, replacement in post_format_replacements]
else:
address_template = value
config[country] = self.add_postprocessing_tags(value, country, language=language)
try:
house_number_index = address_template.index(self.tag_token(self.HOUSE_NUMBER))
road_index = address_template.index(self.tag_token(self.ROAD))
if house_number_index < road_index:
self.house_number_ordering[key.lower()] = -1
else:
self.house_number_ordering[key.lower()] = 1
except ValueError:
self.house_number_ordering[key.lower()] = 0
self.country_formats = config
def load_config(self):
config = yaml.load(open(FORMATTER_CONFIG))
self.config = config.get('global', {})
language_configs = config.get('languages', {})
self.language_configs = {}
for language in language_configs:
language_config = language_configs[language]
config_copy = copy.deepcopy(self.config)
self.language_configs[language] = recursive_merge(config_copy, language_config)
country_configs = config.get('countries', {})
self.country_configs = {}
for country in country_configs:
country_config = country_configs[country]
config_copy = copy.deepcopy(self.config)
self.country_configs[country] = recursive_merge(config_copy, country_config)
def get_property(self, keys, country, language=None, default=None):
if isinstance(keys, six.string_types):
keys = keys.split('.')
keys = tuple(keys)
value = nested_get(self.language_configs, (language,) + keys, default=default)
if not value:
value = nested_get(self.country_configs, (country,) + keys, default=default)
if not value:
value = nested_get(self.config, keys, default=default)
return value
def insertion_distribution(self, insertions):
values = []
probs = []
for k, v in six.iteritems(insertions):
if k == 'conditional' or not v:
continue
if 'before' in v:
val = (self.BEFORE, v['before'])
elif 'after' in v:
val = (self.AFTER, v['after'])
elif 'last' in v:
val = (self.LAST, None)
elif 'first' in v:
val = (self.FIRST, None)
else:
raise ValueError('Insertions must contain one of {{first, before, after, last}}. Value was: {}'.format(v))
prob = v['probability']
values.append(val)
probs.append(prob)
# If the probabilities don't sum to 1, add a "do nothing" action
if not isclose(sum(probs), 1.0):
probs.append(1.0 - sum(probs))
values.append((None, None))
return values, cdf(probs)
def insertion_probs(self, config):
component_insertions = {}
for component, insertions in six.iteritems(config):
component_insertions[component] = self.insertion_distribution(insertions)
return component_insertions
def inverted(self, template):
lines = template.split(six.u('\n'))
return six.u('\n').join(reversed(lines))
def house_number_before_road(self, country, language=None):
key = value = None
if language is not None:
key = six.u('_').join((country.lower(), language.lower()))
if key in self.house_number_ordering:
value = self.house_number_ordering[key]
if value is None:
key = country
if key in self.house_number_ordering:
value = self.house_number_ordering[key]
if value is None:
value = 0
if value <= 0:
return True
else:
return False
def conditional_insertion_probs(self, conditionals):
conditional_insertions = defaultdict(OrderedDict)
for component, value in six.iteritems(conditionals):
if 'conditional' in value:
conditionals = value['conditional']
for c in conditionals:
other = c['component']
conditional_insertions[component][other] = self.insertion_distribution(c['probabilities'])
return conditional_insertions
def setup_insertion_probabilities(self):
config = self.config['insertions']
self.global_insertions = self.insertion_probs(config)
self.global_conditionals = self.conditional_insertion_probs(config)
self.global_invert_probability = self.config.get('invert_probability', 0.0)
self.country_insertions = {}
self.country_conditionals = {}
self.country_invert_probabilities = {}
for country, config in six.iteritems(self.country_configs):
if 'insertions' in config:
self.country_insertions[country.lower()] = self.insertion_probs(config['insertions'])
self.country_conditionals[country.lower()] = self.conditional_insertion_probs(config['insertions'])
if 'invert_probability' in config:
self.country_invert_probabilities[country] = config['invert_probability']
self.language_insertions = {}
self.language_conditionals = {}
for language, config in six.iteritems(self.language_configs):
if 'insertions' in config:
self.language_insertions[language.lower()] = self.insertion_probs(config['insertions'])
self.language_conditionals[language.lower()] = self.conditional_insertion_probs(config['insertions'])
def setup_no_name_templates(self):
self.templates_no_name = {}
for country, config in six.iteritems(self.country_formats):
if hasattr(config, 'items') and 'address_template' in config:
address_template = self.remove_components(config['address_template'], self.NAME_COMPONENTS)
self.templates_no_name[country] = address_template
def setup_place_only_templates(self):
self.templates_place_only = {}
for country, config in six.iteritems(self.country_formats):
if hasattr(config, 'items') and 'address_template' in config:
address_template = self.remove_components(config['address_template'], self.NAME_COMPONENTS | self.ADDRESS_LEVEL_COMPONENTS)
self.templates_place_only[country] = address_template
def country_template(self, c):
return self.country_formats.get(c, self.country_formats['default'])
def is_reverse(self, template):
address_parts_match = self.template_address_parts_re.search(template)
admin_parts_match = list(self.template_admin_parts_re.finditer(template))
# last instance of city/state/country occurs before the first instance of house_number/road
return admin_parts_match[-1].start() < address_parts_match.start()
def build_first_of_template(self, keys):
""" For constructing """
return '{{{{#first}}}} {keys} {{{{/first}}}}'.format(keys=' || '.join(['{{{{{{{key}}}}}}}'.format(key=key) for key in keys]))
def tag_token(self, key):
return '{{{{{{{key}}}}}}}'.format(key=key)
def remove_components(self, template, tags):
new_components = []
tags = set(tags)
parsed = pystache.parse(safe_decode(template))
last_removed = False
for i, el in enumerate(parsed._parse_tree):
if hasattr(el, 'parsed'):
keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key') and e.key not in tags]
if keys:
new_components.append(self.build_first_of_template(keys))
last_removed = False
else:
last_removed = True
elif hasattr(el, 'key'):
if el.key not in tags:
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
last_removed = False
else:
last_removed = True
elif not last_removed:
new_components.append(el)
else:
last_removed = False
return ''.join(new_components).strip()
def insert_component(self, template, tag, before=None, after=None, first=False, last=False,
separate=True, is_reverse=False, exact_order=True):
if not before and not after and not first and not last:
return
template = template.rstrip()
if not exact_order:
first_template_regex = re.compile(six.u('{{#first}}.*?{{/first}}'), re.UNICODE)
sans_firsts = first_template_regex.sub(six.u(''), template)
tag_match = re.compile(self.tag_token(tag)).search(sans_firsts)
if before:
before_match = re.compile(self.tag_token(before)).search(sans_firsts)
if before_match and tag_match and before_match.start() > tag_match.start():
return template
if after:
after_match = re.compile(self.tag_token(after)).search(sans_firsts)
if after_match and tag_match and tag_match.start() > after_match.start():
return template
key_added = False
skip_next_non_token = False
new_components = []
tag_token = self.tag_token(tag)
parsed = pystache.parse(safe_decode(template))
num_tokens = len(parsed._parse_tree)
for i, el in enumerate(parsed._parse_tree):
if hasattr(el, 'parsed'):
keys = [e.key for e in el.parsed._parse_tree if hasattr(e, 'key')]
if (before in set(keys) or first) and not key_added:
token = new_components[-1] if new_components and '{' not in new_components[-1] else '\n'
new_components.extend([tag_token, token])
key_added = True
keys = [k for k in keys if self.aliases.get(k, k) != tag]
if keys:
new_components.append(self.build_first_of_template(keys))
else:
while new_components and '{' not in new_components[-1]:
new_components.pop()
continue
if (after in set(keys) or i == num_tokens - 1) and not key_added:
token = '\n'
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
token = parsed._parse_tree[i + 1]
new_components.extend([token, tag_token])
key_added = True
elif hasattr(el, 'key'):
if el.key == tag:
if i == num_tokens - 1 and last:
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
skip_next_non_token = True
continue
if (el.key == before or first) and not key_added:
token = '\n'
if new_components and '{' not in new_components[-1]:
token = new_components[-1]
new_components.extend([tag_token, token])
key_added = True
new_components.append('{{{{{{{key}}}}}}}'.format(key=el.key))
if (el.key == after or i == num_tokens - 1) and not key_added:
token = '\n'
if i < num_tokens - 1 and isinstance(parsed._parse_tree[i + 1], six.string_types):
token = parsed._parse_tree[i + 1]
new_components.extend([token, tag_token])
key_added = True
elif not skip_next_non_token:
new_components.append(el)
if i == num_tokens - 1 and not key_added:
key_added = True
new_components.append(tag_token)
skip_next_non_token = False
return ''.join(new_components)
def add_postprocessing_tags(self, template, country, language=None):
is_reverse = self.is_reverse(template)
i = None
pivot = None
pivot_keys = (AddressFormatter.CITY, AddressFormatter.STATE, AddressFormatter.COUNTRY)
for component in pivot_keys:
token = self.tag_token(component)
if token in template:
i = self.BOUNDARY_COMPONENTS_ORDERED.index(component)
pivot = component
break
if i is None:
raise ValueError('Template {} does not contain one of {{{}}}'.format(country, ','.join(pivot_keys)))
prev = pivot
if i > 1:
for component in self.BOUNDARY_COMPONENTS_ORDERED[i - 1:0:-1]:
kw = {'before': prev} if not is_reverse else {'after': prev}
template = self.insert_component(template, component, exact_order=False, **kw)
prev = component
prev = pivot
if i < len(self.BOUNDARY_COMPONENTS_ORDERED) - 1:
for component in self.BOUNDARY_COMPONENTS_ORDERED[i + 1:]:
kw = {'after': prev} if not is_reverse else {'before': prev}
template = self.insert_component(template, component, exact_order=False, **kw)
prev = component
return template
def render_template(self, template, components, tagged=False):
def render_first(text):
text = pystache.render(text, **components)
splits = (e.strip() for e in text.split('||'))
selected = next(ifilter(bool, splits), '')
return selected
output = pystache.render(template, first=render_first,
**components).strip()
values = self.whitespace_component_regex.split(output)
splitter = self.splitter if not tagged else ' {}/{} '.format(self.splitter.strip(), self.field_separator_tag)
values = [self.strip_component(val, tagged=tagged) for val in values]
output = splitter.join([
val for val in values if val.strip()
])
return output
def minimal_components(self, components):
for component_list in self.MINIMAL_COMPONENT_KEYS:
if all((c in components for c in component_list)):
return True
return False
def post_replacements(self, template, text):
components = []
seen = set()
for component in text.split(self.splitter):
component = component.strip()
if component not in seen:
components.append(component)
seen.add(component)
text = self.splitter.join(components)
post_format_replacements = template.get('postformat_replace')
if post_format_replacements:
for regex, replacement in post_format_replacements:
text = re.sub(regex, replacement, text)
return text
def revised_template(self, template, components, country, language=None):
if not template:
return None
country_language = None
if language:
country_language = '{}_{}'.format(country, language)
alias_country = self.country_aliases.get(country.upper(), country).lower()
for term in (country, country_language):
if term in self.country_insertions or term in self.country_conditionals:
break
else:
country = alias_country
cache_keys = []
invert_probability = self.country_invert_probabilities.get(country, self.global_invert_probability)
if random.random() < invert_probability:
cache_keys.append('inverted')
cache_key = tuple(sorted(cache_keys))
if cache_key in self.template_cache:
template = self.template_cache[cache_key]
else:
template = self.inverted(template)
self.template_cache[cache_key] = template
for component in sorted(components, key=self.component_order.get):
scope = country
insertions = nested_get(self.country_insertions, (country, component), default=None)
conditionals = nested_get(self.country_conditionals, (country, component), default=None)
if insertions is None and language:
insertions = nested_get(self.country_insertions, (country_language, component), default=None)
scope = country_language
if conditionals is None and language:
conditionals = nested_get(self.country_conditionals, (country_language, component), default=None)
if insertions is None and language:
insertions = nested_get(self.language_insertions, (language, component), default=None)
scope = 'lang:{}'.format(language)
if conditionals is None and language:
conditionals = nested_get(self.language_conditionals, (language, component), default=None)
if insertions is None:
insertions = nested_get(self.global_insertions, (component,), default=None)
scope = None
if conditionals is None:
conditionals = nested_get(self.global_conditionals, (component,), default=None)
if insertions is not None:
conditional_insertions = None
if conditionals is not None:
for k, v in six.iteritems(conditionals):
if k in components:
conditional_insertions = v
break
order, other = None, None
# Check the conditional probabilities first
if conditional_insertions is not None:
values, probs = conditional_insertions
order, other = weighted_choice(values, probs)
# If there are no conditional probabilities or the "default" value was chosen, sample from the marginals
if other is None:
values, probs = insertions
order, other = weighted_choice(values, probs)
# Even though we may change the value of "other" below, use
# the original cache key because changes from here on are
# deterministic and should be cached.
insertion_id = (scope, component, order, other)
cache_keys.append(insertion_id)
cache_key = tuple(sorted(cache_keys))
if cache_key in self.template_cache:
template = self.template_cache[cache_key]
continue
other_token = self.tag_token(other)
# Don't allow insertions between road and house_number
# This can happen if e.g. "level" is supposed to be inserted
# after house number assuming that it's a continental European
# address where house number comes after road. If in a previous
# insertion we were to swap house_number and road to create an
# English-style address, the final ordering would be
# house_number, unit, road, which we don't want. So effectively
# treat house_number and road as an atomic unit.
if other == self.HOUSE_NUMBER and component != self.ROAD:
road_tag = self.tag_token(self.ROAD)
house_number_tag = other_token
if house_number_tag in template and road_tag in template:
road_after_house_number = template.index(road_tag) > template.index(house_number_tag)
if road_after_house_number and order == self.AFTER:
other = self.ROAD
elif not road_after_house_number and order == self.BEFORE:
other = self.ROAD
elif other == self.ROAD and component != self.HOUSE_NUMBER:
house_number_tag = self.tag_token(self.HOUSE_NUMBER)
road_tag = other_token
if house_number_tag in template and road_tag in template:
road_before_house_number = template.index(road_tag) < template.index(house_number_tag)
if road_before_house_number and order == self.AFTER:
other = self.HOUSE_NUMBER
elif not road_before_house_number and order == self.BEFORE:
other = self.HOUSE_NUMBER
if order == self.BEFORE and other_token in template:
template = self.insert_component(template, component, before=other)
elif order == self.AFTER and other_token in template:
template = self.insert_component(template, component, after=other)
elif order == self.LAST:
template = self.insert_component(template, component, last=True)
elif order == self.FIRST:
template = self.insert_component(template, component, first=True)
else:
continue
self.template_cache[cache_key] = template
return template
def remove_repeat_template_separators(self, template):
return re.sub('(?:[\s]*([,;\-]/{})[\s]*){{2,}}'.format(self.separator_tag), r' \1 ', template)
def tag_template_separators(self, template):
template = re.sub(r'}\s*([,\-;])\s*', r'}} \1/{} '.format(self.separator_tag), template)
return template
def strip_component(self, value, tagged=False):
if not tagged:
comma = token_types.COMMA.value
hyphen = token_types.HYPHEN.value
start = end = 0
tokens = tokenize_raw(value.strip())
for token_start, token_length, token_type in tokens:
start = token_start
if token_type not in (comma, hyphen):
break
else:
start = token_start + token_length
for token_start, token_length, token_type in reversed(tokens):
end = token_start + token_length
if token_type not in (comma, hyphen):
break
else:
end = token_start
return value[start:end]
else:
start = end = 0
tokens = value.split()
separator_tag = self.separator_tag
for i, t in enumerate(tokens):
t, c = t.rsplit('/', 1)
start = i
if c != separator_tag:
break
else:
start = i + 1
num_tokens = len(tokens)
for j, t in enumerate(reversed(tokens)):
t, c = t.rsplit('/', 1)
end = num_tokens - j
if c != separator_tag:
break
else:
end = num_tokens - j - 1
return six.u(' ').join(tokens[start:end])
def get_template_from_config(self, config, country, language=None):
template = None
if language:
language = self.language_code_replacements.get(language, language.split('_')[0])
# For countries like China and Japan where the country format varies
# based on which language is being used
template = config.get('{}_{}'.format(country.upper(), language.lower()), None)
if not template:
template = config.get(country.upper())
if not template:
return None
return template
def get_template(self, country, language=None):
return self.get_template_from_config(self.country_formats, country, language=language)
def get_no_name_template(self, country, language=None):
return self.get_template_from_config(self.templates_no_name, country, language=language)
def get_place_template(self, country, language=None):
return self.get_template_from_config(self.templates_place_only, country, language=language)
def tagged_tokens(self, name, label):
return six.u(' ').join([six.u('{}/{}').format(t.replace(' ', ''), label if t != ',' else self.separator_tag) for t, c in tokenize(name)])
def template_language_matters(self, country, language):
return '{}_{}'.format(country.upper(), language) in self.country_formats or '{}_{}'.format(country, language) in self.country_formats
def format_category_query(self, category_query, address_components, country, language, tag_components=True):
if tag_components:
components = {self.CATEGORY: self.tagged_tokens(category_query.category, self.CATEGORY)}
if category_query.prep is not None:
components[self.NEAR] = self.tagged_tokens(category_query.prep, self.NEAR)
else:
components = {self.CATEGORY: category_query.category}
if category_query.prep is not None:
components[self.NEAR] = category_query.prep
if category_query.add_place_name or category_query.add_address:
place_formatted = self.format_address(address_components, country, language=language,
minimal_only=False, tag_components=tag_components)
if not place_formatted:
return None
components['place'] = place_formatted
return self.render_template(self.category_template, components, tagged=tag_components)
def format_chain_query(self, chain_query, address_components, country, language, tag_components=True):
if tag_components:
components = {self.HOUSE: self.tagged_tokens(chain_query.name, self.HOUSE)}
if chain_query.prep is not None:
components[self.NEAR] = self.tagged_tokens(chain_query.prep, self.NEAR)
else:
components = {self.HOUSE: chain_query.name}
if chain_query.prep is not None:
components[self.NEAR] = chain_query.prep
if chain_query.add_place_name or chain_query.add_address:
place_formatted = self.format_address(address_components, country, language=language,
minimal_only=False, tag_components=tag_components)
if not place_formatted:
return None
components['place'] = place_formatted
return self.render_template(self.chain_template, components, tagged=tag_components)
def format_intersection(self, intersection_query, place_components, country, language, tag_components=True):
components = {}
if tag_components:
components = {'road1': self.tagged_tokens(intersection_query.road1, self.ROAD),
'intersection': self.tagged_tokens(intersection_query.intersection_phrase, self.INTERSECTION),
'road2': self.tagged_tokens(intersection_query.road2, self.ROAD),
}
else:
components = {'road1': intersection_query.road1,
'intersection': intersection_query.intersection_phrase,
'road2': intersection_query.road2}
if place_components:
place_formatted = self.format_address(place_components, country, language=language,
minimal_only=False, tag_components=tag_components)
if place_formatted:
components['place'] = place_formatted
return self.render_template(self.intersection_template, components, tagged=tag_components)
def format_address(self, components, country, language,
minimal_only=True, tag_components=True, replace_aliases=True):
if minimal_only and not self.minimal_components(components):
return None
template = self.get_template(country, language=language)
if not template:
return None
if not template or 'address_template' not in template:
return None
template_text = template['address_template']
template_text = self.revised_template(template_text, components, country, language=language)
if template_text is None:
return None
if tag_components:
template_text = self.tag_template_separators(template_text)
if template_text in self.parsed_cache:
template = self.parsed_cache[template_text]
else:
template = pystache.parse(template_text)
self.parsed_cache[template_text] = template
if replace_aliases:
self.aliases.replace(components)
if tag_components:
components = {k: self.tagged_tokens(v, k) for k, v in six.iteritems(components)}
text = self.render_template(template, components, tagged=tag_components)
text = self.remove_repeat_template_separators(text)
return text
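
A usage sketch based on the class docstring above; the exact rendering depends on the cloned address-formatting templates, and non-empty template lines are joined with the ' | ' splitter.

# Hypothetical usage sketch; the OpenCage templates are cloned in __init__.
formatter = AddressFormatter()
components = {'house_number': u'2', 'road': u'Calle de la Unión',
              'postcode': u'28013', 'city': u'Madrid'}
formatter.format_address(components, 'es', 'es', tag_components=False)
# e.g. u'Calle de la Unión 2 | 28013 Madrid' (illustrative only; actual output
# depends on the es template)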


@@ -0,0 +1,59 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent
from geodata.encoding import safe_decode
from geodata.configs.utils import nested_get
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
class Block(NumberedComponent):
max_blocks = 10
block_range = range(1, max_blocks + 1)
block_range_probs = zipfian_distribution(len(block_range), 2.0)
block_range_cdf = cdf(block_range_probs)
@classmethod
def random(cls, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('blocks.alphanumeric', language, country=country)
if num_type is None:
return None
if num_type == cls.NUMERIC:
number = weighted_choice(cls.block_range, cls.block_range_cdf)
return safe_decode(number)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet, 2.0)
if num_type == cls.ALPHA:
return safe_decode(letter)
else:
number = weighted_choice(cls.block_range, cls.block_range_cdf)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
@classmethod
def phrase(cls, block, language, country=None):
if block is None:
return None
phrase_prob = address_config.get_property('blocks.alphanumeric_phrase_probability', language, country=country, default=0.0)
if random.random() < phrase_prob:
return cls.numeric_phrase('blocks.alphanumeric', block, language,
dictionaries=['qualifiers'], country=country)
else:
return None
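
A sketch of generating and phrasing a block identifier, assuming the example language's address config defines blocks.alphanumeric (otherwise random() returns None); 'ja' is used here only as an illustrative language code.

# Hypothetical usage sketch.
block = Block.random('ja')           # e.g. u'3', u'B' or u'3B', chosen at random
phrase = Block.phrase(block, 'ja')   # a phrased block or None, per the config probabilities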

File diff suppressed because it is too large


@@ -0,0 +1,152 @@
import copy
import os
import six
import yaml
from collections import Mapping
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge, alternative_probabilities
from geodata.math.sampling import cdf, check_probability_distribution
this_dir = os.path.realpath(os.path.dirname(__file__))
ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'addresses')
DICTIONARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'dictionaries')
class AddressConfig(object):
def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
self.address_configs = {}
self.cache = {}
for filename in os.listdir(config_dir):
if not filename.endswith('.yaml'):
continue
config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))
countries = config.pop('countries', {})
for k in countries.keys():
country_config = countries[k]
config_copy = copy.deepcopy(config)
countries[k] = recursive_merge(config_copy, country_config)
config['countries'] = countries
lang = filename.rsplit('.yaml')[0]
self.address_configs[lang] = config
self.sample_phrases = {}
for language in address_phrase_dictionaries.languages:
for dictionary in address_phrase_dictionaries.language_dictionaries[language]:
self.sample_phrases[(language, dictionary)] = {}
for phrases in address_phrase_dictionaries.phrases[(language, dictionary)]:
self.sample_phrases[(language, dictionary)][phrases[0]] = phrases[1:]
def get_property(self, key, language, country=None, default=None):
keys = key.split('.')
config = self.address_configs.get(language, {})
if country:
country_config = config.get('countries', {}).get(country, {})
if country_config:
config = country_config
value = nested_get(config, keys)
if value is not DoesNotExist:
return value
return default
def cache_key(self, prop, language, dictionaries=(), country=None):
return (prop, language, country, tuple(dictionaries))
def alternative_probabilities(self, prop, language, dictionaries=(), country=None):
'''Get a probability distribution over alternatives'''
key = self.cache_key(prop, language, dictionaries, country=country)
if key not in self.cache:
properties = self.get_property(prop, language, country=country, default=None)
if properties is None:
return None, None
alternatives, probs = alternative_probabilities(properties)
if alternatives is None:
return None, None
forms = []
form_probs = []
for props, prob in zip(alternatives, probs):
phrases, phrase_probs = self.form_probabilities(props, language, dictionaries=dictionaries)
forms.extend([(p, props) for p in phrases])
form_probs.extend([prob * p for p in phrase_probs])
sample_probability = properties.get('sample_probability')
if sample_probability is not None:
sample_phrases = []
for dictionary in dictionaries:
phrases = self.sample_phrases.get((language, dictionary), [])
for canonical, surface_forms in six.iteritems(phrases):
sample_phrases.append(canonical)
sample_phrases.extend(surface_forms)
# Note: use the outer properties dictionary e.g. units.alphanumeric
forms.extend([(p, properties) for p in sample_phrases])
form_probs.extend([float(sample_probability) / len(sample_phrases)] * len(sample_phrases))
try:
check_probability_distribution(form_probs)
except AssertionError:
print 'values were: {}'.format(forms)
raise
form_probs_cdf = cdf(form_probs)
self.cache[key] = (forms, form_probs_cdf)
return self.cache[key]
def form_probabilities(self, properties, language, dictionaries=()):
probs = []
alternatives = []
canonical_prob = properties.get('canonical_probability', 1.0)
canonical = properties['canonical']
alternatives.append(canonical)
probs.append(canonical_prob)
if 'abbreviated_probability' in properties:
probs.append(properties['abbreviated_probability'])
abbreviated = properties['abbreviated']
assert isinstance(abbreviated, basestring)
alternatives.append(abbreviated)
if properties.get('sample', False) and 'sample_probability' in properties:
sample_prob = properties['sample_probability']
samples = set()
for dictionary in dictionaries:
phrases = self.sample_phrases.get((language, dictionary), {})
samples |= set(phrases.get(canonical, []))
if 'sample_exclude' in properties:
samples -= set(properties['sample_exclude'])
if samples:
for phrase in samples:
probs.append(sample_prob / float(len(samples)))
alternatives.append(phrase)
else:
total = sum(probs)
probs = [p / total for p in probs]
try:
check_probability_distribution(probs)
except AssertionError:
print 'values were: {}'.format(alternatives)
raise
return alternatives, probs
address_config = AddressConfig()
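if __name__ == '__main__':
    # Minimal usage sketch: assumes an 'en.yaml' exists under resources/addresses
    # and that it (or a 'us' country override) defines levels.numbering_starts_at;
    # otherwise the default of 0 is returned.
    starts_at = address_config.get_property('levels.numbering_starts_at', 'en', country='us', default=0)
    print('levels.numbering_starts_at for en/us: {}'.format(starts_at))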

View File

@@ -0,0 +1,37 @@
import six
from geodata.addresses.config import address_config
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice
class Conjunction(object):
DEFAULT_WHITESPACE_JOIN = ', '
DEFAULT_NON_WHITESPACE_JOIN = ''
key = 'and'
@classmethod
def join(cls, phrases, language, country=None):
if not hasattr(phrases, '__iter__'):
raise ValueError('Param phrases must be iterable')
values, probs = address_config.alternative_probabilities(cls.key, language, country=country)
phrase, props = weighted_choice(values, probs)
whitespace = props.get('whitespace', True)
whitespace_phrase = six.u(' ') if whitespace else six.u('')
phrases = [safe_decode(p) for p in phrases]
max_phrase_join = props.get('max_phrase_join', 2)
if len(phrases) > max_phrase_join:
default_join = safe_decode(props.get('default_join', cls.DEFAULT_WHITESPACE_JOIN if whitespace else cls.DEFAULT_NON_WHITESPACE_JOIN))
prefix = default_join.join(phrases[:-max_phrase_join] + [six.u('')])
else:
prefix = six.u('')
if whitespace:
phrase = six.u('{}{}{}').format(whitespace_phrase, phrase, whitespace_phrase)
joined_phrase = phrase.join(phrases[-max_phrase_join:])
return six.u('').join([prefix, joined_phrase])
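if __name__ == '__main__':
    # Minimal usage sketch: assumes the 'and' key is configured for 'en', so a
    # conjunction phrase (e.g. "and" or "&") is sampled and used to join the parts.
    print(Conjunction.join([u'Fl 2', u'Apt 3'], 'en', country='us'))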

View File

@@ -0,0 +1,19 @@
import random
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent
from geodata.encoding import safe_decode
class ConscriptionNumber(NumberedComponent):
@classmethod
def phrase(cls, number, language, country=None):
if number is None:
return number
key = 'conscription_numbers.alphanumeric'
dictionaries = ['house_numbers']
default = safe_decode(number)
return cls.numeric_phrase(key, safe_decode(number), language,
dictionaries=dictionaries, country=country)

View File

@@ -0,0 +1,42 @@
import operator
import six
from geodata.graph.topsort import topsort
class ComponentDependencies(object):
'''
Declare an address component and its dependencies e.g.
a house_number cannot be used in the absence of a road name.
'''
component_bit_values = {}
def __init__(self, graph):
self.dependencies = {}
self.all_values = long('1' * len(graph), 2)
self.dependency_order = [c for c in topsort(graph)]
for component, deps in six.iteritems(graph):
self.dependencies[component] = self.component_bitset(deps) if deps else self.all_values
def __getitem__(self, key):
return self.dependencies.__getitem__(key)
def __contains__(self, key):
return self.dependencies.__contains__(key)
@classmethod
def get_component_bit_value(cls, name):
val = cls.component_bit_values.get(name)
if val is None:
num_values = len(cls.component_bit_values)
val = 1 << num_values
cls.component_bit_values[name] = val
return val
@classmethod
def component_bitset(cls, components):
return reduce(operator.or_, [cls.get_component_bit_value(name) for name in components])
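if __name__ == '__main__':
    # Minimal sketch with a hypothetical two-component graph: house_number depends
    # on road, road depends on nothing (assumes topsort accepts a dict mapping
    # component -> list of dependencies).
    deps = ComponentDependencies({'road': [], 'house_number': ['road']})
    print(deps.dependency_order)
    print(deps['house_number'])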

View File

@@ -0,0 +1,37 @@
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumericPhrase
from geodata.math.sampling import weighted_choice
class RelativeDirection(NumericPhrase):
key = 'directions'
dictionaries = ['unit_directions']
class AnteroposteriorDirection(RelativeDirection):
key = 'directions.anteroposterior'
class LateralDirection(RelativeDirection):
key = 'directions.lateral'
class CardinalDirection(NumericPhrase):
key = 'cardinal_directions'
dictionaries = ['cardinal_directions']
class Direction(object):
CARDINAL = 'cardinal'
RELATIVE = 'relative'
@classmethod
def random(cls, language, country=None, cardinal_probability=0.5):
values = [cls.CARDINAL, cls.RELATIVE]
probs_cdf = [cardinal_probability, 1.0]
choice = weighted_choice(values, probs_cdf)
if choice == cls.CARDINAL:
return CardinalDirection.phrase(None, language, country=country)
else:
return RelativeDirection.phrase(None, language, country=country)
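if __name__ == '__main__':
    # Minimal sketch: samples either a cardinal or a relative direction phrase,
    # assuming the 'en' config defines the cardinal_directions and directions keys.
    print(Direction.random('en', country='us'))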

View File

@@ -0,0 +1,66 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.configs.utils import nested_get
from geodata.addresses.directions import RelativeDirection
from geodata.addresses.floors import Floor
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
class Entrance(NumberedComponent):
max_entrances = 10
entrance_range = range(1, max_entrances + 1)
entrance_range_probs = zipfian_distribution(len(entrance_range), 2.0)
entrance_range_cdf = cdf(entrance_range_probs)
@classmethod
def random(cls, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('entrances.alphanumeric', language, country=country)
if num_type is None:
return None
if num_type == cls.NUMERIC:
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
return safe_decode(number)
elif num_type == cls.HYPHENATED_NUMBER:
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
number2 = number + weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
return u'{}-{}'.format(number, number2)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet, 2.0)
if num_type == cls.ALPHA:
return safe_decode(letter)
else:
number = weighted_choice(cls.entrance_range, cls.entrance_range_cdf)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
whitespace_phrase = u''
r = random.random()
if r < whitespace_probability:
whitespace_phrase = u' '
elif r < (whitespace_probability + hyphen_probability):
whitespace_phrase = u'-'
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
@classmethod
def phrase(cls, entrance, language, country=None):
if entrance is None:
return None
return cls.numeric_phrase('entrances.alphanumeric', entrance, language,
dictionaries=['entrances'], country=country)
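if __name__ == '__main__':
    # Minimal sketch: generates a random entrance identifier and wraps it in a
    # phrase, assuming entrances.alphanumeric is configured for 'en' (both calls
    # return None when it is not).
    entrance = Entrance.random('en', country='us')
    print(Entrance.phrase(entrance, 'en', country='us'))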

View File

@@ -0,0 +1,165 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.numbers.spellout import numeric_expressions
class Floor(NumberedComponent):
# When we don't know the number of floors, use a Zipfian distribution
# to choose randomly between 1 and max_floors with 1 being much more
# likely than 2, etc.
max_floors = 10
max_basements = 2
numbered_floors = range(max_floors + 1) + range(-1, -max_basements - 1, -1)
floor_probs = zipfian_distribution(len(numbered_floors), 0.75)
floor_probs_cdf = cdf(floor_probs)
# For use with letters e.g. A0 is probably not as common
floors_letters = range(1, max_floors + 1) + [0]
floors_letters_probs = zipfian_distribution(len(floors_letters), 2.0)
floors_letters_cdf = cdf(floors_letters_probs)
@classmethod
def sample_floors(cls, num_floors, num_basements=0):
num_floors = int(num_floors)
return random.randint(-num_basements, (num_floors - 1) if num_floors > 0 else 0)
@classmethod
def sample_floors_range(cls, min_floor, max_floor):
return random.randint(min_floor, (max_floor - 1) if max_floor > min_floor else min_floor)
@classmethod
def random_int(cls, language, country=None, num_floors=None, num_basements=None):
number = None
if num_floors is not None:
try:
num_floors = int(num_floors)
except (ValueError, TypeError):
return weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)
if num_floors <= cls.max_floors:
number = cls.sample_floors(num_floors, num_basements=num_basements or 0)
else:
number = cls.sample_floors_range(cls.max_floors + 1, num_floors)
else:
number = weighted_choice(cls.numbered_floors, cls.floor_probs_cdf)
return number
@classmethod
def random_from_int(cls, number, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('levels.alphanumeric', language, country=country)
if num_type is None:
return None
numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
if number >= 0:
number += numbering_starts_at
if num_type == cls.NUMERIC:
return safe_decode(number)
elif num_type == cls.ROMAN_NUMERAL:
roman_numeral = numeric_expressions.roman_numeral(number)
if roman_numeral is not None:
return roman_numeral
else:
return safe_decode(number)
elif num_type == cls.HYPHENATED_NUMBER:
number2 = number + cls.sample_floors_range(1, cls.max_floors)
return u'{}-{}'.format(number, number2)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet)
if num_type == cls.ALPHA:
return letter
else:
number = weighted_choice(cls.floors_letters, cls.floors_letters_cdf)
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}').format(letter, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}').format(number, letter)
return None
@classmethod
def random(cls, language, country=None, num_floors=None, num_basements=None):
number = cls.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
return cls.random_from_int(number, language, country=country)
@classmethod
def phrase(cls, floor, language, country=None, num_floors=None):
if floor is None:
return None
integer_floor = False
floor = safe_decode(floor)
try:
floor = int(floor)
integer_floor = True
except (ValueError, TypeError):
try:
floor = float(floor)
integer_floor = int(floor) == floor
except (ValueError, TypeError):
return cls.numeric_phrase('levels.alphanumeric', floor, language,
dictionaries=['level_types_numbered'], country=country)
numbering_starts_at = int(address_config.get_property('levels.numbering_starts_at', language, country=country, default=0))
try:
num_floors = int(num_floors)
top_floor = num_floors if numbering_starts_at == 1 else num_floors - 1
is_top = num_floors and floor == top_floor
except (ValueError, TypeError):
is_top = False
alias_prefix = 'levels.aliases'
aliases = address_config.get_property(alias_prefix, language, country=country)
if aliases:
alias = None
if not integer_floor and floor >= 0 and 'half_floors' in aliases:
floor = int(floor)
alias = 'half_floors'
elif not integer_floor and floor < 0 and 'half_floors_negative' in aliases:
floor = int(floor)
alias = 'half_floors_negative'
elif floor < -1 and '<-1' in aliases:
alias = '<-1'
elif is_top and 'top' in aliases:
alias = 'top'
elif safe_decode(floor) in aliases:
alias = safe_decode(floor)
floor = safe_decode(floor)
if alias:
alias_props = aliases.get(alias)
# Aliases upon aliases, e.g. for something like "Upper Mezzanine"
# where it's an alias for "1" under the half_floors key
if safe_decode(floor) in alias_props.get('aliases', {}):
alias_prefix = '{}.{}.aliases'.format(alias_prefix, alias)
alias = safe_decode(floor)
if alias:
return cls.numeric_phrase('{}.{}'.format(alias_prefix, alias), floor, language,
dictionaries=['level_types_basement',
'level_types_mezzanine',
'level_types_numbered',
'level_types_standalone',
'level_types_sub_basement'],
country=country)
return cls.numeric_phrase('levels.alphanumeric', floor, language,
dictionaries=['level_types_numbered'], country=country)
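if __name__ == '__main__':
    # Minimal sketch: picks a floor for a hypothetical 5-storey building and
    # renders it, assuming levels.alphanumeric is configured for 'en' (returns
    # None otherwise).
    floor = Floor.random('en', country='us', num_floors=5)
    print(Floor.phrase(floor, 'en', country='us', num_floors=5))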

View File

@@ -0,0 +1,26 @@
import random
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent
from geodata.encoding import safe_decode
class HouseNumber(NumberedComponent):
@classmethod
def phrase(cls, number, language, country=None):
if number is not None:
prob_key = 'house_numbers.alphanumeric_phrase_probability'
key = 'house_numbers.alphanumeric'
dictionaries = ['house_numbers', 'number']
default = safe_decode(number)
else:
prob_key = 'house_numbers.no_number_probability'
key = 'house_numbers.no_number'
dictionaries = ['no_number']
default = None
phrase_prob = address_config.get_property(prob_key, language, country=country, default=0.0)
if random.random() < phrase_prob:
return cls.numeric_phrase(key, safe_decode(number), language,
dictionaries=dictionaries, country=country)
return default
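if __name__ == '__main__':
    # Minimal sketch: with house_numbers.alphanumeric_phrase_probability > 0 in
    # the 'en' config this can yield a phrase like "No 123"; with the default of
    # 0.0 it falls back to the plain number.
    print(HouseNumber.phrase(123, 'en', country='us'))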

View File

@@ -0,0 +1,24 @@
import random
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumericPhrase
from geodata.encoding import safe_decode
class MetroStationPhrase(NumericPhrase):
key = 'metro_stations.alphanumeric'
dictionaries = ['qualifiers']
class MetroStation(object):
@classmethod
def phrase(cls, station, language, country=None):
if station is None:
return None
phrase_prob = address_config.get_property('metro_stations.alphanumeric_phrase_probability', language, country=country, default=0.0)
if random.random() < phrase_prob:
return MetroStationPhrase.phrase(station, language, country=country)
return None

View File

@@ -0,0 +1,434 @@
# -*- coding: utf-8 -*-
import random
import six
from geodata.addresses.config import address_config
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.math.floats import isclose
from geodata.numbers.ordinals import ordinal_expressions
from geodata.numbers.spellout import numeric_expressions
from geodata.text.tokenize import tokenize, token_types
alphabets = {}
def sample_alphabet(alphabet, b=1.5):
'''
Sample an "alphabet" using a Zipfian distribution (frequent items are very
frequent, long tail of infrequent items). If we look at something like
unit numbers, "Unit A" or "Unit B" are much more likely than "Unit X" or
"Unit Z" simply because most dwellings only have a few units. Sampling
letters from a Zipfian distribution rather than uniformly means that instead
of every letter having the same likelihood (1/26), letters toward the beginning
of the alphabet are much more likely to be selected. Letters toward the end can
still be selected sometimes, but are not very likely.
Note letters don't necessarily need to be sorted alphabetically, just in order
of frequency.
'''
global alphabets
alphabet = tuple(alphabet)
if alphabet not in alphabets:
probs = zipfian_distribution(len(alphabet), b)
probs_cdf = cdf(probs)
alphabets[alphabet] = probs_cdf
probs_cdf = alphabets[alphabet]
return weighted_choice(alphabet, probs_cdf)
latin_alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
class Digits(object):
ASCII = 'ascii'
SPELLOUT = 'spellout'
UNICODE_FULL_WIDTH = 'unicode_full_width'
ROMAN_NUMERAL = 'roman_numeral'
CARDINAL = 'cardinal'
ORDINAL = 'ordinal'
unicode_full_width_map = {
'0': safe_decode('０'),
'1': safe_decode('１'),
'2': safe_decode('２'),
'3': safe_decode('３'),
'4': safe_decode('４'),
'5': safe_decode('５'),
'6': safe_decode('６'),
'7': safe_decode('７'),
'8': safe_decode('８'),
'9': safe_decode('９'),
}
full_width_digit_map = {
v: k for k, v in six.iteritems(unicode_full_width_map)
}
@classmethod
def rewrite_full_width(cls, s):
return six.u('').join([cls.unicode_full_width_map.get(c, c) for c in s])
@classmethod
def rewrite_standard_width(cls, s):
return six.u('').join([cls.full_width_digit_map.get(c, c) for c in s])
@classmethod
def rewrite_roman_numeral(cls, s):
roman_numeral = None
if s.isdigit():
roman_numeral = numeric_expressions.roman_numeral(s)
if roman_numeral:
return roman_numeral
else:
return s
@classmethod
def rewrite_spellout(cls, s, lang, num_type, props):
if s.isdigit():
num = int(s)
spellout = None
gender = props.get('gender')
category = props.get('category')
if num_type == cls.CARDINAL:
spellout = numeric_expressions.spellout_cardinal(num, lang, gender=gender, category=category)
elif num_type == cls.ORDINAL:
spellout = numeric_expressions.spellout_ordinal(num, lang, gender=gender, category=category)
if spellout:
return spellout.title()
return s
else:
return s
@classmethod
def rewrite(cls, d, lang, props, num_type=CARDINAL):
if not props:
return d
d = safe_decode(d)
values = []
probs = []
for digit_type in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL):
key = '{}_probability'.format(digit_type)
if key in props:
values.append(digit_type)
probs.append(props[key])
if not isclose(sum(probs), 1.0):
values.append(cls.ASCII)
probs.append(1.0 - sum(probs))
probs = cdf(probs)
digit_type = weighted_choice(values, probs)
if digit_type == cls.ASCII:
return d
elif digit_type == cls.SPELLOUT:
return cls.rewrite_spellout(d, lang, num_type, props)
elif digit_type == cls.ROMAN_NUMERAL:
roman_numeral = cls.rewrite_roman_numeral(d)
if random.random() < props.get('ordinal_suffix_probability', 0.0):
ordinal_suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
if ordinal_suffix:
roman_numeral = six.u('{}{}').format(roman_numeral, ordinal_suffix)
return roman_numeral
elif digit_type == cls.UNICODE_FULL_WIDTH:
return cls.rewrite_full_width(d)
else:
return d
class NumericPhrase(object):
key = None
NUMERIC = 'numeric'
NUMERIC_AFFIX = 'numeric_affix'
@classmethod
def pick_phrase_and_type(cls, number, language, country=None):
values, probs = address_config.alternative_probabilities(cls.key, language, dictionaries=cls.dictionaries, country=country)
if not values:
return None, safe_decode(number) if number is not None else None, None
phrase, phrase_props = weighted_choice(values, probs)
values = []
probs = []
for num_type in (cls.NUMERIC, cls.NUMERIC_AFFIX):
key = '{}_probability'.format(num_type)
prob = phrase_props.get(key, None)
if prob is not None:
values.append(num_type)
probs.append(prob)
if not probs:
num_type = cls.NUMERIC
else:
probs = cdf(probs)
num_type = weighted_choice(values, probs)
return num_type, phrase, phrase_props[num_type]
@classmethod
def combine_with_number(cls, number, phrase, num_type, props, whitespace_default=False):
if num_type == cls.NUMERIC_AFFIX:
phrase = props['affix']
if 'zero_pad' in props and number.isdigit():
number = number.rjust(props['zero_pad'], props.get('zero_char', '0'))
direction = props['direction']
whitespace = props.get('whitespace', whitespace_default)
whitespace_probability = props.get('whitespace_probability')
if whitespace_probability is not None:
whitespace = random.random() < whitespace_probability
if props.get('title_case', True):
# Title case unless the config specifies otherwise
phrase = phrase.title()
if number is None:
return phrase
whitespace_phrase = six.u(' ') if whitespace else six.u('')
# Phrase goes to the left of the number
if direction == 'left':
return six.u('{}{}{}').format(phrase, whitespace_phrase, number)
# Phrase goes to the right of the number
elif direction == 'right':
return six.u('{}{}{}').format(number, whitespace_phrase, phrase)
# Need to specify a direction, otherwise return naked number
else:
return safe_decode(number)
@classmethod
def phrase(cls, number, language, country=None):
num_type, phrase, props = cls.pick_phrase_and_type(number, language, country=country)
whitespace_default = num_type == cls.NUMERIC
return cls.combine_with_number(number, phrase, num_type, props, whitespace_default=whitespace_default)
class Number(NumericPhrase):
key = 'numbers'
dictionaries = ['number']
class NumberedComponent(object):
NUMERIC = 'numeric'
ALPHA = 'alpha'
ALPHA_PLUS_NUMERIC = 'alpha_plus_numeric'
NUMERIC_PLUS_ALPHA = 'numeric_plus_alpha'
HYPHENATED_NUMBER = 'hyphenated_number'
ROMAN_NUMERAL = 'roman_numeral'
@classmethod
def choose_alphanumeric_type(cls, key, language, country=None):
alphanumeric_props = address_config.get_property(key, language, country=country, default=None)
if alphanumeric_props is None:
return None, None
values = []
probs = []
for num_type in (cls.NUMERIC, cls.ALPHA, cls.ALPHA_PLUS_NUMERIC, cls.NUMERIC_PLUS_ALPHA, cls.HYPHENATED_NUMBER, cls.ROMAN_NUMERAL):
key = '{}_probability'.format(num_type)
prob = alphanumeric_props.get(key)
if prob is not None:
values.append(num_type)
probs.append(prob)
if not values:
return None, None
probs = cdf(probs)
num_type = weighted_choice(values, probs)
num_type_props = alphanumeric_props.get(num_type, {})
return num_type, num_type_props
@classmethod
def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
has_alpha = False
has_numeric = True
is_integer = False
is_none = False
if num is not None:
try:
num_int = int(num)
is_integer = True
except ValueError:
try:
num_float = float(num)
except ValueError:
tokens = tokenize(safe_decode(num))
has_numeric = False
for t, c in tokens:
if c == token_types.NUMERIC:
has_numeric = True
if any((ch.isalpha() for ch in t)):
has_alpha = True
if strict_numeric and has_alpha:
return safe_decode(num)
else:
is_none = True
values, probs = None, None
if is_alpha:
values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)
# Pick a phrase given the probability distribution from the config
if values is None:
values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)
if not values:
return safe_decode(num) if not is_none else None
phrase, phrase_props = weighted_choice(values, probs)
values = []
probs = []
# Dictionaries are lowercased, so title case here
if phrase_props.get('title_case', True):
phrase = phrase.title()
'''
There are a few ways we can express the number itself
1. Alias it as some standalone word like basement (for floor "-1")
2. Use the number itself, so "Floor 2"
3. Append/prepend an affix e.g. 2/F for second floor
4. As an ordinal expression e.g. "2nd Floor"
'''
have_standalone = False
have_null = False
for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
key = '{}_probability'.format(num_type)
prob = phrase_props.get(key)
if prob is not None:
if num_type == 'standalone':
have_standalone = True
elif num_type == 'null':
have_null = True
values.append(num_type)
probs.append(prob)
elif num_type in phrase_props:
values.append(num_type)
probs.append(1.0)
break
if not probs or is_none:
return phrase
# If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
if has_alpha:
values, probs = zip(*[(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')])
total = float(sum(probs))
if isclose(total, 0.0):
return None
probs = [p / total for p in probs]
probs = cdf(probs)
if len(values) < 2:
if have_standalone:
num_type = 'standalone'
elif have_null:
num_type = 'null'
else:
num_type = 'numeric'
else:
num_type = weighted_choice(values, probs)
if num_type == 'standalone':
return phrase
elif num_type == 'null':
return safe_decode(num)
props = phrase_props[num_type]
if is_integer:
num_int = int(num)
if phrase_props.get('number_abs_value', False):
num_int = abs(num_int)
num = num_int
if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
return None
if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
return None
if phrase_props.get('number_subtract_abs_value'):
num_int -= phrase_props['number_subtract_abs_value']
num = num_int
num = safe_decode(num)
digits_props = props.get('digits')
if digits_props:
# Inherit the gender and category e.g. for ordinals
for k in ('gender', 'category'):
if k in props:
digits_props[k] = props[k]
num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)
# Do we add the numeric phrase e.g. Floor No 1
add_number_phrase = props.get('add_number_phrase', False)
if add_number_phrase and random.random() < props['add_number_phrase_probability']:
num = Number.phrase(num, language, country=country)
whitespace_default = True
if num_type == 'numeric_affix':
phrase = props['affix']
if props.get('upper_case', True):
phrase = phrase.upper()
if 'zero_pad' in props and num.isdigit():
num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
whitespace_default = False
elif num_type == 'ordinal' and safe_decode(num).isdigit():
ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))
if ordinal_expression is not None:
num = ordinal_expression
if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
if random.random() < props['null_phrase_probability']:
return num
direction = props['direction']
whitespace = props.get('whitespace', whitespace_default)
whitespace_probability = props.get('whitespace_probability')
if whitespace_probability is not None:
whitespace = random.random() < whitespace_probability
# Occasionally switch up if direction_probability is specified
if random.random() > props.get('direction_probability', 1.0):
if direction == 'left':
direction = 'right'
elif direction == 'right':
direction = 'left'
whitespace_phrase = six.u(' ') if whitespace else six.u('')
# Phrase goes to the left of the number
if direction == 'left':
return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
# Phrase goes to the right of the number
elif direction == 'right':
return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
# Need to specify a direction, otherwise return naked number
else:
return safe_decode(num)
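if __name__ == '__main__':
    # Minimal sketch of Digits.rewrite: with spellout_probability=1.0 the digit
    # "3" may be rewritten as "Three" for English, assuming spellout rules exist
    # for 'en' (the original string is returned when they do not).
    print(Digits.rewrite(u'3', 'en', {'spellout_probability': 1.0}, num_type=Digits.CARDINAL))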

View File

@@ -0,0 +1,76 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import cdf, weighted_choice
class POBox(NumberedComponent):
@classmethod
def random_digits(cls, num_digits):
# Note: PO Boxes can have leading zeros, but that's not important for the parser
# since it only cares about how many digits there are in a number
low = 10 ** (num_digits - 1)
high = (10 ** num_digits) - 1
return random.randint(low, high)
@classmethod
def random_digits_with_prefix(cls, num_digits, prefix=six.u('')):
return six.u('').join([prefix, safe_decode(cls.random_digits(num_digits))])
@classmethod
def random_digits_with_suffix(cls, num_digits, suffix=six.u('')):
return six.u('').join([safe_decode(cls.random_digits(num_digits)), suffix])
@classmethod
def random_letter(cls, language, country=None):
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
return sample_alphabet(alphabet)
@classmethod
def random(cls, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('po_boxes.alphanumeric', language, country=country)
if num_type is None:
return None
if num_type != cls.ALPHA:
digit_config = address_config.get_property('po_boxes.digits', language, country=country, default=[])
values = []
probs = []
for val in digit_config:
values.append(val['length'])
probs.append(val['probability'])
probs = cdf(probs)
num_digits = weighted_choice(values, probs)
digits = cls.random_digits(num_digits)
number = Digits.rewrite(digits, language, num_type_props)
if num_type == cls.NUMERIC:
return safe_decode(number)
else:
letter = cls.random_letter(language, country=country)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
whitespace_phrase = six.u(' ') if whitespace_probability and random.random() < whitespace_probability else six.u('')
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
else:
return cls.random_letter(language, country=country)
@classmethod
def phrase(cls, box_number, language, country=None):
if box_number is None:
return None
return cls.numeric_phrase('po_boxes.alphanumeric', safe_decode(box_number), language,
dictionaries=['post_office'], country=country)
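if __name__ == '__main__':
    # Minimal sketch: samples a PO box number and renders a phrase such as
    # "PO Box 1234", assuming po_boxes.alphanumeric and po_boxes.digits are
    # configured for 'en' (None is returned otherwise).
    box = POBox.random('en', country='us')
    print(POBox.phrase(box, 'en', country='us'))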

View File

@@ -0,0 +1,11 @@
from geodata.addresses.numbering import NumberedComponent
from geodata.encoding import safe_decode
class PostCode(NumberedComponent):
@classmethod
def phrase(cls, postcode, language, country=None):
if postcode is None:
return None
return cls.numeric_phrase('postcodes.alphanumeric', postcode, language,
dictionaries=['postcodes'], country=country)

View File

@@ -0,0 +1,66 @@
import random
import six
from geodata.addresses.config import address_config
from geodata.configs.utils import nested_get
from geodata.addresses.directions import RelativeDirection
from geodata.addresses.floors import Floor
from geodata.addresses.numbering import NumberedComponent, Digits, sample_alphabet, latin_alphabet
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
class Staircase(NumberedComponent):
max_staircases = 10
staircase_range = range(1, max_staircases + 1)
staircase_range_probs = zipfian_distribution(len(staircase_range), 2.0)
staircase_range_cdf = cdf(staircase_range_probs)
@classmethod
def random(cls, language, country=None):
num_type, num_type_props = cls.choose_alphanumeric_type('staircases.alphanumeric', language, country=country)
if num_type is None:
return None
if num_type == cls.NUMERIC:
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
return safe_decode(number)
elif num_type == cls.HYPHENATED_NUMBER:
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
number2 = number + weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
return u'{}-{}'.format(number, number2)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet, 2.0)
if num_type == cls.ALPHA:
return safe_decode(letter)
else:
number = weighted_choice(cls.staircase_range, cls.staircase_range_cdf)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
whitespace_phrase = u''
r = random.random()
if r < whitespace_probability:
whitespace_phrase = u' '
elif r < (whitespace_probability + hyphen_probability):
whitespace_phrase = u'-'
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
@classmethod
def phrase(cls, staircase, language, country=None):
if staircase is None:
return None
return cls.numeric_phrase('staircases.alphanumeric', staircase, language,
dictionaries=['staircases'], country=country)
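if __name__ == '__main__':
    # Minimal sketch: samples a staircase identifier and renders a phrase such as
    # "Staircase 2", assuming staircases.alphanumeric is configured for 'en'
    # (None is returned otherwise).
    staircase = Staircase.random('en', country='us')
    print(Staircase.phrase(staircase, 'en', country='us'))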

View File

@@ -0,0 +1,285 @@
import itertools
import random
import six
from geodata.addresses.config import address_config
from geodata.addresses.directions import RelativeDirection, LateralDirection, AnteroposteriorDirection
from geodata.addresses.floors import Floor
from geodata.addresses.numbering import NumberedComponent, sample_alphabet, latin_alphabet
from geodata.configs.utils import nested_get
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice, zipfian_distribution, cdf
from geodata.text.utils import is_numeric_strict
class Unit(NumberedComponent):
# When we don't know the number of units, use a Zipfian distribution
# to choose randomly between 1 and max_units with 1 being much more
# likely than 2, etc.
max_units = 99
max_basements = 2
hundreds_numbered_units_tens = [range(101, 110) + [100],
range(201, 210) + [200],
range(301, 310) + [300],
range(401, 410) + [400],
range(501, 510) + [500],
]
hundreds_numbered_units = [range(110, 200),
range(210, 300),
range(310, 400),
range(410, 500),
range(510, 600),
]
thousands_numbered_units = [range(1001, 1030) + [1000],
range(2001, 2030) + [2000],
range(3001, 3030) + [3000],
range(4001, 4030) + [4000],
range(5001, 5030) + [5000]
]
numbered_units = range(1, 10)
numbered_units.extend(itertools.chain(*itertools.izip(*hundreds_numbered_units_tens)))
numbered_units.extend(range(10, 100))
numbered_units.extend(itertools.chain(*itertools.izip(*hundreds_numbered_units)))
numbered_units.extend(itertools.chain(*itertools.izip(*thousands_numbered_units)))
numbered_units.extend(range(10001, 10100) + [10000])
numbered_units.append(0)
numbered_units.extend(range(0, -max_basements - 1, -1))
unit_probs = zipfian_distribution(len(numbered_units), 0.7)
unit_probs_cdf = cdf(unit_probs)
num_digits = [2, 3, 4]
num_digits_probs = zipfian_distribution(len(num_digits), 4.0)
num_digits_cdf = cdf(num_digits_probs)
# For use with floors e.g. #301 more common than #389
positive_units_floors = range(1, 10) + [0] + range(10, max_units + 1)
positive_units_floors_probs = zipfian_distribution(len(positive_units_floors), 0.6)
positive_units_floors_cdf = cdf(positive_units_floors_probs)
# For basic positive units
positive_units = range(1, max_units + 1)
positive_units_probs = zipfian_distribution(len(positive_units), 0.6)
positive_units_cdf = cdf(positive_units_probs)
# For use with letters e.g. A0 less common
positive_units_letters = range(1, max_units + 1) + [0]
positive_units_letters_probs = zipfian_distribution(len(positive_units_letters), 0.6)
positive_units_letters_cdf = cdf(positive_units_letters_probs)
RESIDENTIAL = 'residential'
COMMERCIAL = 'commercial'
INDUSTRIAL = 'industrial'
UNIVERSITY = 'university'
@classmethod
def sample_num_digits(cls):
return weighted_choice(cls.num_digits, cls.num_digits_cdf)
@classmethod
def for_floor(cls, floor_number, num_digits=None):
num_digits = num_digits if num_digits is not None else cls.sample_num_digits()
unit = weighted_choice(cls.positive_units_floors, cls.positive_units_floors_cdf)
return six.u('{}{}').format(floor_number, safe_decode(unit).zfill(num_digits))
@classmethod
def random(cls, language, country=None, num_floors=None, num_basements=None, floor=None):
num_type, num_type_props = cls.choose_alphanumeric_type('units.alphanumeric', language, country=country)
if num_type is None:
return None
use_floor_prob = address_config.get_property('units.alphanumeric.use_floor_probability', language, country=country, default=0.0)
use_positive_numbers_prob = address_config.get_property('units.alphanumeric.use_positive_numbers_probability', language, country=country, default=0.0)
if (num_floors is None and floor is None) or random.random() >= use_floor_prob:
if random.random() >= use_positive_numbers_prob:
number = weighted_choice(cls.numbered_units, cls.unit_probs_cdf)
else:
number = weighted_choice(cls.positive_units, cls.positive_units_cdf)
else:
if floor is None or not floor.isdigit():
floor = Floor.random_int(language, country=country, num_floors=num_floors, num_basements=num_basements)
floor_numbering_starts_at = address_config.get_property('levels.numbering_starts_at', language, country=country, default=0)
ground_floor_starts_at = address_config.get_property('units.alphanumeric.use_floor_ground_starts_at', language, country=country, default=None)
if ground_floor_starts_at is not None:
try:
floor = int(floor)
if floor >= floor_numbering_starts_at:
floor -= floor_numbering_starts_at
floor += ground_floor_starts_at
floor = safe_decode(floor)
except (TypeError, ValueError):
pass
use_floor_affix_prob = address_config.get_property('units.alphanumeric.use_floor_numeric_affix_probability', language, country=country, default=0.0)
if use_floor_affix_prob and random.random() < use_floor_affix_prob:
floor_phrase = Floor.phrase(floor, language, country=country)
# Only proceed if the floor phrase is strictly numeric, e.g. "1" but not "H1"
if is_numeric_strict(floor_phrase):
unit = weighted_choice(cls.positive_units, cls.positive_units_cdf)
unit_num_digits = address_config.get_property('units.alphanumeric.use_floor_unit_num_digits', language, country=country, default=None)
if unit_num_digits is not None:
unit = safe_decode(unit).zfill(unit_num_digits)
return six.u('{}{}').format(floor_phrase, unit)
floor_num_digits = address_config.get_property('units.alphanumeric.use_floor_floor_num_digits', language, country=country, default=None)
if floor_num_digits is not None and floor.isdigit():
floor = floor.zfill(floor_num_digits)
number = cls.for_floor(floor)
if num_type == cls.NUMERIC:
return safe_decode(number)
elif num_type == cls.HYPHENATED_NUMBER:
number2 = weighted_choice(cls.positive_units, cls.positive_units_cdf)
range_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.range_probability', language, country=country, default=0.5))
direction = address_config.get_property('units.alphanumeric.hyphenated_number.direction', language, country=country, default='right')
direction_prob = float(address_config.get_property('units.alphanumeric.hyphenated_number.direction_probability', language, country=country, default=0.0))
if random.random() < direction_prob:
direction = 'left' if direction == 'right' else 'right'
direction_right = direction == 'right'
if random.random() < range_prob:
if direction_right:
number2 += number
else:
number2 = max(0, number - number2)
if direction == 'right':
return u'{}-{}'.format(number, number2)
else:
return u'{}-{}'.format(number2, number)
else:
alphabet = address_config.get_property('alphabet', language, country=country, default=latin_alphabet)
alphabet_probability = address_config.get_property('alphabet_probability', language, country=country, default=None)
if alphabet_probability is not None and random.random() >= alphabet_probability:
alphabet = latin_alphabet
letter = sample_alphabet(alphabet)
if num_type == cls.ALPHA:
return safe_decode(letter)
else:
if num_floors is None:
number = weighted_choice(cls.positive_units_letters, cls.positive_units_letters_cdf)
whitespace_probability = float(num_type_props.get('whitespace_probability', 0.0))
hyphen_probability = float(num_type_props.get('hyphen_probability', 0.0))
whitespace_phrase = u''
r = random.random()
if r < whitespace_probability:
whitespace_phrase = u' '
elif r < (whitespace_probability + hyphen_probability):
whitespace_phrase = u'-'
if num_type == cls.ALPHA_PLUS_NUMERIC:
return six.u('{}{}{}').format(letter, whitespace_phrase, number)
elif num_type == cls.NUMERIC_PLUS_ALPHA:
return six.u('{}{}{}').format(number, whitespace_phrase, letter)
@classmethod
def add_direction(cls, key, unit, language, country=None):
add_direction_probability = address_config.get_property('{}.add_direction_probability'.format(key),
language, country=country, default=0.0)
if not random.random() < add_direction_probability:
return unit
add_direction_numeric = address_config.get_property('{}.add_direction_numeric'.format(key),
language, country=country)
try:
unit = int(unit)
integer_unit = True
except (ValueError, TypeError):
integer_unit = False
if add_direction_numeric and integer_unit:
return RelativeDirection.phrase(unit, language, country=country)
elif not integer_unit:
add_direction_standalone = address_config.get_property('{}.add_direction_standalone'.format(key),
language, country=country)
if add_direction_standalone:
return RelativeDirection.phrase(None, language, country=country)
@classmethod
def add_quadrant(cls, key, unit, language, country=None):
add_quadrant_probability = address_config.get_property('{}.add_quadrant_probability'.format(key),
language, country=country, default=0.0)
if not random.random() < add_quadrant_probability:
return unit
add_quadrant_numeric = address_config.get_property('{}.add_quadrant_numeric'.format(key),
language, country=country)
try:
unit = int(unit)
integer_unit = True
except (ValueError, TypeError):
integer_unit = False
first_direction = address_config.get_property('{}.add_quadrant_first_direction'.format(key),
language, country=country)
if first_direction == 'lateral':
ordering = (LateralDirection, AnteroposteriorDirection)
elif first_direction == 'anteroposterior':
ordering = (AnteroposteriorDirection, LateralDirection)
else:
return unit
if not integer_unit:
add_quadrant_standalone = address_config.get_property('{}.add_quadrant_standalone'.format(key),
language, country=country)
if add_quadrant_standalone:
unit = None
else:
return None
last_num_type = None
for i, c in enumerate(ordering):
num_type, phrase, props = c.pick_phrase_and_type(unit, language, country=country)
whitespace_default = num_type == c.NUMERIC or last_num_type == c.NUMERIC
unit = c.combine_with_number(unit, phrase, num_type, props, whitespace_default=whitespace_default)
last_num_type = num_type
return unit
@classmethod
def phrase(cls, unit, language, country=None, zone=None):
if unit is not None:
key = 'units.alphanumeric' if zone is None else 'units.zones.{}'.format(zone)
if not address_config.get_property(key, language, country=country):
return None
is_alpha = safe_decode(unit).isalpha()
direction_unit = None
add_direction = address_config.get_property('{}.add_direction'.format(key), language, country=country)
if add_direction:
direction_unit = cls.add_direction(key, unit, language, country=country)
if direction_unit and direction_unit != unit:
unit = direction_unit
is_alpha = False
else:
add_quadrant = address_config.get_property('{}.add_quadrant'.format(key), language, country=country)
if add_quadrant:
unit = cls.add_quadrant(key, unit, language, country=country)
is_alpha = False
return cls.numeric_phrase(key, safe_decode(unit), language,
dictionaries=['unit_types_numbered'], country=country, is_alpha=is_alpha)
else:
key = 'units.standalone'
values, probs = address_config.alternative_probabilities(key, language,
dictionaries=['unit_types_standalone'],
country=country)
if values is None:
return None
phrase, phrase_props = weighted_choice(values, probs)
return phrase.title()
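if __name__ == '__main__':
    # Minimal sketch: samples a unit for a hypothetical 10-storey building and
    # renders it, e.g. "Apt 302", assuming units.alphanumeric is configured for
    # 'en' (both calls return None when it is not).
    unit = Unit.random('en', country='us', num_floors=10)
    print(Unit.phrase(unit, 'en', country='us'))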

View File

View File

@@ -0,0 +1,167 @@
import os
import random
import re
import six
import yaml
from collections import defaultdict
from geodata.configs.utils import nested_get, DoesNotExist, alternative_probabilities
from geodata.encoding import safe_decode
from geodata.math.floats import isclose
from geodata.math.sampling import cdf, weighted_choice
from geodata.encoding import safe_encode
this_dir = os.path.realpath(os.path.dirname(__file__))
BOUNDARY_NAMES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'boundaries', 'names')
BOUNDARY_NAMES_CONFIG = os.path.join(BOUNDARY_NAMES_DIR, 'global.yaml')
class BoundaryNames(object):
DEFAULT_NAME_KEY = 'name'
def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
config = yaml.load(open(config_file))
default_names = nested_get(config, ('names', 'keys'))
name_keys, probs = alternative_probabilities(default_names)
self.name_keys = name_keys
self.name_key_probs = cdf(probs)
self.component_name_keys = {}
for component, component_config in six.iteritems(nested_get(config, ('names', 'components'), default={})):
component_names = component_config.get('keys')
component_name_keys, component_probs = alternative_probabilities(component_names)
self.component_name_keys[component] = (component_name_keys, cdf(component_probs))
self.country_regex_replacements = defaultdict(list)
for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
country = props.get('country')
re_flags = re.I | re.UNICODE
if not props.get('case_insensitive', True):
re_flags ^= re.I
pattern = re.compile(props['pattern'], re_flags)
replace_group = props['replace_with_group']
replace_probability = props['replace_probability']
self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))
self.country_regex_replacements = dict(self.country_regex_replacements)
self.prefixes = {}
self.prefix_regexes = {}
self.suffixes = {}
self.suffix_regexes = {}
for language, components in six.iteritems(nested_get(config, ('names', 'prefixes', 'language'), default={}) ):
for component, affixes in six.iteritems(components):
affix_values, probs = alternative_probabilities(affixes)
for val in affix_values:
if 'prefix' not in val:
raise AssertionError(six.u('Invalid prefix value for (language={}, component={}): {} ').format(language, component, val))
prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values])
self.prefix_regexes[(language, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U)
if not isclose(sum(probs), 1.0):
affix_values.append(None)
probs.append(1.0 - sum(probs))
affix_probs_cdf = cdf(probs)
self.prefixes[(language, component)] = affix_values, affix_probs_cdf
for language, components in six.iteritems(nested_get(config, ('names', 'suffixes', 'language'), default={}) ):
for component, affixes in six.iteritems(components):
affix_values, probs = alternative_probabilities(affixes)
for val in affix_values:
if 'suffix' not in val:
raise AssertionError(six.u('Invalid suffix value for (language={}, component={}): {} ').format(language, component, val))
suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values])
self.suffix_regexes[(language, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U)
if not isclose(sum(probs), 1.0):
affix_values.append(None)
probs.append(1.0 - sum(probs))
affix_probs_cdf = cdf(probs)
self.suffixes[(language, component)] = affix_values, affix_probs_cdf
self.exceptions = {}
for props in nested_get(config, ('names', 'exceptions'), default=[]):
object_type = props['type']
object_id = safe_encode(props['id'])
keys = [props['default']]
probs = [props['probability']]
for alt in props.get('alternatives', []):
keys.append(alt['alternative'])
probs.append(alt['probability'])
probs = cdf(probs)
self.exceptions[(object_type, object_id)] = (keys, probs)
def _string_as_regex(self, s):
return safe_decode(s).replace(six.u('.'), six.u('\\.'))
def valid_name(self, object_type, object_id, name):
exceptions, probs = self.exceptions.get((object_type, object_id), ((), ()))
return not exceptions or name in exceptions
def name_key_dist(self, props, component):
object_type = props.get('type')
object_id = safe_encode(props.get('id', ''))
if (object_type, object_id) in self.exceptions:
values, probs = self.exceptions[(object_type, object_id)]
return values, probs
name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
return name_keys, probs
def name_key(self, props, component):
name_keys, probs = self.name_key_dist(props, component)
return weighted_choice(name_keys, probs)
def name(self, country, language, component, name):
all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, [])
prefixes, prefix_probs = self.prefixes.get((language, component), (None, None))
suffixes, suffix_probs = self.suffixes.get((language, component), (None, None))
if not all_replacements and not prefixes and not suffixes:
return name
for regex, group, prob in all_replacements:
match = regex.match(name)
if match and random.random() < prob:
name = match.group(group)
for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0),
(suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)):
if affixes is not None:
regex = regexes[language, component]
if regex.match(name):
continue
affix = weighted_choice(affixes, affix_probs)
if affix is not None:
whitespace = affix.get('whitespace', True)
space_val = six.u(' ') if whitespace else six.u('')
affix = affix[key]
if direction == 0:
return six.u('{}{}{}').format(affix, space_val, safe_decode(name))
else:
return six.u('{}{}{}').format(safe_decode(name), space_val, affix)
return name
boundary_names = BoundaryNames()
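if __name__ == '__main__':
    # Minimal sketch: applies any configured regex replacements and prefix/suffix
    # sampling to a boundary name; with no rules for 'en' city names the input is
    # returned unchanged. The country/language/component values are illustrative.
    print(boundary_names.name('us', 'en', 'city', u'New York'))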

View File

View File

@@ -0,0 +1,72 @@
import csv
import os
import six
import random
import sys
from collections import defaultdict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode
CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'categories')
class CategoryConfig(object):
def __init__(self, base_dir=CATEGORIES_DIR):
self.language_categories_singular = {}
self.language_categories_plural = {}
self.language_property_names = defaultdict(set)
if not os.path.exists(base_dir):
raise RuntimeError('{} does not exist'.format(base_dir))
for filename in os.listdir(base_dir):
if not filename.endswith('.tsv'):
continue
lang = filename.rsplit('.tsv')[0]
base_lang = lang.split('_')[0]
singular_rules = self.language_categories_singular.get(base_lang, defaultdict(list))
plural_rules = self.language_categories_plural.get(base_lang, defaultdict(list))
reader = csv.reader(open(os.path.join(CATEGORIES_DIR, filename)), delimiter='\t')
reader.next() # headers
for key, value, is_plural, phrase in reader:
self.language_property_names[lang].add(key)
is_plural = bool(int(is_plural))
if is_plural:
plural_rules[(key, value)].append(phrase)
else:
singular_rules[(key, value)].append(phrase)
self.language_categories_singular[base_lang] = singular_rules
self.language_categories_plural[base_lang] = plural_rules
self.language_categories_singular = {key: dict(value) for key, value
in six.iteritems(self.language_categories_singular)}
self.language_categories_plural = {key: dict(value) for key, value
in six.iteritems(self.language_categories_plural)}
def has_keys(self, language, keys):
prop_names = self.language_property_names.get(language, set())
return [k for k in keys if k in prop_names]
def get_phrase(self, language, key, value, is_plural=False):
config = self.language_categories_singular if not is_plural else self.language_categories_plural
if language not in config:
return None
language_config = config[language]
choices = language_config.get((key, value))
if not choices:
return None
return random.choice(choices)
category_config = CategoryConfig()
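if __name__ == '__main__':
    # Minimal sketch: looks up a plural phrase for amenity=restaurant, assuming an
    # en.tsv file exists under resources/categories (None is returned otherwise).
    print(category_config.get_phrase('en', 'amenity', 'restaurant', is_plural=True))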

View File

@@ -0,0 +1,31 @@
from geodata.addresses.config import address_config
from geodata.categories.config import category_config
from geodata.math.sampling import weighted_choice, cdf
class CategoryPreposition(object):
NEAR = 'near'
NEARBY = 'nearby'
NEAR_ME = 'near_me'
IN = 'in'
NULL = 'null'
@classmethod
def random(cls, language, country=None):
category_props = address_config.get_property('categories', language, country=country)
if category_props is None:
return None
values = []
probs = []
for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
k = '{}_probability'.format(prep_phrase_type)
prob = category_props.get(k, None)
if prob is not None:
values.append(prep_phrase_type)
probs.append(prob)
probs = cdf(probs)
return weighted_choice(values, probs)

View File

@@ -0,0 +1,38 @@
from collections import namedtuple
from geodata.addresses.config import address_config
from geodata.categories.config import category_config
from geodata.categories.preposition import CategoryPreposition
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice
CategoryQuery = namedtuple('CategoryQuery', 'category, prep, add_place_name, add_address')
NULL_CATEGORY_QUERY = CategoryQuery(None, None, False, False)
class Category(object):
@classmethod
def phrase(cls, language, key, value, is_plural=False, country=None):
category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
if not category_phrase:
return NULL_CATEGORY_QUERY
category_phrase = safe_decode(category_phrase)
prep_phrase_type = CategoryPreposition.random(language, country=country)
if prep_phrase_type in (None, CategoryPreposition.NULL):
return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
if not values:
return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
prep_phrase, prep_phrase_props = weighted_choice(values, probs)
prep_phrase = safe_decode(prep_phrase)
add_address = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
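if __name__ == '__main__':
    # Minimal sketch: builds a category query for amenity=restaurant, e.g.
    # ("restaurants", "in", ...), assuming the 'en' category and preposition
    # configs exist; otherwise NULL_CATEGORY_QUERY is returned.
    print(Category.phrase('en', 'amenity', 'restaurant', is_plural=True, country='us'))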

View File

@@ -0,0 +1,125 @@
'''
scrape_nominatim_special_phrases.py
-----------------------------------
Simple script to scrape https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases
for category-related phrases sometimes found in geocoder input.
Populates a per-language TSV with (phrase, OSM key, OSM value, plural):
OSM keys/values are like:
amenity=restaurant
tourism=museum
shop=books
Using these phrases, it is possible to construct queries like "restaurants in Brooklyn"
'''
import csv
import os
import re
import requests
import six
import sys
import time
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode, safe_encode
DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'categories')
# Use Special:Export to get wiki markup
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')
phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
wiki_link_re = re.compile('(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')
IGNORE_LANGUAGES = {
# Interlingua
'ia'
}
IGNORE_PLURAL_LANGUAGES = {
# For Japanese, seems to just put an s on the end, which doesn't seem right
# Need input from a native speaker on that one
'ja',
}
# Wait this many seconds between page fetches
POLITENESS_DELAY = 5.0
def scrape_nominatim_category_page(url, ignore_plurals=False):
result = requests.get(url)
if not result or not result.content:
return
for phrase, key, value, operator, plural in phrase_table_re.findall(result.content):
if operator and operator != '-':
continue
is_plural = plural == 'Y'
if is_plural and ignore_plurals:
continue
yield safe_decode(phrase).lower(), key, value, is_plural
def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
print('Fetching main page')
result = requests.get(url)
languages = {}
if not result or not result.content:
return languages
time.sleep(POLITENESS_DELAY)
for entity, anchor_text in wiki_link_re.findall(result.content):
if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
continue
lang = entity.rstrip('/').rsplit('/')[-1].lower()
if lang in IGNORE_LANGUAGES:
continue
link = WIKI_BASE_URL + entity.replace(' ', '_')
ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES
print('Doing {}'.format(lang))
phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
time.sleep(POLITENESS_DELAY)
if not phrases:
continue
languages[lang] = phrases
return languages
def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
languages = scrape_all_nominatim_category_pages(url=url)
for lang, phrases in six.iteritems(languages):
filename = os.path.join(output_dir, '{}.tsv'.format(lang.lower()))
with open(filename, 'w') as f:
writer = csv.writer(f, delimiter='\t')
writer.writerow(('key', 'value', 'is_plural', 'phrase'))
for phrase, key, value, is_plural in phrases:
writer.writerow((safe_encode(key), safe_encode(value),
str(int(is_plural)), safe_encode(phrase)))
print('Done')
if __name__ == '__main__':
main()

View File

View File

@@ -0,0 +1,23 @@
if [ "$#" -ge 1 ]; then
DATA_DIR=$1
else
DATA_DIR=$(pwd)
fi
ORIG_DIR=$(pwd)
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
python $SCRIPT_DIR/chains_tsv.py $DATA_DIR/planet-venues.osm $DATA_DIR/chains.tsv
cd $DATA_DIR
split -d -C524200 chains.tsv chains.split.
for filename in chains.split.*; do
extension="${filename##*.0}"
name="${filename%%.*}"
echo -e "name_lower\tname\tcanonical\tknown_chain\tcount" | cat - $filename > /tmp/out
mv /tmp/out $name.$extension.tsv
rm $filename
done
cd $ORIG_DIR

View File

@@ -0,0 +1,78 @@
import csv
import os
import glob
import six
import sys
from collections import defaultdict
from collections import Counter
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.address_expansions.address_dictionaries import ADDRESS_EXPANSIONS_DIR
from geodata.osm.extract import *
from geodata.encoding import safe_encode, safe_decode
class VenueNames(object):
def __init__(self, venues_filename):
self.venues_filename = venues_filename
self.all_chains = set()
self.chain_canonical = {}
for filename in glob.glob(os.path.join(ADDRESS_EXPANSIONS_DIR, '**', 'chains.txt')):
f = open(filename)
for line in f:
line = line.rstrip()
phrases = safe_decode(line).split(six.u('|'))
self.all_chains |= set(phrases)
canonical = phrases[0]
for p in phrases[1:]:
self.chain_canonical[p] = canonical
self.names = Counter()
self.names_lower = Counter()
self.names_cap = defaultdict(Counter)
def count(self):
i = 0
for node_id, value, deps in parse_osm(self.venues_filename):
name = value.get('name')
if not name:
continue
self.names[name] += 1
self.names_lower[name.lower()] += 1
self.names_cap[name.lower()][name] += 1
if i % 1000 == 0 and i > 0:
print('did {}'.format(i))
i += 1
def write_to_tsv(self, out_filename, min_threshold=5):
writer = csv.writer(open(out_filename, 'w'), delimiter='\t')
for k, v in self.names_lower.most_common():
if v < min_threshold:
break
canonical = self.chain_canonical.get(k)
if canonical:
canonical = self.names_cap[canonical].most_common(1)[0][0]
else:
canonical = ''
most_common_cap = self.names_cap[k].most_common(1)[0][0]
writer.writerow((safe_encode(k),
safe_encode(most_common_cap),
safe_encode(canonical),
safe_encode(1) if k in self.all_chains else '',
safe_encode(v)))
if __name__ == '__main__':
if len(sys.argv) < 3:
print('Usage: python chains_tsv.py infile outfile')
sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
names = VenueNames(input_file)
names.count()
names.write_to_tsv(output_file)

View File

@@ -0,0 +1,100 @@
import random
import six
from collections import namedtuple
from geodata.addresses.config import address_config
from geodata.address_expansions.gazetteers import chains_gazetteer
from geodata.categories.config import category_config
from geodata.categories.preposition import CategoryPreposition
from geodata.math.sampling import weighted_choice, cdf
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize, token_types
from geodata.encoding import safe_decode
ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name, add_address')
NULL_CHAIN_QUERY = ChainQuery(None, None, False, False)
class Chain(object):
@classmethod
def tokenize_name(cls, name):
if not name:
return []
tokens = normalized_tokens(name)
return tokens
@classmethod
def possible_chain(cls, name):
'''
Determines if a venue name contains the name of a known chain store.
Returns a tuple of:
(True/False, known chain phrases, other tokens)
Handles cases like "Hard Rock Cafe Times Square" and allows for downstream
decision making (i.e. if the tokens have a low IDF in the local area we might
want to consider it a chain).
'''
tokens = cls.tokenize_name(name)
if not tokens:
return False, [], []
matches = chains_gazetteer.filter(tokens)
other_tokens = []
phrases = []
for t, c, l, d in matches:
if c == token_types.PHRASE:
phrases.append((t, c, l, d))
else:
other_tokens.append((t, c))
return len(phrases) > 0, phrases, other_tokens if len(phrases) > 0 else []
@classmethod
def extract(cls, name):
'''
Determines if an entire venue name matches a known chain store.
Note: to avoid false positives, only return True if all of the tokens
in the venue's name are part of a single chain store phrase. This will
miss a few things like "Hard Rock Cafe Times Square" and the like.
It will however handle compound chain stores like Subway/Taco Bell
'''
possible, phrases, other_tokens = cls.possible_chain(name)
is_chain = possible and not any((c in token_types.WORD_TOKEN_TYPES for t, c in other_tokens))
return is_chain, phrases if is_chain else []
@classmethod
def alternate_form(cls, language, dictionary, canonical):
choices = address_config.sample_phrases.get((language, dictionary), {}).get(canonical)
if not choices:
return canonical
return random.choice(choices)
@classmethod
def phrase(cls, chain, language, country=None):
if not chain:
return NULL_CHAIN_QUERY
chain_phrase = safe_decode(chain)
prep_phrase_type = CategoryPreposition.random(language, country=country)
if prep_phrase_type in (None, CategoryPreposition.NULL):
return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)
values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
if not values:
return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)
prep_phrase, prep_phrase_props = weighted_choice(values, probs)
prep_phrase = safe_decode(prep_phrase)
add_address = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
return ChainQuery(chain_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
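# A minimal usage sketch (hypothetical venue name; requires the chains gazetteer and
# address config data to be loaded, so this is illustrative rather than a test):
if __name__ == '__main__':
    is_chain, phrases = Chain.extract(u'Dunkin Donuts')
    if is_chain:
        query = Chain.phrase(u'Dunkin Donuts', 'en', country='us')
        print(query)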

View File

View File

@@ -0,0 +1,61 @@
import six
from collections import Mapping
def recursive_merge(a, b):
for k, v in six.iteritems(b):
if isinstance(v, Mapping) and v:
existing = a.get(k, v)
merged = recursive_merge(existing, v)
a[k] = merged
else:
a[k] = b[k]
return a
class DoesNotExist:
pass
def nested_get(obj, keys, default=DoesNotExist):
if len(keys) == 0:
return obj
try:
for key in keys[:-1]:
obj = obj.get(key, {})
if not hasattr(obj, 'items'):
return default
key = keys[-1]
return obj.get(key, default)
except AttributeError:
return default
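# Illustrative examples of the helpers above (made-up dicts):
#   recursive_merge({'x': {'y': 1}}, {'x': {'z': 2}})   -> {'x': {'y': 1, 'z': 2}}
#   nested_get({'a': {'b': {'c': 1}}}, ('a', 'b', 'c')) -> 1
#   nested_get({'a': {'b': {}}}, ('a', 'b', 'c'), None) -> None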
def alternative_probabilities(properties):
if properties is None:
return None, None
probs = []
alternatives = []
if 'probability' in properties:
prob = properties['probability']
props = properties['default']
probs.append(prob)
alternatives.append(props)
elif 'alternatives' not in properties and 'default' in properties:
prob = 1.0
props = properties['default']
probs.append(prob)
alternatives.append(props)
elif 'alternatives' not in properties and 'default' not in properties:
return None, None
alts = properties.get('alternatives', [])
for alt in alts:
prob = alt.get('probability', 1.0 / len(alts))
props = alt['alternative']
probs.append(prob)
alternatives.append(props)
return alternatives, probs
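# A sketch of the expected config shape (hypothetical values, not from the real configs):
#   props = {'default': {'phrase': 'near'}, 'probability': 0.6,
#            'alternatives': [{'alternative': {'phrase': 'close to'}, 'probability': 0.4}]}
#   alternative_probabilities(props)
#     -> ([{'phrase': 'near'}, {'phrase': 'close to'}], [0.6, 0.4])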

View File

View File

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
'''
geodata.coordinates.conversion
------------------------------
Geographic coordinates typically come in two flavors: decimal and
DMS (degree-minute-second). This module parses a coordinate string
in just about any format. This was originally created for parsing
lat/lons found on the web.
Usage:
>>> latlon_to_decimal('40°42′46″N', '74°00′21″W') # returns (40.71277777777778, 74.00583333333333)
>>> latlon_to_decimal('40,74 N', '74,001 W') # returns (40.74, -74.001)
>>> to_valid_longitude(360.0)
>>> latitude_is_valid(90.0)
'''
import math
import re
from geodata.encoding import safe_decode
from geodata.math.floats import isclose
beginning_re = re.compile('^[^0-9\-]+', re.UNICODE)
end_re = re.compile('[^0-9]+$', re.UNICODE)
latitude_dms_regex = re.compile(ur'^(-?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(N|n|S|s)?$', re.I | re.UNICODE)
longitude_dms_regex = re.compile(ur'^(-?1[0-8][0-9]|0?[0-9]{1,2})[ ]*[ :°ºd][ ]*([0-5]?[0-9])?[ ]*[:\'\u2032m]?[ ]*([0-5]?[0-9](?:\.\d+)?)?[ ]*[:\?\"\u2033s]?[ ]*(E|e|W|w)?$', re.I | re.UNICODE)
latitude_decimal_with_direction_regex = re.compile('^(-?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(N|n|S|s)$', re.I)
longitude_decimal_with_direction_regex = re.compile('^(-?1[0-8][0-9]|0?[0-9][0-9](?:\.[0-9]+))[ ]*[ :°ºd]?[ ]*(E|e|W|w)$', re.I)
direction_sign_map = {'n': 1, 's': -1, 'e': 1, 'w': -1}
def direction_sign(d):
if d is None:
return 1
d = d.lower().strip()
if d in direction_sign_map:
return direction_sign_map[d]
else:
raise ValueError('Invalid direction: {}'.format(d))
def int_or_float(d):
try:
return int(d)
except ValueError:
return float(d)
def degrees_to_decimal(degrees, minutes, seconds):
degrees = int_or_float(degrees)
minutes = int_or_float(minutes)
seconds = int_or_float(seconds)
return degrees + (minutes / 60.0) + (seconds / 3600.0)
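# Worked example: 40°42′46″ is degrees_to_decimal(40, 42, 46)
#   = 40 + 42/60.0 + 46/3600.0 ≈ 40.712778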
def is_valid_latitude(latitude):
'''Latitude must be real number between -90.0 and 90.0'''
try:
latitude = float(latitude)
except (ValueError, TypeError):
return False
if latitude > 90.0 or latitude < -90.0 or math.isinf(latitude) or math.isnan(latitude):
return False
return True
def is_valid_longitude(longitude):
'''Allow any valid real number to be a longitude'''
try:
longitude = float(longitude)
except (ValueError, TypeError):
return False
return not math.isinf(longitude) and not math.isnan(longitude)
def to_valid_latitude(latitude):
'''Nudge latitudes of exactly ±90.0 slightly toward zero so they remain usable downstream'''
if not is_valid_latitude(latitude):
raise ValueError('Invalid latitude {}'.format(latitude))
if isclose(latitude, 90.0):
latitude = 89.9999
elif isclose(latitude, -90.0):
latitude = -89.9999
return latitude
def to_valid_longitude(longitude):
'''Convert longitude into the -180 to 180 scale'''
if not is_valid_longitude(longitude):
raise ValueError('Invalid longitude {}'.format(longitude))
while longitude <= -180.0:
longitude += 360.0
while longitude > 180.0:
longitude -= 360.0
return longitude
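# Examples of the wrapping behavior above:
#   to_valid_longitude(190.0)  -> -170.0
#   to_valid_longitude(360.0)  -> 0.0
#   to_valid_longitude(-200.0) -> 160.0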
def latlon_to_decimal(latitude, longitude):
have_lat = False
have_lon = False
latitude = safe_decode(latitude).strip(u' ,;|')
longitude = safe_decode(longitude).strip(u' ,;|')
latitude = latitude.replace(u',', u'.')
longitude = longitude.replace(u',', u'.')
lat_dms = latitude_dms_regex.match(latitude)
lat_dir = latitude_decimal_with_direction_regex.match(latitude)
if lat_dms:
d, m, s, c = lat_dms.groups()
sign = direction_sign(c)
latitude = degrees_to_decimal(d or 0, m or 0, s or 0)
have_lat = True
elif lat_dir:
d, c = lat_dir.groups()
sign = direction_sign(c)
latitude = int_or_float(d) * sign
have_lat = True
else:
latitude = re.sub(beginning_re, u'', latitude)
latitude = re.sub(end_re, u'', latitude)
lon_dms = longitude_dms_regex.match(longitude)
lon_dir = longitude_decimal_with_direction_regex.match(longitude)
if lon_dms:
d, m, s, c = lon_dms.groups()
sign = direction_sign(c)
longitude = degrees_to_decimal(d or 0, m or 0, s or 0)
have_lon = True
elif lon_dir:
d, c = lon_dir.groups()
sign = direction_sign(c)
longitude = int_or_float(d) * sign
have_lon = True
else:
longitude = re.sub(beginning_re, u'', longitude)
longitude = re.sub(end_re, u'', longitude)
latitude = float(latitude)
longitude = float(longitude)
if not is_valid_latitude(latitude):
raise ValueError('Invalid latitude: {}'.format(latitude))
if not is_valid_longitude(longitude):
raise ValueError('Invalid longitude: {}'.format(longitude))
latitude = to_valid_latitude(latitude)
longitude = to_valid_longitude(longitude)
return latitude, longitude

View File

View File

@@ -0,0 +1,262 @@
import pycountry
class Countries(object):
AFGHANISTAN = 'af'
ALAND_ISLANDS = 'ax'
ALBANIA = 'al'
ALGERIA = 'dz'
AMERICAN_SAMOA = 'as'
ANDORRA = 'ad'
ANGOLA = 'ao'
ANGUILLA = 'ai'
ANTARCTICA = 'aq'
ANTIGUA_AND_BARBUDA = 'ag'
ARGENTINA = 'ar'
ARMENIA = 'am'
ARUBA = 'aw'
AUSTRALIA = 'au'
AUSTRIA = 'at'
AZERBAIJAN = 'az'
BAHAMAS = 'bs'
BAHRAIN = 'bh'
BANGLADESH = 'bd'
BARBADOS = 'bb'
BELARUS = 'by'
BELGIUM = 'be'
BELIZE = 'bz'
BENIN = 'bj'
BERMUDA = 'bm'
BHUTAN = 'bt'
BOLIVIA = 'bo'
BONAIRE = 'bq'
BOSNIA_AND_HERZEGOVINA = 'ba'
BOTSWANA = 'bw'
BOUVET_ISLAND = 'bv'
BRAZIL = 'br'
BRITISH_INDIAN_OCEAN_TERRITORY = 'io'
BRITISH_VIRGIN_ISLANDS = 'vg'
BRUNEI_DARUSSALAM = 'bn'
BULGARIA = 'bg'
BURKINA_FASO = 'bf'
BURUNDI = 'bi'
CAMBODIA = 'kh'
CAMEROON = 'cm'
CANADA = 'ca'
CAPE_VERDE = 'cv'
CAYMAN_ISLANDS = 'ky'
CENTRAL_AFRICAN_REPUBLIC = 'cf'
CHAD = 'td'
CHILE = 'cl'
CHINA = 'cn'
CHRISTMAS_ISLAND = 'cx'
COCOS_KEELING_ISLANDS = 'cc'
COLOMBIA = 'co'
COMOROS = 'km'
COOK_ISLANDS = 'ck'
COSTA_RICA = 'cr'
COTE_DIVOIRE = 'ci'
CROATIA = 'hr'
CUBA = 'cu'
CURACAO = 'cw'
CYPRUS = 'cy'
CZECH_REPUBLIC = 'cz'
DENMARK = 'dk'
DEMOCRATIC_REPUBLIC_OF_THE_CONGO = 'cd'
DJIBOUTI = 'dj'
DOMINICA = 'dm'
DOMINICAN_REPUBLIC = 'do'
ECUADOR = 'ec'
EGYPT = 'eg'
EL_SALVADOR = 'sv'
EQUATORIAL_GUINEA = 'gq'
ERITREA = 'er'
ESTONIA = 'ee'
ETHIOPIA = 'et'
FALKLAND_ISLANDS_MALVINAS = 'fk'
FAROE_ISLANDS = 'fo'
FEDERATED_STATES_OF_MICRONESIA = 'fm'
FIJI = 'fj'
FINLAND = 'fi'
FRANCE = 'fr'
FRENCH_GUIANA = 'gf'
FRENCH_POLYNESIA = 'pf'
FRENCH_SOUTHERN_TERRITORIES = 'tf'
GABON = 'ga'
GAMBIA = 'gm'
GEORGIA = 'ge'
GERMANY = 'de'
GHANA = 'gh'
GIBRALTAR = 'gi'
GREECE = 'gr'
GREENLAND = 'gl'
GRENADA = 'gd'
GUADELOUPE = 'gp'
GUAM = 'gu'
GUATEMALA = 'gt'
GUERNSEY = 'gg'
GUINEA = 'gn'
GUINEA_BISSAU = 'gw'
GUYANA = 'gy'
HAITI = 'ht'
HEARD_ISLAND_AND_MCDONALD_ISLANDS = 'hm'
HONDURAS = 'hn'
HONG_KONG = 'hk'
HUNGARY = 'hu'
ICELAND = 'is'
INDIA = 'in'
INDONESIA = 'id'
IRAN = 'ir'
IRAQ = 'iq'
IRELAND = 'ie'
ISLE_OF_MAN = 'im'
ISRAEL = 'il'
ITALY = 'it'
JAMAICA = 'jm'
JAPAN = 'jp'
JERSEY = 'je'
JORDAN = 'jo'
KAZAKHSTAN = 'kz'
KENYA = 'ke'
KIRIBATI = 'ki'
KUWAIT = 'kw'
KYRGYZSTAN = 'kg'
LAOS = 'la'
LATVIA = 'lv'
LEBANON = 'lb'
LESOTHO = 'ls'
LIBERIA = 'lr'
LIBYA = 'ly'
LIECHTENSTEIN = 'li'
LITHUANIA = 'lt'
LUXEMBOURG = 'lu'
MACAO = 'mo'
MACEDONIA = 'mk'
MADAGASCAR = 'mg'
MALAWI = 'mw'
MALAYSIA = 'my'
MALDIVES = 'mv'
MALI = 'ml'
MALTA = 'mt'
MARSHALL_ISLANDS = 'mh'
MARTINIQUE = 'mq'
MAURITANIA = 'mr'
MAURITIUS = 'mu'
MAYOTTE = 'yt'
MEXICO = 'mx'
MOLDOVA = 'md'
MONACO = 'mc'
MONGOLIA = 'mn'
MONTENEGRO = 'me'
MONTSERRAT = 'ms'
MOROCCO = 'ma'
MOZAMBIQUE = 'mz'
MYANMAR = 'mm'
NAMIBIA = 'na'
NAURU = 'nr'
NEPAL = 'np'
NETHERLANDS = 'nl'
NEW_CALEDONIA = 'nc'
NEW_ZEALAND = 'nz'
NICARAGUA = 'ni'
NIGER = 'ne'
NIGERIA = 'ng'
NIUE = 'nu'
NORFOLK_ISLAND = 'nf'
NORTH_KOREA = 'kp'
NORTHERN_MARIANA_ISLANDS = 'mp'
NORWAY = 'no'
OMAN = 'om'
PAKISTAN = 'pk'
PALAU = 'pw'
PALESTINE = 'ps'
PANAMA = 'pa'
PAPUA_NEW_GUINEA = 'pg'
PARAGUAY = 'py'
PERU = 'pe'
PHILIPPINES = 'ph'
PITCAIRN_ISLANDS = 'pn'
POLAND = 'pl'
PORTUGAL = 'pt'
PUERTO_RICO = 'pr'
QATAR = 'qa'
REPUBLIC_OF_CONGO = 'cg'
REUNION = 're'
ROMANIA = 'ro'
RUSSIA = 'ru'
RWANDA = 'rw'
SAINT_BARTHELEMY = 'bl'
SAINT_HELENA_ASCENSION_AND_TRISTAN_DA_CUNHA = 'sh'
SAINT_KITTS_AND_NEVIS = 'kn'
SAINT_LUCIA = 'lc'
SAINT_MARTIN = 'mf'
SAINT_PIERRE_AND_MIQUELON = 'pm'
SAINT_VINCENT_AND_THE_GRENADINES = 'vc'
SAMOA = 'ws'
SAN_MARINO = 'sm'
SAO_TOME_AND_PRINCIPE = 'st'
SAUDI_ARABIA = 'sa'
SENEGAL = 'sn'
SERBIA = 'rs'
SEYCHELLES = 'sc'
SIERRA_LEONE = 'sl'
SINGAPORE = 'sg'
SINT_MAARTEN = 'sx'
SLOVAKIA = 'sk'
SLOVENIA = 'si'
SOLOMON_ISLANDS = 'sb'
SOMALIA = 'so'
SOUTH_AFRICA = 'za'
SOUTH_GEORGIA_AND_THE_SOUTH_SANDWICH_ISLANDS = 'gs'
SOUTH_KOREA = 'kr'
SOUTH_SUDAN = 'ss'
SPAIN = 'es'
SRI_LANKA = 'lk'
SUDAN = 'sd'
SURINAME = 'sr'
SVALBARD_AND_JAN_MAYEN = 'sj'
SWAZILAND = 'sz'
SWEDEN = 'se'
SWITZERLAND = 'ch'
SYRIA = 'sy'
TAIWAN = 'tw'
TAJIKISTAN = 'tj'
TANZANIA = 'tz'
THAILAND = 'th'
TIMOR_LESTE = 'tl'
TOGO = 'tg'
TOKELAU = 'tk'
TONGA = 'to'
TRINIDAD_AND_TOBAGO = 'tt'
TUNISIA = 'tn'
TURKEY = 'tr'
TURKMENISTAN = 'tm'
TURKS_AND_CAICOS_ISLANDS = 'tc'
TUVALU = 'tv'
UGANDA = 'ug'
UKRAINE = 'ua'
UNITED_ARAB_EMIRATES = 'ae'
UNITED_KINGDOM = 'gb'
UNITED_STATES = 'us'
UNITED_STATES_MINOR_OUTLYING_ISLANDS = 'um'
URUGUAY = 'uy'
US_VIRGIN_ISLANDS = 'vi'
UZBEKISTAN = 'uz'
VANUATU = 'vu'
VATICAN = 'va'
VENEZUELA = 've'
VIETNAM = 'vn'
WALLIS_AND_FUTUNA = 'wf'
WESTERN_SAHARA = 'eh'
YEMEN = 'ye'
ZAMBIA = 'zm'
ZIMBABWE = 'zw'
FORMER_SOVIET_UNION_COUNTRIES = set([RUSSIA, UKRAINE, BELARUS, KAZAKHSTAN, AZERBAIJAN, KYRGYZSTAN, GEORGIA, UZBEKISTAN, ARMENIA, TAJIKISTAN, MOLDOVA, TURKMENISTAN, LATVIA, LITHUANIA, ESTONIA])
CJK_COUNTRIES = set([CHINA, JAPAN, SOUTH_KOREA, TAIWAN, HONG_KONG, MACAO])
all_country_iso_codes = set([c.alpha2.lower() for c in pycountry.countries])
@classmethod
def is_valid_country_code(cls, alpha2_code):
return alpha2_code and alpha2_code.lower() in cls.all_country_iso_codes
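# A small sanity-check sketch (assumes the installed pycountry exposes .alpha2 as used above):
if __name__ == '__main__':
    assert Countries.is_valid_country_code(Countries.GERMANY)
    assert Countries.is_valid_country_code('US')
    assert not Countries.is_valid_country_code('zz')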

View File

@@ -0,0 +1,187 @@
# -*- coding: utf-8 -*-
import os
import six
import sys
import pycountry
from collections import defaultdict, OrderedDict
from lxml import etree
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.i18n.languages import *
from geodata.encoding import safe_decode
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')
COUNTRY_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'countries', 'names.yaml')
IGNORE_COUNTRIES = set([six.u('ZZ')])
COUNTRY_USE_SHORT_NAME = set([six.u('HK'), six.u('MM'), six.u('MO'), six.u('PS')])
COUNTRY_USE_VARIANT_NAME = set([six.u('CD'), six.u('CG'), six.u('CI'), six.u('TL')])
LANGUAGE_COUNTRY_OVERRIDES = {
'en': {
'CD': safe_decode('Democratic Republic of the Congo'),
'CG': safe_decode('Republic of the Congo'),
},
# Countries where the local language is absent from CLDR
# Tajik / Tajikistan
'tg': {
'TJ': safe_decode('Тоҷикистон'),
},
# Maldivan / Maldives
'dv': {
'MV': safe_decode('ދިވެހިރާއްޖެ'),
}
}
class CountryNames(object):
def __init__(self, base_dir=CLDR_MAIN_PATH):
self.base_dir = base_dir
self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
self.iso_3166_names = {c.alpha2.lower(): c.name for c in pycountry.countries}
self.language_country_names = {}
self.country_language_names = defaultdict(dict)
self.country_official_names = defaultdict(OrderedDict)
self.country_local_names = defaultdict(OrderedDict)
local_languages = {}
country_local_language_names = defaultdict(dict)
for filename in os.listdir(base_dir):
lang = filename.split('.xml')[0]
if len(lang) > 3:
continue
names = self.cldr_country_names(lang)
lang = lang.lower()
self.language_country_names[lang] = names
for country, name in names.iteritems():
country = country.lower()
languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
local_languages[country] = languages
self.country_language_names[country.lower()][lang.lower()] = name
if lang in local_languages.get(country, {}):
country_local_language_names[country][lang] = name
for l, names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
if l not in self.language_country_names:
self.language_country_names[l.lower()] = names
for c, name in six.iteritems(names):
self.country_language_names[c.lower()][l.lower()] = name
if c.lower() not in country_local_language_names:
country_local_language_names[c.lower()][l.lower()] = name
for country, langs in six.iteritems(local_languages):
names = country_local_language_names[country]
num_defaults = sum((1 for lang in names.keys() if langs.get(lang)))
for i, (lang, default) in enumerate(langs.iteritems()):
name = names.get(lang)
if not name:
continue
if default or num_defaults == 0:
self.country_official_names[country][lang] = name
if num_defaults == 0:
break
self.country_local_names[country][lang] = name
def cldr_country_names(self, language):
'''
Country names are tricky as there can be several versions
and levels of verbosity e.g. United States of America
vs. the more commonly used United States. Most countries
have a similarly verbose form.
The CLDR repo (http://cldr.unicode.org/) has the most
comprehensive localized database of country names
(among other things), organized by language. This function
parses CLDR XML for a given language and returns a dictionary
of {country_code: name} for that language.
'''
filename = os.path.join(self.base_dir, '{}.xml'.format(language))
xml = etree.parse(open(filename))
country_names = defaultdict(dict)
for territory in xml.xpath('*//territories/*'):
country_code = territory.attrib['type']
if country_code in IGNORE_COUNTRIES or country_code.isdigit():
continue
country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)
display_names = {}
for country_code, names in country_names.iteritems():
if country_code in LANGUAGE_COUNTRY_OVERRIDES.get(language, {}):
display_names[country_code] = safe_decode(LANGUAGE_COUNTRY_OVERRIDES[language][country_code])
continue
default_name = names.get(None)
if country_code in COUNTRY_USE_SHORT_NAME:
display_names[country_code] = names.get('short', default_name)
elif country_code in COUNTRY_USE_VARIANT_NAME:
display_names[country_code] = names.get('variant', default_name)
elif default_name is not None:
display_names[country_code] = default_name
return display_names
def localized_name(self, country_code, language=None):
'''
Get the display name for a country code in the local language
e.g. Россия for Russia, España for Spain, etc.
For most countries there is a single official name. For countries
with more than one official language, this will return a concatenated
version separated by a slash e.g. Maroc / المغرب for Morocco.
Note that all of the exceptions in road_sign_languages.tsv are also
taken into account here so India for example uses the English name
rather than concatenating all 27 toponyms.
This method should be roughly consistent with OSM's display names.
Usage:
>>> country_names.localized_name('jp') # returns '日本'
>>> country_names.localized_name('be') # returns 'België / Belgique / Belgien'
'''
country_code = country_code.lower()
if language is None:
return six.u(' / ').join(OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
for n in self.country_official_names[country_code].values()).keys())
else:
return self.country_language_names.get(country_code, {}).get(language)
def alpha3_code(self, alpha2_code):
alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
return alpha3.upper() if alpha3 else None
def iso_3166_name(self, alpha2_code):
return self.iso_3166_names.get(alpha2_code.lower())
country_names = CountryNames()

View File

@@ -0,0 +1,16 @@
import csv
import re
from geodata.encoding import safe_encode, safe_decode
newline_regex = re.compile('\r\n|\r|\n')
csv.register_dialect('tsv_no_quote', delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')
def tsv_string(s):
return safe_encode(newline_regex.sub(u', ', safe_decode(s).strip()).replace(u'\t', u' '))
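# Example: tsv_string('foo\tbar\r\nbaz') -> 'foo bar, baz'
# (tabs become spaces, newlines become ', ', and the result is UTF-8 encoded)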
def unicode_csv_reader(filename, **kw):
for line in csv.reader(filename, **kw):
yield [unicode(c, 'utf-8') for c in line]

View File

View File

@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
import math
EARTH_RADIUS_KM = 6373
def haversine_distance(lat1, lon1, lat2, lon2, radius=EARTH_RADIUS_KM):
"""Calculate the Haversine distance between two lat/lon pairs, given by:
a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
c = 2 ⋅ atan2( √a, √(1 − a) )
d = R ⋅ c
where R is the radius of the Earth (in kilometers). By default we use 6373 km,
a radius optimized for calculating distances at approximately 39 degrees from
the equator i.e. Washington, DC
:param lat1: first latitude
:param lon1: first longitude (use negative range for longitudes West of the Prime Meridian)
:param lat2: second latitude
:param lon2: second longitude (use negative range for longitudes West of the Prime Meridian)
:param radius: radius of the Earth in (miles|kilometers) depending on the desired units
"""
lat1 = math.radians(lat1)
lat2 = math.radians(lat2)
lon1 = math.radians(lon1)
lon2 = math.radians(lon2)
dlon = lon2 - lon1
dlat = lat2 - lat1
a = (math.sin(dlat / 2.0)) ** 2 + math.cos(lat1) * math.cos(lat2) * (math.sin(dlon/2.0)) ** 2
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
d = radius * c
return d
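# A quick usage sketch: approximate New York City to Washington, DC distance.
# Coordinates are approximate; with the default radius this comes out to roughly 328 km.
if __name__ == '__main__':
    print(haversine_distance(40.7128, -74.0060, 38.9072, -77.0369))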

View File

@@ -0,0 +1,34 @@
import six
text_type = six.text_type
string_types = six.string_types
binary_type = six.binary_type
def safe_decode(value, encoding='utf-8', errors='strict'):
if isinstance(value, text_type):
return value
if isinstance(value, (string_types, binary_type)):
return value.decode(encoding, errors)
else:
return binary_type(value).decode(encoding, errors)
def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
if not isinstance(value, (string_types, binary_type)):
return binary_type(value)
if isinstance(value, text_type):
return value.encode(encoding, errors)
else:
if hasattr(incoming, 'lower'):
incoming = incoming.lower()
if hasattr(encoding, 'lower'):
encoding = encoding.lower()
if value and encoding != incoming:
value = safe_decode(value, encoding, errors)
return value.encode(encoding, errors)
else:
return value
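# Round-trip examples (Python 2 semantics, where str is the binary type):
#   safe_decode('caf\xc3\xa9') -> u'café'
#   safe_encode(u'café')       -> 'caf\xc3\xa9'
#   safe_decode(123)           -> u'123'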

scripts/geodata/enum.py
View File

@@ -0,0 +1,62 @@
class EnumValue(object):
def __init__(self, value, name=None):
self.value = value
self.name = name
def __hash__(self):
return self.value
def __cmp__(self, other):
if isinstance(other, EnumValue):
return self.value.__cmp__(other.value)
else:
return self.value.__cmp__(other)
def __unicode__(self):
return self.name
def __str__(self):
return self.name
def __repr__(self):
return self.name
class EnumMeta(type):
def __init__(self, name, bases, dict_):
self.registry = self.registry.copy()
self.name_registry = self.name_registry.copy()
for k, v in dict_.iteritems():
if isinstance(v, EnumValue) and v not in self.registry:
if v.name is None:
v.name = k
self.registry[v.value] = v
self.name_registry[v.name] = v
return super(EnumMeta, self).__init__(name, bases, dict_)
def __iter__(self):
return self.registry.itervalues()
def __getitem__(self, key):
return self.registry[key]
class Enum(object):
__metaclass__ = EnumMeta
registry = {}
name_registry = {}
@classmethod
def from_id(cls, value):
try:
return cls.registry[value]
except KeyError:
raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value))
@classmethod
def from_string(cls, name):
try:
return cls.name_registry[name]
except KeyError:
raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name))
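# A minimal usage sketch of the enum machinery above (hypothetical enum, Python 2):
if __name__ == '__main__':
    class Color(Enum):
        RED = EnumValue(0)
        BLUE = EnumValue(1, name='blue')

    assert Color.from_id(0) is Color.RED
    assert Color.from_string('blue') is Color.BLUE
    assert sorted(v.value for v in Color) == [0, 1]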

View File

@@ -0,0 +1,38 @@
import os
import subprocess
import six
def download_file(url, dest, retries=3, retry_delay=5):
ensure_dir(os.path.dirname(dest))
return subprocess.check_output(['curl', url, '-L', '-w', '%{http_code}',
'--retry', six.text_type(retries),
'--retry-delay', six.text_type(retry_delay),
'-o', dest, '--silent']) == '200'
def unzip_file(filename, dest):
ensure_dir(dest)
return subprocess.check_call(['unzip', '-o', filename, '-d', dest]) == 0
def remove_file(filename):
os.unlink(filename)
def ensure_dir(d):
if not os.path.exists(d):
os.makedirs(d)
class cd:
"""Context manager for changing the current working directory"""
def __init__(self, path):
self.path = path
def __enter__(self):
self.saved_path = os.getcwd()
os.chdir(self.path)
def __exit__(self, etype, value, traceback):
os.chdir(self.saved_path)
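# Example use of the cd context manager (assumes /tmp exists):
if __name__ == '__main__':
    with cd('/tmp'):
        print(os.getcwd())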

View File

View File

@@ -0,0 +1,688 @@
'''
create_geonames_tsv.py
----------------------
This script formats the open GeoNames database (as well as
its accompanying postal codes data set) into a schema'd
tab-separated value file.
It generates a C header which uses an enum for the field names.
This way if new fields are added or there's a typo, etc. the
error will show up at compile-time.
The relevant C modules which operate on this data are:
geodb_builder.c
geonames.c
As well as the generated headers:
geonames_fields.h
postal_fields.h
'''
import argparse
import csv
import logging
import operator
import os
import re
import sqlite3
import subprocess
import sys
import pycountry
import unicodedata
import urllib
import urlparse
from collections import defaultdict, OrderedDict
from lxml import etree
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.csv_utils import *
from geodata.file_utils import *
from geodata.countries.country_names import *
from geodata.encoding import safe_encode, safe_decode
from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
from geodata.i18n.languages import *
from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.log import log_to_file
multispace_regex = re.compile('[\s]+')
def encode_field(value):
return multispace_regex.sub(' ', safe_encode((value if value is not None else '')))
log_to_file(sys.stderr)
DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
os.path.pardir, 'data', 'geonames')
COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
CONTINENT_FEATURE_CODES = ('CONT',)
ADMIN_1_FEATURE_CODES = ('ADM1',)
ADMIN_2_FEATURE_CODES = ('ADM2',)
ADMIN_3_FEATURE_CODES = ('ADM3',)
ADMIN_4_FEATURE_CODES = ('ADM4',)
OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
ADMIN_OTHER_FEATURE_CODES = ('ADMD', )
POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
'PPLR', 'PPLS', 'STLMT')
NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
class boundary_types:
COUNTRY = 0
ADMIN1 = 1
ADMIN2 = 2
ADMIN3 = 3
ADMIN4 = 4
ADMIN_OTHER = 5
LOCALITY = 6
NEIGHBORHOOD = 7
geonames_admin_dictionaries = OrderedDict([
(boundary_types.COUNTRY, COUNTRY_FEATURE_CODES),
(boundary_types.ADMIN1, ADMIN_1_FEATURE_CODES),
(boundary_types.ADMIN2, ADMIN_2_FEATURE_CODES),
(boundary_types.ADMIN3, ADMIN_3_FEATURE_CODES),
(boundary_types.ADMIN4, ADMIN_4_FEATURE_CODES),
(boundary_types.ADMIN_OTHER, ADMIN_OTHER_FEATURE_CODES),
(boundary_types.LOCALITY, POPULATED_PLACE_FEATURE_CODES),
(boundary_types.NEIGHBORHOOD, NEIGHBORHOOD_FEATURE_CODES),
])
# Inserted post-query
DUMMY_BOUNDARY_TYPE = '-1 as type'
DUMMY_HAS_WIKIPEDIA_ENTRY = '0 as has_wikipedia_entry'
DUMMY_LANGUAGE_PRIORITY = '0 as language_priority'
class GeonamesField(object):
def __init__(self, name, c_constant, default=None, is_dummy=False):
self.name = name
self.c_constant = c_constant
self.default = default
self.is_dummy = is_dummy
geonames_fields = [
# Field if alternate_names present, default field name if not, C header constant
GeonamesField('alternate_name', 'GEONAMES_NAME', default='gn.name'),
GeonamesField('gn.geonames_id as geonames_id', 'GEONAMES_ID'),
GeonamesField('gn.name as canonical', 'GEONAMES_CANONICAL'),
GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True),
GeonamesField(DUMMY_HAS_WIKIPEDIA_ENTRY, 'GEONAMES_HAS_WIKIPEDIA_ENTRY', is_dummy=True),
GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"),
GeonamesField(DUMMY_LANGUAGE_PRIORITY, 'GEONAMES_LANGUAGE_PRIORITY', is_dummy=True),
GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'),
GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'),
GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'),
GeonamesField('is_historic', 'GEONAMES_IS_HISTORICAL', default='0'),
GeonamesField('gn.population', 'GEONAMES_POPULATION'),
GeonamesField('gn.latitude', 'GEONAMES_LATITUDE'),
GeonamesField('gn.longitude', 'GEONAMES_LONGITUDE'),
GeonamesField('gn.feature_code', 'GEONAMES_FEATURE_CODE'),
GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'),
GeonamesField('c.geonames_id as country_gn_id', 'GEONAMES_COUNTRY_ID'),
GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'),
GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'),
GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'),
GeonamesField('a2.geonames_id as a2_gn_id', 'GEONAMES_ADMIN2_ID'),
GeonamesField('gn.admin3_code as admin3_code', 'GEONAMES_ADMIN3_CODE'),
GeonamesField('a3.geonames_id as a3_gn_id', 'GEONAMES_ADMIN3_ID'),
GeonamesField('gn.admin4_code as admin4_code', 'GEONAMES_ADMIN4_CODE'),
GeonamesField('a4.geonames_id as a4_gn_id', 'GEONAMES_ADMIN4_ID'),
]
def geonames_field_index(s):
for i, f in enumerate(geonames_fields):
if f.c_constant == s:
return i
return None
DUMMY_BOUNDARY_TYPE_INDEX = geonames_field_index('GEONAMES_BOUNDARY_TYPE')
DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX = geonames_field_index('GEONAMES_HAS_WIKIPEDIA_ENTRY')
GEONAMES_ID_INDEX = geonames_field_index('GEONAMES_ID')
LANGUAGE_INDEX = geonames_field_index('GEONAMES_ISO_LANGUAGE')
DUMMY_LANGUAGE_PRIORITY_INDEX = geonames_field_index('GEONAMES_LANGUAGE_PRIORITY')
CANONICAL_NAME_INDEX = geonames_field_index('GEONAMES_CANONICAL')
NAME_INDEX = geonames_field_index('GEONAMES_NAME')
COUNTRY_CODE_INDEX = geonames_field_index('GEONAMES_COUNTRY_CODE')
POPULATION_INDEX = geonames_field_index('GEONAMES_POPULATION')
PREFERRED_INDEX = geonames_field_index('GEONAMES_IS_PREFERRED_NAME')
HISTORICAL_INDEX = geonames_field_index('GEONAMES_IS_HISTORICAL')
geonames_admin_joins = '''
left join admin1_codes a1
on a1.code = gn.admin1_code
and a1.country_code = gn.country_code
left join admin2_codes a2
on a2.code = gn.admin2_code
and a2.admin1_code = gn.admin1_code
and a2.country_code = gn.country_code
left join admin3_codes a3
on a3.code = gn.admin3_code
and a3.admin1_code = gn.admin1_code
and a3.admin2_code = gn.admin2_code
and a3.country_code = gn.country_code
left join admin4_codes a4
on a4.code = gn.admin4_code
and a4.admin1_code = gn.admin1_code
and a4.admin2_code = gn.admin2_code
and a4.admin3_code = gn.admin3_code
and a4.country_code = gn.country_code
'''
# Canonical names are stored in the geonames table with alternates
# stored in a separate table. UNION ALL query will capture them all.
base_geonames_query = '''
select {geonames_fields}
from geonames gn
join countries c
on gn.country_code = c.country_code
{admin_joins}
{{predicate}}
union all
select {alt_name_fields}
from geonames gn
join countries c
on gn.country_code = c.country_code
join alternate_names an
on an.geonames_id = gn.geonames_id
and iso_language not in ('doi','faac','iata',
'icao','link','post','tcid')
and an.alternate_name != gn.name
{admin_joins}
{{predicate}}
'''.format(
geonames_fields=', '.join((f.name if f.default is None else
'{} as {}'.format(f.default, f.name)
for f in geonames_fields)),
alt_name_fields=', '.join((f.name for f in geonames_fields)),
admin_joins=geonames_admin_joins
)
IGNORE_COUNTRY_POSTAL_CODES = set([
'AR', # GeoNames has pre-1999 postal codes
])
postal_code_fields = [
GeonamesField('postal_code', 'GN_POSTAL_CODE'),
GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'),
GeonamesField('c.geonames_id as country_geonames_id', 'GN_POSTAL_COUNTRY_GEONAMES_ID'),
GeonamesField('c.population as country_population', 'GN_POSTAL_COUNTRY_POPULATION'),
GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'),
GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'),
GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'),
GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'),
]
def postal_code_field_index(s):
for i, f in enumerate(postal_code_fields):
if f.c_constant == s:
return i
return None
POSTAL_CODE_INDEX = postal_code_field_index('GN_POSTAL_CODE')
POSTAL_CODE_POP_INDEX = postal_code_field_index('GN_POSTAL_COUNTRY_POPULATION')
postal_codes_query = '''
select
{fields}
from postal_codes p
join countries c
on p.country_code = c.country_code
left join (
select
gn.geonames_id,
alternate_name,
country_code,
gn.name
from alternate_names an
join geonames gn
on an.geonames_id = gn.geonames_id
where iso_language = 'post'
) as n
on p.postal_code = n.alternate_name
and p.country_code = n.country_code
left join admin1_codes a1
on a1.code = p.admin1_code
and p.country_code = a1.country_code
left join admin2_codes a2
on a2.code = p.admin2_code
and a2.admin1_code = p.admin1_code
and a2.country_code = p.country_code
left join admin3_codes a3
on a3.code = p.admin3_code
and a3.admin1_code = p.admin1_code
and a3.admin2_code = p.admin2_code
and a3.country_code = p.country_code
where p.country_code not in ({exclude_country_codes})
group by postal_code, p.country_code
'''.format(
fields=','.join([f.name for f in postal_code_fields]),
exclude_country_codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
wikipedia_query = '''
select alternate_name, geonames_id, is_preferred_name
from alternate_names
where iso_language = 'link'
and alternate_name like '%%en.wikipedia%%'
order by alternate_name, is_preferred_name
'''
BATCH_SIZE = 2000
wiki_paren_regex = re.compile('(.*)[\s]*\(.*?\)[\s]*')
def normalize_wikipedia_title(title):
return safe_decode(title).replace(u'_', u' ')
def normalize_wikipedia_url(url):
url = urllib.unquote_plus(url)
parsed = urlparse.urlsplit(url)
if parsed.query:
params = urlparse.parse_qs(parsed.query)
if 'title' in params:
return normalize_wikipedia_title(params['title'][0])
title = parsed.path.rsplit('/', 1)[-1]
if title not in ('index.php', 'index.html'):
return normalize_wikipedia_title(title)
return None
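# Illustrative inputs/outputs for the normalization above (made-up URLs):
#   'https://en.wikipedia.org/wiki/New_York_City'          -> u'New York City'
#   'https://en.wikipedia.org/w/index.php?title=New_York'  -> u'New York'
#   'https://en.wikipedia.org/w/index.php'                 -> None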
def normalize_name(name):
name = name.replace('&', 'and')
name = name.replace('-', ' ')
name = name.replace(', ', ' ')
name = name.replace(',', ' ')
return name
saint_replacements = [
('st.', 'saint'),
('st.', 'st'),
('st', 'saint')
]
abbreviated_saint_regex = re.compile(r'\bSt(\.|\b)')
def normalize_display_name(name):
return abbreviated_saint_regex.sub('Saint', name).replace('&', 'and')
def utf8_normalize(s, form='NFD'):
return unicodedata.normalize(form, s)
def get_wikipedia_titles(db):
d = defaultdict(dict)
cursor = db.execute(wikipedia_query)
while True:
batch = cursor.fetchmany(BATCH_SIZE)
if not batch:
break
for (url, geonames_id, is_preferred) in batch:
title = normalize_wikipedia_url(safe_encode(url))
if title is not None and title.strip():
title = utf8_normalize(normalize_name(title))
d[title.lower()][geonames_id] = int(is_preferred or 0)
return d
def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
'''
Writes geonames.tsv using the specified db to the specified data directory
'''
filename = os.path.join(out_dir, 'geonames.tsv')
temp_filename = filename + '.tmp'
f = open(temp_filename, 'w')
writer = csv.writer(f, 'tsv_no_quote')
init_languages()
init_country_names()
wiki_titles = get_wikipedia_titles(db)
logging.info('Fetched Wikipedia titles')
# Iterate over GeoNames boundary types from largest (country) to smallest (neighborhood)
for boundary_type, codes in geonames_admin_dictionaries.iteritems():
if boundary_type != boundary_types.COUNTRY:
predicate = 'where gn.feature_code in ({codes})'.format(
codes=','.join(['"{}"'.format(c) for c in codes])
)
else:
# The query for countries in GeoNames is somewhat non-trivial
predicate = 'where gn.geonames_id in (select geonames_id from countries)'
query = base_geonames_query.format(
predicate=predicate
)
cursor = db.execute(query)
i = 1
while True:
# Fetch rows in batches to save memory
batch = cursor.fetchmany(BATCH_SIZE)
if not batch:
break
rows = []
for row in batch:
row = list(row)
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
language = row[LANGUAGE_INDEX]
country_code = row[COUNTRY_CODE_INDEX]
is_preferred = int(row[PREFERRED_INDEX] or 0)
is_historical = int(row[HISTORICAL_INDEX] or 0)
lang_spoken = get_country_languages(country_code.lower(), official=False).get(language, None)
lang_official = get_country_languages(country_code.lower()).get(language, None) == 1
null_language = not language.strip()
is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
alpha2_code = None
is_orig_name = False
if boundary_type == boundary_types.COUNTRY:
alpha2_code = row[COUNTRY_CODE_INDEX]
is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
# Set the canonical for countries to the local name, see country_official_name in country_names.py
country_canonical = country_localized_display_name(alpha2_code.lower())
if not country_canonical or not country_canonical.strip():
raise ValueError('Could not get local canonical name for country code={}'.format(alpha2_code))
row[CANONICAL_NAME_INDEX] = country_canonical
geonames_id = row[GEONAMES_ID_INDEX]
name = utf8_normalize(safe_decode(row[NAME_INDEX]))
# Skip purely numeric names (postal codes are handled separately)
if name.isdigit():
continue
wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), {}))
row[NAME_INDEX] = name
if boundary_type == boundary_types.COUNTRY:
norm_name = normalize_name(name.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
wiki_row = []
have_wikipedia = geonames_id in wikipedia_entries
wiki_preferred = wikipedia_entries.get(geonames_id, 0)
'''
The following heuristics assign a numerical priority to each name
alternative so that, for ambiguous names, the value can be used as part
of the ranking function during the sort step below. The higher the value,
the more likely the name is the best resolution for the given entity.
'''
if is_historical:
# Historical names, unlikely to be used
language_priority = 0
elif not null_language and language != 'abbr' and lang_spoken is None:
# Name of a place in language not widely spoken e.g. Japanese name for a US toponym
language_priority = 1
elif null_language and not is_preferred and not is_canonical:
# Null-language alternate names not marked as preferred, dubious
language_priority = 2
elif language == 'abbr' and not is_preferred:
# Abbreviation, not preferred
language_priority = 3
elif language == 'abbr' and is_preferred:
# Abbreviation, preferred e.g. NYC, UAE
language_priority = 4
elif lang_spoken and not lang_official and not is_preferred:
# Non-preferred name but in a spoken (non-official) language
language_priority = 5
elif lang_official == 1 and not is_preferred:
# Name in an official language, not preferred
language_priority = 6
elif null_language and not is_preferred and is_canonical:
# Canonical name, may be overly official e.g. Islamic Republic of Pakistan
language_priority = 7
elif is_preferred and not lang_official:
# Preferred names, not an official language
language_priority = 8
elif is_preferred and lang_official:
# Official language preferred
language_priority = 9
row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
if have_wikipedia:
wiki_row = row[:]
wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
rows.append(map(encode_field, wiki_row))
canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
have_normalized = False
if is_orig_name:
canonical_row = wiki_row[:] if have_wikipedia else row[:]
canonical_row_name = normalize_display_name(name)
if canonical_row_name != name:
canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
have_normalized = True
rows.append(map(encode_field, canonical_row))
if not have_wikipedia:
rows.append(map(encode_field, row))
# Country names have more specialized logic
if boundary_type == boundary_types.COUNTRY:
wikipedia_entries = wiki_titles.get(canonical.lower(), {})
canonical_row_name = normalize_display_name(canonical)
canonical_row = row[:]
if is_orig_name:
canonical = safe_decode(canonical)
canonical_row[NAME_INDEX] = safe_encode(canonical)
norm_name = normalize_name(canonical.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
if not wikipedia_entries:
norm_name = normalize_name(canonical_row_name.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
have_wikipedia = geonames_id in wikipedia_entries
wiki_preferred = wikipedia_entries.get(geonames_id, 0)
if have_wikipedia:
canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
if (name != canonical):
rows.append(map(encode_field, canonical_row))
if canonical_row_name != canonical and canonical_row_name != name:
canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
rows.append(map(encode_field, canonical_row))
if alpha2_code and is_orig_name:
alpha2_row = row[:]
alpha2_row[NAME_INDEX] = alpha2_code
alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
rows.append(map(encode_field, alpha2_row))
if alpha2_code.lower() in country_alpha3_map and is_orig_name:
alpha3_row = row[:]
alpha3_row[NAME_INDEX] = country_alpha3_map[alpha2_code.lower()]
alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
rows.append(map(encode_field, alpha3_row))
writer.writerows(rows)
logging.info('Did {} batches'.format(i))
i += 1
cursor.close()
f.flush()
f.close()
logging.info('Sorting...')
env = os.environ.copy()
env['LC_ALL'] = 'C'
command = ['sort', '-t\t', '-u', '--ignore-case',
'-k{0},{0}'.format(NAME_INDEX + 1),
# If there's a Wikipedia link to this name for the given id, sort first
'-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
# Language priority rules as above
'-k{0},{0}nr'.format(DUMMY_LANGUAGE_PRIORITY_INDEX + 1),
# Sort descending by population (basic proxy for relevance)
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
# group rows for the same geonames ID together
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
# preferred names come first within that grouping
'-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
# since uniquing is done on the sort key, add language
'-k{0},{0}'.format(LANGUAGE_INDEX + 1),
'-o', filename, temp_filename]
p = subprocess.Popen(command, env=env)
return_code = p.wait()
if return_code != 0:
raise subprocess.CalledProcessError(return_code, command)
os.unlink(temp_filename)
def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR):
filename = os.path.join(out_dir, 'postal_codes.tsv')
temp_filename = filename + '.tmp'
f = open(temp_filename, 'w')
writer = csv.writer(f, 'tsv_no_quote')
cursor = db.execute(postal_codes_query)
i = 1
while True:
batch = cursor.fetchmany(BATCH_SIZE)
if not batch:
break
rows = [
map(encode_field, row)
for row in batch
]
writer.writerows(rows)
logging.info('Did {} batches'.format(i))
i += 1
cursor.close()
f.close()
logging.info('Sorting...')
subprocess.check_call([
'sort', '-t\t', '--ignore-case',
'-k{0},{0}'.format(POSTAL_CODE_INDEX + 1),
'-k{0},{0}nr'.format(POSTAL_CODE_POP_INDEX + 1),
'-o', filename,
temp_filename
])
os.unlink(temp_filename)
# Generates a C header telling us the order of the fields as written
GEONAMES_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'src', 'geonames_fields.h')
GEONAMES_FIELDS_HEADER_FILE = '''enum geonames_fields {{
{fields},
NUM_GEONAMES_FIELDS
}};
'''.format(fields=''',
'''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(geonames_fields)]))
def write_geonames_fields_header(filename=GEONAMES_FIELDS_HEADER):
with open(filename, 'w') as f:
f.write(GEONAMES_FIELDS_HEADER_FILE)
POSTAL_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'src', 'postal_fields.h')
POSTAL_FIELDS_HEADER_FILE = '''enum gn_postal_fields {{
{fields},
NUM_POSTAL_FIELDS
}};
'''.format(fields=''',
'''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(postal_code_fields)]))
def write_postal_fields_header(filename=POSTAL_FIELDS_HEADER):
with open(filename, 'w') as f:
f.write(POSTAL_FIELDS_HEADER_FILE)
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--db',
default=DEFAULT_GEONAMES_DB_PATH,
help='SQLite db file')
parser.add_argument('-o', '--out',
default=DEFAULT_DATA_DIR, help='output directory')
args = parser.parse_args()
db = sqlite3.connect(args.db)
create_geonames_tsv(db, args.out)
create_postal_codes_tsv(db, args.out)
write_geonames_fields_header()
write_postal_fields_header()
db.close()

View File

@@ -0,0 +1,30 @@
import sqlite3
from collections import defaultdict
class GeoNamesDB(object):
names_query = '''
select iso_language, alternate_name,
is_preferred_name, is_short_name
from alternate_names
where geonames_id = ?
and is_historic != '1'
and is_colloquial != '1'
and iso_language != 'post'
order by iso_language, cast(is_preferred_name as integer) desc, cast(is_short_name as integer)
'''
def __init__(self, filename):
self.db = sqlite3.connect(filename)
def query(self, query, *params):
return self.db.execute(query, params)
def get_alternate_names(self, geonames_id):
cursor = self.query(self.names_query, geonames_id)
language_names = defaultdict(list)
for language, name, is_preferred, is_short in cursor:
language_names[language].append((name,
int(is_preferred or 0),
int(is_short or 0)))
return dict(language_names)
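# A brief usage sketch (assumes a GeoNames SQLite db built by create_geonames_sqlite_db.py;
# the geonames_id below is illustrative):
if __name__ == '__main__':
    import sys
    db = GeoNamesDB(sys.argv[1])
    names = db.get_alternate_names(5128581)
    for language, alternates in names.iteritems():
        print('{} {}'.format(language, alternates[:3]))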

View File

@@ -0,0 +1,333 @@
import os
import shutil
import sqlite3
import tempfile
import urlparse
import urllib2
import subprocess
import logging
import argparse
import csv
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_decode
from geodata.geonames.paths import *
from geodata.file_utils import *
from geodata.log import *
from itertools import islice, chain
log_to_file(sys.stderr)
logger = logging.getLogger('geonames.sqlite')
BASE_URL = 'http://download.geonames.org/export/'
DUMP_URL = urlparse.urljoin(BASE_URL, 'dump/')
ALL_COUNTRIES_ZIP_FILE = 'allCountries.zip'
HIERARCHY_ZIP_FILE = 'hierarchy.zip'
ALTERNATE_NAMES_ZIP_FILE = 'alternateNames.zip'
ZIP_URL = urlparse.urljoin(BASE_URL, 'zip/')
GEONAMES_DUMP_FILES = (ALL_COUNTRIES_ZIP_FILE,
HIERARCHY_ZIP_FILE,
ALTERNATE_NAMES_ZIP_FILE)
# base_url, local_dir, is_gzipped, local_filename
GEONAMES_FILES = [(DUMP_URL, '', True, ALL_COUNTRIES_ZIP_FILE),
(DUMP_URL, '', True, HIERARCHY_ZIP_FILE),
(DUMP_URL, '', True, ALTERNATE_NAMES_ZIP_FILE),
(ZIP_URL, 'zip', True, ALL_COUNTRIES_ZIP_FILE),
]
def download_file(url, dest):
logger.info('Downloading file from {}'.format(url))
subprocess.check_call(['wget', url, '-O', dest])
def admin_ddl(admin_level):
columns = ['country_code TEXT'] + \
['admin{}_code TEXT'.format(i)
for i in xrange(1, admin_level)]
create = '''
CREATE TABLE admin{level}_codes (
geonames_id INT,
code TEXT,
name TEXT,
{fields}
)'''.format(level=admin_level,
fields=''',
'''.join(columns))
indices = (
'''CREATE INDEX admin{}_code_index ON
admin{}_codes (code)'''.format(admin_level, admin_level),
'''CREATE INDEX admin{}_gn_id_index ON
admin{}_codes (geonames_id)'''.format(admin_level, admin_level),
)
return (create, ) + indices
geonames_ddl = {
'geonames': (
'''CREATE TABLE geonames (
geonames_id INT PRIMARY KEY,
name TEXT,
ascii_name TEXT,
alternate_names TEXT,
latitude DOUBLE,
longitude DOUBLE,
feature_class TEXT,
feature_code TEXT,
country_code TEXT,
cc2 TEXT,
admin1_code TEXT,
admin2_code TEXT,
admin3_code TEXT,
admin4_code TEXT,
population LONG DEFAULT 0,
elevation INT,
dem INT,
timezone TEXT,
modification_date TEXT)''',
'''CREATE INDEX feature_code ON
geonames (feature_code)''',
'''CREATE INDEX country_code ON
geonames (country_code)''',
'''CREATE INDEX admin_codes ON
geonames (country_code, admin1_code, admin2_code, admin3_code, admin4_code)''',
),
'alternate_names': (
'''CREATE TABLE alternate_names (
alternate_name_id INT PRIMARY KEY,
geonames_id INT,
iso_language TEXT,
alternate_name TEXT,
is_preferred_name BOOLEAN DEFAULT 0,
is_short_name BOOLEAN DEFAULT 0,
is_colloquial BOOLEAN DEFAULT 0,
is_historic BOOLEAN DEFAULT 0)''',
'''CREATE INDEX geonames_id_index ON
alternate_names (geonames_id)''',
'''CREATE INDEX geonames_id_alt_name_index ON
alternate_names(geonames_id, alternate_name)''',
),
'hierarchy': (
'''CREATE TABLE hierarchy (
parent_id INT,
child_id INT,
type TEXT
);''',
'''CREATE INDEX parent_child_index ON
hierarchy (parent_id, child_id)''',
'''CREATE INDEX child_parent_index ON
hierarchy (child_id, parent_id)''',
),
'postal_codes': (
'''CREATE TABLE postal_codes (
country_code TEXT,
postal_code TEXT,
place_name TEXT,
admin1 TEXT,
admin1_code TEXT,
admin2 TEXT,
admin2_code TEXT,
admin3 TEXT,
admin3_code TEXT,
latitude DOUBLE,
longitude DOUBLE,
accuracy INT
)''',
'''CREATE INDEX post_code_index ON
postal_codes (country_code, postal_code)''',
'''CREATE INDEX postal_code_admins ON
postal_codes (country_code, admin1_code, admin2_code, admin3_code)''',
),
'admin1_codes': admin_ddl(1),
'admin2_codes': admin_ddl(2),
'admin3_codes': admin_ddl(3),
'admin4_codes': admin_ddl(4),
}
geonames_file_table_map = {
('', ALL_COUNTRIES_ZIP_FILE): 'geonames',
('', ALTERNATE_NAMES_ZIP_FILE): 'alternate_names',
('', HIERARCHY_ZIP_FILE): 'hierarchy',
('zip', ALL_COUNTRIES_ZIP_FILE): 'postal_codes',
}
country_codes_create_table = (
'drop table if exists country_codes',
'''
create table country_codes as
select distinct country_code from geonames
where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS', 'TERR')
''',
)
proper_countries_create_table = (
'drop table if exists proper_countries',
'''
create table proper_countries as
select * from geonames
where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
and country_code in (select country_code from country_codes)
''',
)
territories_create_table = (
'drop table if exists territories',
'''
create table territories as
select * from geonames where feature_code = 'TERR'
and country_code not in (select country_code from proper_countries);
''',
)
countries_create_table = (
'drop table if exists countries',
'''
create table countries as
select * from proper_countries
union
select * from territories;
''',
'create index country_geonames_id on countries (geonames_id)',
'create index country_country_code on countries (country_code)',
)
country_alises_create_table = (
'drop table if exists country_aliases',
'''
create table country_aliases as
select name, country_code
from countries
union
select alternate_name, country_code
from alternate_names an
join countries c
on c.geonames_id = an.geonames_id
where alternate_name != ''
and iso_language not in ('doi','faac','iata',
'icao','link','post','tcid')
'''
)
country_table_create_statements = list(chain(country_codes_create_table,
proper_countries_create_table,
territories_create_table,
countries_create_table,
country_alises_create_table))
def create_table(conn, table):
cursor = conn.cursor()
create_statements = geonames_ddl[table]
cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
for statement in create_statements:
cursor.execute(statement)
conn.commit()
def batch_iter(iterable, batch_size):
source_iter = iter(iterable)
while True:
batch = list(islice(source_iter, batch_size))
if len(batch) > 0:
yield batch
else:
return
def populate_admin_table(conn, admin_level):
logging.info('Doing admin level {}'.format(admin_level))
columns = ['geonames_id',
'admin{}_code'.format(admin_level),
'name',
'country_code']
columns.extend(['admin{}_code'.format(i)
for i in xrange(1, admin_level)])
admin_insert_statement = '''
insert into "admin{}_codes"
select {}
from geonames
where feature_code = "ADM{}"
'''.format(admin_level, ','.join(columns), admin_level)
conn.execute(admin_insert_statement)
conn.commit()
logging.info('Done with admin level {}'.format(admin_level))
def import_geonames_table(conn, table, f, batch_size=2000):
# escape the brackets around the values format string so we can use later
statement = 'INSERT INTO "{}" VALUES ({{}})'.format(table)
cursor = conn.cursor()
for i, batch in enumerate(batch_iter(f, batch_size)):
num_cols = len(batch[0])
cursor.executemany(statement.format(','.join(['?'] * num_cols)), batch)
conn.commit()
cursor = conn.cursor()
logging.info('imported {} batches ({} records)'.format(i + 1, (i + 1) * batch_size))
cursor.close()
def create_geonames_sqlite_db(temp_dir, db_file=DEFAULT_GEONAMES_DB_PATH):
conn = sqlite3.connect(db_file)
logging.info('Created database at {}'.format(db_file))
for url, directory, is_gzipped, filename in GEONAMES_FILES:
table = geonames_file_table_map[(directory, filename)]
create_table(conn, table)
full_url = urlparse.urljoin(url, filename)
dest_dir = os.path.join(temp_dir, directory)
ensure_dir(dest_dir)
dest_file = os.path.join(dest_dir, filename)
download_file(full_url, dest_file)
if is_gzipped:
unzip_file(dest_file, dest_dir)
filename = dest_file.replace('.zip', '.txt')
reader = csv.reader(open(filename), delimiter='\t', quotechar=None)
lines = (map(safe_decode, line) for line in reader)
import_geonames_table(conn, table, lines)
logging.info('Creating countries tables')
for statement in country_table_create_statements:
conn.execute(statement)
conn.commit()
logging.info('Creating admin tables')
for admin_level in xrange(1, 5):
create_table(conn, 'admin{}_codes'.format(admin_level))
populate_admin_table(conn, admin_level)
conn.close()
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--temp-dir',
default=tempfile.gettempdir(),
help='Temporary work directory')
parser.add_argument('-o', '--out',
default=DEFAULT_GEONAMES_DB_PATH,
help='SQLite3 db filename')
args = parser.parse_args()
create_geonames_sqlite_db(args.temp_dir, args.out)

View File

@@ -0,0 +1,9 @@
import os
this_dir = os.path.realpath(os.path.dirname(__file__))
GEONAMES_DB_NAME = 'geonames.db'
DEFAULT_GEONAMES_DB_PATH = os.path.join(this_dir, os.path.pardir,
os.path.pardir, os.path.pardir,
'data', 'geonames', GEONAMES_DB_NAME)

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,154 @@
#!/usr/bin/env bash
: '
create_geoplanet_db.sh
-------------------------
Shell script to download GeoPlanet and derive inputs
for address parser training set construction.
Usage: ./create_geoplanet_db.sh out_dir
'
if [ "$#" -ge 1 ]; then
OUT_DIR=$1
mkdir -p $OUT_DIR
else
OUT_DIR=$(pwd)
fi
GEOPLANET_ZIP_FILE="geoplanet_data_7.10.0.zip"
# Internet Archive URL
GEOPLANET_URL="https://archive.org/download/$GEOPLANET_ZIP_FILE/$GEOPLANET_ZIP_FILE"
GEOPLANET_ORIGINAL_PLACES_FILE="geoplanet_places_7.10.0.tsv"
GEOPLANET_ADMINS_FILE="geoplanet_admins_7.10.0.tsv"
GEOPLANET_ORIGINAL_ALIASES_FILE="geoplanet_aliases_7.10.0.tsv"
GEOPLANET_ALL_PLACES_FILE="geoplanet_all_places.tsv"
GEOPLANET_PLACES_FILE="geoplanet_places.tsv"
GEOPLANET_POSTAL_CODES_FILE="geoplanet_postal_codes.tsv"
GEOPLANET_ALIASES_FILE="geoplanet_aliases.tsv"
GEOPLANET_GEONAMES_CONCORDANCE_FILE="geonames-geoplanet-matches.csv"
GEOPLANET_GEONAMES_CONCORDANCE_URL="https://github.com/blackmad/geoplanet-concordance/raw/master/current/$GEOPLANET_GEONAMES_CONCORDANCE_FILE"
GEOPLANET_DB_FILE="geoplanet.db"
function download_file() {
echo "Downloading $1"
response=$(curl -sL -w "%{http_code}" $1 --retry 3 --retry-delay 5 -o $OUT_DIR/$2)
if [ "$response" -ne "200" ]; then
echo "Could not download $1"
exit 1
fi
}
if [ ! -f $OUT_DIR/$GEOPLANET_ZIP_FILE ]; then
echo "Downloading GeoPlanet"
download_file $GEOPLANET_URL $GEOPLANET_ZIP_FILE
fi
cd $OUT_DIR
echo "Unzipping GeoPlanet file"
unzip -o $GEOPLANET_ZIP_FILE
echo "Creating GeoPlanet postal codes file"
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Zip") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_POSTAL_CODES_FILE
echo "Creating GeoPlanet all places file"
tail -n+2 $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_ALL_PLACES_FILE
echo "Creating GeoPlanet places file"
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Continent" || $5 == "Country" || $5 == "Nationality" || $5 == "State" || $5 == "County" || $5 == "Town" || $5 == "LocalAdmin" || $5 == "Island" || $5 == "Suburb") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_PLACES_FILE
echo "Creating GeoPlanet aliases file"
tail -n+2 $GEOPLANET_ORIGINAL_ALIASES_FILE > $GEOPLANET_ALIASES_FILE
echo "Fetching GeoNames concordance"
download_file $GEOPLANET_GEONAMES_CONCORDANCE_URL $GEOPLANET_GEONAMES_CONCORDANCE_FILE
echo "Creating SQLite db"
echo "
DROP TABLE IF EXISTS places;
CREATE TABLE places (
id integer primary key,
country_code text,
name text,
language text,
place_type text,
parent_id integer
);
.separator \t
.import $OUT_DIR/$GEOPLANET_PLACES_FILE places
CREATE INDEX places_parent_id_index on places(parent_id);
CREATE INDEX places_country_code on places(country_code);
DROP TABLE IF EXISTS all_places;
CREATE TABLE all_places AS SELECT * FROM places WHERE 0;
.import $OUT_DIR/$GEOPLANET_ALL_PLACES_FILE all_places
DROP TABLE IF EXISTS postal_codes;
CREATE TABLE postal_codes (
id integer primary key,
country_code text,
name text,
language text,
place_type text,
parent_id integer
);
.import $OUT_DIR/$GEOPLANET_POSTAL_CODES_FILE postal_codes
CREATE INDEX postal_codes_parent_id_index on postal_codes(parent_id);
CREATE INDEX postal_codes_country_code on postal_codes(country_code);
DROP TABLE IF EXISTS admins;
CREATE TABLE admins (
id integer primary key,
country_code text,
state_id integer,
county_id integer,
local_admin_id integer,
country_id integer,
continent_id integer
);
.import $OUT_DIR/$GEOPLANET_ADMINS_FILE admins
CREATE INDEX admin_country_code on admins(country_code);
CREATE INDEX admin_state_id on admins(state_id);
CREATE INDEX admin_county_id on admins(county_id);
CREATE INDEX admin_local_admin_id on admins(local_admin_id);
CREATE INDEX admin_country_id on admins(country_id);
CREATE INDEX admin_continent_id on admins(continent_id);
DROP TABLE IF EXISTS aliases;
CREATE TABLE aliases (
id integer,
name text,
name_type text,
language text
);
.import $OUT_DIR/$GEOPLANET_ALIASES_FILE aliases
CREATE INDEX alias_id on aliases(id);
DROP TABLE IF EXISTS geonames_concordance;
CREATE TABLE geonames_concordance (
id integer primary key,
geonames_id integer,
name text,
lat number,
lon number
);
.mode csv
.import $OUT_DIR/$GEOPLANET_GEONAMES_CONCORDANCE_FILE geonames_concordance
CREATE INDEX geonames_concordance_geonames_id on geonames_concordance(geonames_id);
" | sqlite3 $OUT_DIR/$GEOPLANET_DB_FILE

View File

@@ -0,0 +1,353 @@
import argparse
import csv
import itertools
import os
import six
import sqlite3
import sys
from collections import defaultdict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.equivalence import equivalent
from geodata.address_expansions.gazetteers import *
from geodata.address_formatting.formatter import AddressFormatter
from geodata.countries.names import country_names
from geodata.postal_codes.validation import postcode_regexes
from geodata.names.normalization import name_affixes
from geodata.places.config import place_config
from geodata.csv_utils import tsv_string, unicode_csv_reader
GEOPLANET_DB_FILE = 'geoplanet.db'
GEOPLANET_FORMAT_DATA_TAGGED_FILENAME = 'geoplanet_formatted_addresses_tagged.tsv'
GEOPLANET_FORMAT_DATA_FILENAME = 'geoplanet_formatted_addresses.tsv'
class GeoPlanetFormatter(object):
# Map of GeoPlanet language codes to ISO-639 alpha2 language codes
language_codes = {
'ENG': 'en',
'JPN': 'ja',
'GER': 'de',
'SPA': 'es',
'FRE': 'fr',
'UNK': 'unk',
'ITA': 'it',
'POR': 'pt',
'POL': 'pl',
'ARA': 'ar',
'CZE': 'cs',
'SWE': 'sv',
'CHI': 'zh',
'RUM': 'ro',
'FIN': 'fi',
'DUT': 'nl',
'NOR': 'nb',
'DAN': 'da',
'HUN': 'hu',
'KOR': 'ko',
}
non_latin_script_languages = {
'JPN', # Japanese
'ARA', # Arabic
'CHI', # Chinese
'KOR', # Korean
}
ALIAS_PREFERRED = 'P'
ALIAS_PREFERRED_FOREIGN = 'Q'
ALIAS_VARIANT = 'V'
ALIAS_ABBREVIATED = 'A'
ALIAS_COLLOQUIAL = 'S'
# Map of GeoPlanet place types to address formatter types
place_types = {
'Continent': AddressFormatter.WORLD_REGION,
'Country': AddressFormatter.COUNTRY,
'CountryRegion': AddressFormatter.COUNTRY_REGION,
'State': AddressFormatter.STATE,
'County': AddressFormatter.STATE_DISTRICT,
'Island': AddressFormatter.ISLAND,
'Town': AddressFormatter.CITY,
# Note: if we do general place queries from GeoPlanet, this
# may have to be mapped more carefully
'LocalAdmin': AddressFormatter.CITY_DISTRICT,
'Suburb': AddressFormatter.SUBURB,
}
def __init__(self, geoplanet_db):
self.db = sqlite3.connect(geoplanet_db)
# These aren't too large and it's easier to have them in memory
self.places = {row[0]: row[1:] for row in self.db.execute('select * from places')}
self.aliases = defaultdict(list)
self.coterminous_admins = {}
self.admins_with_ambiguous_city = set()
print('Doing admin ambiguities')
for row in self.db.execute('''select p.id,
(select count(*) from places where parent_id = p.id) as num_places,
(select count(*) from places where parent_id = p.id and place_type = "Town") as num_towns,
p2.id
from places p
join places p2
on p2.parent_id = p.id
and p.name = p2.name
and p.place_type != "Town"
and p2.place_type = "Town"
group by p.id'''):
place_id, num_places, num_towns, coterminous_town_id = row
num_places = int(num_places)
num_towns = int(num_towns)
if num_places == 1 and num_towns == 1:
self.coterminous_admins[place_id] = coterminous_town_id
self.admins_with_ambiguous_city.add(place_id)
print('num coterminous: {}'.format(len(self.coterminous_admins)))
print('num ambiguous: {}'.format(len(self.admins_with_ambiguous_city)))
print('Doing aliases')
for row in self.db.execute('''select a.* from aliases a
left join places p
on a.id = p.id
and p.place_type in ("State", "County")
and a.language != p.language
where name_type != "S" -- no colloquial aliases like "The Big Apple"
and name_type != "V" -- variants can often be demonyms like "Welsh" or "English" for UK
and p.id is NULL -- exclude foreign-language states/county names
order by id, language,
case name_type
when "P" then 1
when "Q" then 2
when "V" then 3
when "A" then 4
when "S" then 5
else 6
end'''):
place = self.places.get(row[0])
if not place:
continue
self.aliases[row[0]].append(row[1:])
print('Doing variant aliases')
variant_aliases = 0
for i, row in enumerate(self.db.execute('''select a.*, p.name, p.country_code from aliases a
join places p using(id)
where a.name_type = "V"
and a.language = p.language''')):
place_name, country_code = row[-2:]
country = country_code.lower()
row = row[:-2]
place_id, alias, name_type, language = row
language = self.language_codes[language]
if language != 'unk':
alias_sans_affixes = name_affixes.replace_affixes(alias, language, country=country)
if alias_sans_affixes:
alias = alias_sans_affixes
place_name_sans_affixes = name_affixes.replace_affixes(place_name, language, country=country)
if place_name_sans_affixes:
place_name = place_name_sans_affixes
else:
language = None
if equivalent(place_name, alias, toponym_abbreviations_gazetteer, language):
self.aliases[row[0]].append(row[1:])
variant_aliases += 1
if i % 10000 == 0 and i > 0:
print('tested {} variant aliases with {} positives'.format(i, variant_aliases))
self.aliases = dict(self.aliases)
self.formatter = AddressFormatter()
def get_place_hierarchy(self, place_id):
all_places = []
original_place_id = place_id
place = self.places[place_id]
all_places.append((place_id, ) + place)
place_id = place[-1]
while place_id != 1 and place_id != original_place_id:
place = self.places[place_id]
all_places.append((place_id,) + place)
place_id = place[-1]
return all_places
def get_aliases(self, place_id):
return self.aliases.get(place_id, [])
def cleanup_name(self, name):
return name.strip(' ,-')
def format_postal_codes(self, tag_components=True):
all_postal_codes = self.db.execute('select * from postal_codes')
for postal_code_id, country, postal_code, language, place_type, parent_id in all_postal_codes:
country = country.lower()
postcode_language = language
language = self.language_codes[language]
if len(postal_code) <= 3:
postcode_regex = postcode_regexes.get(country)
valid_postcode = False
if postcode_regex:
match = postcode_regex.match(postal_code)
if match and match.end() == len(postal_code):
valid_postcode = True
if not valid_postcode:
continue
# If the county/state is coterminous with a city and contains only one place,
# set the parent_id to the city instead
if parent_id in self.coterminous_admins:
parent_id = self.coterminous_admins[parent_id]
place_hierarchy = self.get_place_hierarchy(parent_id)
containing_places = defaultdict(set)
language_places = {None: containing_places}
original_language = language
have_default_language = False
if place_hierarchy:
base_place_id, _, _, _, base_place_type, _ = place_hierarchy[0]
base_place_type = self.place_types[base_place_type]
else:
base_place_id = None
base_place_type = None
place_types_seen = set()
for place_id, country, name, lang, place_type, parent in place_hierarchy:
country = country.lower()
# First language
if not have_default_language and lang != postcode_language:
language = self.language_codes[lang]
have_default_language = True
place_type = self.place_types[place_type]
if AddressFormatter.CITY not in place_types_seen and place_id in self.admins_with_ambiguous_city:
continue
name = self.cleanup_name(name)
containing_places[place_type].add(name)
aliases = self.get_aliases(place_id)
for name, name_type, alias_lang in aliases:
if not alias_lang:
alias_lang = 'UNK'
if alias_lang == lang and lang != 'UNK':
alias_language = None
else:
alias_language = self.language_codes[alias_lang]
language_places.setdefault(alias_language, defaultdict(set))
lang_places = language_places[alias_language]
name = self.cleanup_name(name)
lang_places[place_type].add(name)
place_types_seen.add(place_type)
default_city_names = set([name.lower() for name in language_places.get(None, {}).get(AddressFormatter.CITY, [])])
for language, containing_places in six.iteritems(language_places):
if language is None:
language = original_language
country_localized_name = country_names.localized_name(country, language)
if country_localized_name:
containing_places[AddressFormatter.COUNTRY].add(country_localized_name)
country_alpha3_code = country_names.alpha3_code(country)
if country_alpha3_code and language in (None, 'ENG'):
containing_places[AddressFormatter.COUNTRY].add(country_alpha3_code)
keys = containing_places.keys()
all_values = containing_places.values()
keys_set = set(keys)
for i, values in enumerate(itertools.product(*all_values)):
components = {
AddressFormatter.POSTCODE: postal_code
}
if not default_city_names:
components.update(zip(keys, values))
else:
for k, v in zip(keys, values):
if k == AddressFormatter.CITY or AddressFormatter.CITY in keys_set or v.lower() not in default_city_names:
components[k] = v
format_language = language if self.formatter.template_language_matters(country, language) else None
formatted = self.formatter.format_address(components, country, language=format_language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
component_keys = set(components)
components = place_config.dropout_components(components, (), country=country, population=0)
if len(components) > 1 and set(components) ^ component_keys:
formatted = self.formatter.format_address(components, country, language=format_language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
def build_training_data(self, out_dir, tag_components=True):
if tag_components:
formatted_tagged_file = open(os.path.join(out_dir, GEOPLANET_FORMAT_DATA_TAGGED_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
else:
formatted_tagged_file = open(os.path.join(out_dir, GEOPLANET_FORMAT_DATA_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
i = 0
for language, country, formatted_address in self.format_postal_codes(tag_components=tag_components):
if not formatted_address or not formatted_address.strip():
continue
formatted_address = tsv_string(formatted_address)
if not formatted_address or not formatted_address.strip():
continue
if tag_components:
row = (language, country, formatted_address)
else:
row = (formatted_address,)
writer.writerow(row)
i += 1
if i % 1000 == 0 and i > 0:
print('did {} formatted addresses'.format(i))
if __name__ == '__main__':
if len(sys.argv) < 3:
sys.exit('Usage: python geoplanet_training_data.py geoplanet_db_path out_dir')
geoplanet_db_path = sys.argv[1]
out_dir = sys.argv[2]
geoplanet = GeoPlanetFormatter(geoplanet_db_path)
geoplanet.build_training_data(out_dir)

View File

View File

@@ -0,0 +1,41 @@
VISIT, VISIT_EDGE, POST_VISIT = range(3)
def strongly_connected_components(graph):
'''
Find strongly connected components in a graph using iterative
depth-first search.
Based on:
http://code.activestate.com/recipes/578507-strongly-connected-components-of-a-directed-graph/
'''
identified = set()
stack = []
index = {}
boundaries = []
for v in graph:
if v not in index:
todo = [(VISIT, v)]
while todo:
op, v = todo.pop()
if op == VISIT:
index[v] = len(stack)
stack.append(v)
boundaries.append(index[v])
todo.append((POST_VISIT, v))
todo.extend([(VISIT_EDGE, w) for w in graph[v]])
elif op == VISIT_EDGE:
if v not in index:
todo.append((VISIT, v))
elif v not in identified:
while index[v] < boundaries[-1]:
boundaries.pop()
else:
# op == POST_VISIT
if boundaries[-1] == index[v]:
boundaries.pop()
scc = stack[index[v]:]
del stack[index[v]:]
identified.update(scc)
yield scc
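# A small usage sketch (illustrative only): two 2-cycles plus an isolated node.
#   >>> graph = {1: [2], 2: [1, 3], 3: [4], 4: [3], 5: []}
#   >>> sorted(sorted(component) for component in strongly_connected_components(graph))
#   [[1, 2], [3, 4], [5]]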

View File

@@ -0,0 +1,32 @@
def topsort(graph):
'''
Topological sort for a dependency graph, e.g.
Usage:
>>> graph = {
'a': ['b'],
'b': ['d'],
'c': ['d', 'a'],
'd': [],
}
>>> topsort(graph)
Returns: ['d', 'b', 'a', 'c']
'''
todos = set(graph.keys())
seen = set()
result = []
while todos:
for key in todos:
deps = graph[key]
if len([d for d in deps if d in seen]) == len(deps):
break
else:
raise Exception('Cycle: {}'.format(todos))
todos.remove(key)
result.append(key)
seen.add(key)
return result

View File

View File

@@ -0,0 +1,139 @@
import argparse
import csv
import os
import requests
from collections import Counter
from cStringIO import StringIO
from lxml import etree
from unicode_paths import CLDR_DIR
this_dir = os.path.realpath(os.path.dirname(__file__))
DEFAULT_LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'language', 'countries')
CLDR_SUPPLEMENTAL_DATA = os.path.join(CLDR_DIR, 'common', 'supplemental',
'supplementalData.xml')
ISO_639_3 = 'http://www-01.sil.org/iso639-3/iso-639-3.tab'
ISO_MACROLANGUAGES = 'http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab'
ISO_LANGUAGES_FILENAME = 'iso_languages.tsv'
MACROLANGUAGES_FILENAME = 'iso_macrolanguages.tsv'
COUNTRY_LANGUAGES_FILENAME = 'country_language.tsv'
SCRIPT_LANGUAGES_FILENAME = 'script_languages.tsv'
REGIONAL = 'official_regional'
UNKNOWN_COUNTRY = 'zz'
UNKNOWN_LANGUAGES = ('und', 'zxx')
def write_country_official_languages_file(xml, out_dir):
lang_file = open(os.path.join(out_dir, COUNTRY_LANGUAGES_FILENAME), 'w')
lang_writer = csv.writer(lang_file, delimiter='\t')
def get_population_pct(lang):
return int(lang.attrib.get('populationPercent', 0))
lang_scripts = {}
for lang in xml.xpath('//languageData/language'):
language_code = lang.attrib['type'].lower()
scripts = lang.get('scripts')
if not scripts:
continue
territories = lang.get('territories')
if (language_code, None) not in lang_scripts:
lang_scripts[(language_code, None)] = scripts
if not territories:
continue
for territory in territories.strip().split():
lang_scripts[(language_code, territory.lower())] = scripts
for territory in xml.xpath('//territoryInfo/territory'):
country_code = territory.attrib['type'].lower()
if country_code == UNKNOWN_COUNTRY:
continue
langs = territory.xpath('languagePopulation')
languages = Counter()
official = set()
regional = set()
for lang in langs:
language = lang.attrib['type'].lower().split('_')[0]
official_status = lang.attrib.get('officialStatus')
languages[language] += float(lang.attrib['populationPercent'])
if official_status and official_status != REGIONAL:
official.add(language)
elif official_status == REGIONAL:
regional.add(language)
if official:
languages = Counter({l: c for l, c in languages.iteritems()
if l in official or l in regional})
else:
languages = Counter({l: c for l, c in languages.most_common(1)})
for lang, pct in languages.most_common():
if lang in UNKNOWN_LANGUAGES:
continue
script = lang_scripts.get((lang, country_code), lang_scripts.get((lang, None), ''))
lang_writer.writerow((country_code, lang, script.replace(' ', ','),
str(min(pct, 100.0)), str(int(lang in official))))
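# A row of the resulting country_language.tsv might look like (hypothetical values):
#   fr	fr	Latn	95.0	1
# i.e. country_code, language, comma-separated scripts, population percent, is_official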
RETIRED = 'R'
INDIVIDUAL = 'I'
MACRO = 'M'
LIVING = 'L'
def write_languages_file(langs, macro, out_dir):
lang_file = open(os.path.join(out_dir, 'iso_languages.tsv'), 'w')
writer = csv.writer(lang_file, delimiter='\t')
writer.writerow(('ISO 639-3', 'ISO 639-2B', 'ISO 639-2T',
'ISO 639-1', 'type', 'macro'))
macro_reader = csv.reader(StringIO(macro), delimiter='\t')
headers = macro_reader.next()
assert len(headers) == 3
macros = {minor_code: macro_code for (macro_code, minor_code, status)
in macro_reader if status != RETIRED}
lang_reader = csv.reader(StringIO(langs), delimiter='\t')
headers = lang_reader.next()
assert headers[:6] == ['Id', 'Part2B', 'Part2T',
'Part1', 'Scope', 'Language_Type']
for line in lang_reader:
iso639_3, iso639_2b, iso639_2t, iso639_1, scope, lang_type = line[:6]
macro = macros.get(iso639_3, '')
# Only living languages that are either individual or macro
if scope in (INDIVIDUAL, MACRO) and lang_type == LIVING:
writer.writerow((iso639_3, iso639_2b, iso639_2t,
iso639_1, scope, macro))
def fetch_cldr_languages(out_dir=DEFAULT_LANGUAGES_DIR):
response = requests.get(ISO_639_3)
langs = response.content
response = requests.get(ISO_MACROLANGUAGES)
macro = response.content
write_languages_file(langs, macro, out_dir)
supplemental = open(CLDR_SUPPLEMENTAL_DATA)
xml = etree.parse(supplemental)
write_country_official_languages_file(xml, out_dir)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--out',
default=DEFAULT_LANGUAGES_DIR,
help='Out directory')
args = parser.parse_args()
fetch_cldr_languages(args.out)

View File

@@ -0,0 +1,30 @@
import os
import shutil
import subprocess
import sys
import tempfile
from unicode_paths import CLDR_DIR
from geodata.file_utils import ensure_dir
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
CLDR_URL = 'http://www.unicode.org/Public/cldr/latest/core.zip'
def download_cldr(temp_dir=None):
if os.path.exists(CLDR_DIR):
shutil.rmtree(CLDR_DIR)
ensure_dir(CLDR_DIR)
if not temp_dir:
temp_dir = tempfile.gettempdir()
cldr_filename = os.path.join(temp_dir, CLDR_URL.rsplit('/', 1)[-1])
subprocess.check_call(['wget', CLDR_URL, '-O', cldr_filename])
subprocess.check_call(['unzip', cldr_filename, '-d', CLDR_DIR])
if __name__ == '__main__':
download_cldr(*sys.argv[1:])

View File

@@ -0,0 +1,37 @@
import re
import requests
import six.moves.urllib_parse as urlparse
import ujson
requests.models.json = ujson
GOOGLE_I18N_API = 'http://i18napis.appspot.com'
GOOGLE_ADDRESS_DATA_API = urlparse.urljoin(GOOGLE_I18N_API, 'address/data/')
class GoogleI18N(object):
'''
Fetches data from e.g. http://i18napis.appspot.com/address/data/GB
and caches it in a dictionary for each country. These requests are
lightweight, so for a given run of a program, max 250 requests
will be made.
'''
def __init__(self):
self.responses = {}
def get(self, country_code):
ret = self.responses.get(country_code.lower())
if ret is None:
url = urlparse.urljoin(GOOGLE_ADDRESS_DATA_API, country_code.upper())
response = requests.get(url)
if response.ok:
ret = response.json()
self.responses[country_code.lower()] = ret
else:
self.responses[country_code.lower()] = {}
return ret
google_i18n = GoogleI18N()
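# Usage sketch (field names follow the public i18napis address-data schema and may change):
#   >>> data = google_i18n.get('gb')
#   >>> data.get('key')   # e.g. u'GB'
#   >>> data.get('fmt')   # address format string, if provided for the country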

View File

@@ -0,0 +1,86 @@
import os
import csv
import sys
from collections import defaultdict, OrderedDict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.csv_utils import unicode_csv_reader
LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'language')
country_languages = defaultdict(OrderedDict)
# Only official and de facto official, no official_regional
official_languages = defaultdict(OrderedDict)
regional_languages = defaultdict(OrderedDict)
road_language_overrides = defaultdict(OrderedDict)
languages = set()
all_languages = languages
osm_admin1_ids = set()
languages_initialized = False
def init_languages(languages_dir=LANGUAGES_DIR):
global languages_initialized
if languages_initialized:
return
path = os.path.join(languages_dir, 'countries', 'country_language.tsv')
if not os.path.exists(path):
raise ValueError('File does not exist: {}'.format(path))
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
country_languages[country][lang] = int(is_official)
languages.add(lang)
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
if int(is_official) or len(country_languages[country]) == 1:
official_languages[country][lang] = 1
path = os.path.join(languages_dir, 'countries', 'road_sign_languages.tsv')
for country, lang, default in csv.reader(open(path), delimiter='\t'):
road_language_overrides[country][lang] = int(default)
if lang not in languages:
languages.add(lang)
path = os.path.join(languages_dir, 'regional', 'adm1.tsv')
for country, key, value, langs, default in unicode_csv_reader(open(path), delimiter='\t'):
if key == 'osm':
osm_admin1_ids.add(tuple(value.split(':')))
for lang in langs.split(','):
regional_languages[(country, key, value)][lang] = int(default)
if lang not in country_languages[country]:
country_languages[country][lang] = 0
if lang not in languages:
languages.add(lang)
languages_initialized = True
init_languages()
def get_country_languages(country, official=True, overrides=True):
if official:
languages = official_languages[country]
else:
languages = country_languages[country]
if overrides:
road_overrides = road_language_overrides.get(country)
if road_overrides and road_overrides.values()[0]:
languages = road_overrides
elif road_overrides:
languages.update(road_overrides)
return languages
def get_regional_languages(country, key, value):
return regional_languages.get((country, key, value), OrderedDict())
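# Usage sketch (actual values depend on the TSV resources under resources/language):
#   >>> get_country_languages('fr')
#   OrderedDict([(u'fr', 1)])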

View File

@@ -0,0 +1,5 @@
import unicodedata
def strip_accents(s):
return u''.join([c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'])
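# e.g. strip_accents(u'Montr\xe9al') == u'Montreal' (combining marks are dropped after NFD)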

View File

@@ -0,0 +1,37 @@
import re
import os
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_decode
class Scanner(object):
'''
Simple scanner implementation in Python using regular expression groups.
Used to create dynamic lexicons for parsing various CLDR files
without compiling a C scanner. Only C scanners are used at runtime
'''
def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
self.lexicon = lexicon
regexes, responses = zip(*lexicon)
self.regex = re.compile(u'|'.join([u'({})'.format(safe_decode(r)) for r in regexes]), flags)
self.responses = responses
def scan(self, s):
for match in self.regex.finditer(safe_decode(s)):
i = match.lastindex
response = self.responses[i - 1]
token = match.group(i)
if not callable(response):
yield (token, response)
else:
responses = response(match, token)
if responses is not None:
for response, token in responses:
yield (token, response)
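# A minimal usage sketch (the lexicon below is illustrative, not one used by the repo):
#   >>> scanner = Scanner([('[0-9]+', 'NUM'), ('[a-z]+', 'WORD')])
#   >>> list(scanner.scan('abc 123'))
#   [(u'abc', 'WORD'), (u'123', 'NUM')]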

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,273 @@
'''
unicode_data.py
---------------
Python's unicodedata module is built against an outdated spec (Unicode 5.2). Since
Unicode categories are used e.g. in tokenization, we'd like to keep this
as up-to-date as possible with the latest standard.
'''
import csv
import os
import sys
from collections import defaultdict, namedtuple
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.file_utils import download_file
from geodata.string_utils import wide_unichr, wide_ord
from unicode_properties import *
from unicode_paths import UNICODE_DATA_DIR
UNIDATA_URL = 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'
UNIDATA_DIR = os.path.join(UNICODE_DATA_DIR, 'unidata')
LOCAL_UNIDATA_FILE = os.path.join(UNIDATA_DIR, 'UnicodeData.txt')
unicode_categories = defaultdict(list)
unicode_blocks = defaultdict(list)
unicode_combining_classes = defaultdict(list)
unicode_general_categories = defaultdict(list)
unicode_scripts = defaultdict(list)
unicode_properties = {}
unicode_script_ids = {}
unicode_blocks = {}
unicode_category_aliases = {}
unicode_property_aliases = {}
unicode_property_value_aliases = {}
unicode_word_breaks = {}
# Ref: ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
UNIDATA_FIELDS = [
'code',
'name',
'category',
'combining',
'bidi_category',
'decomp_mapping',
'decimal_value',
'digit_value',
'numeric_value',
'mirrored',
'unicode_1_name',
'comment',
'upper_mapping',
'lower_mapping',
'title_mapping',
]
UnicodeDataRow = namedtuple('UnicodeDataRow', ','.join(UNIDATA_FIELDS))
def parse_unicode_data():
'''
Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS
'''
if not os.path.exists(LOCAL_UNIDATA_FILE):
download_file(UNIDATA_URL, LOCAL_UNIDATA_FILE)
unidata_file = open(LOCAL_UNIDATA_FILE)
for line in csv.reader(unidata_file, delimiter=';'):
yield UnicodeDataRow(*line)
def iter_unicode_combining_classes():
return unicode_combining_classes.iteritems()
def iter_unicode_categories():
return unicode_categories.iteritems()
def get_unicode_category(cat):
return unicode_categories[cat]
def get_unicode_combining_class(c):
return unicode_combining_classes[c]
def get_unicode_categories():
'''
Build dict of unicode categories e.g.
{
'Lu': ['A', 'B', 'C', ...]
'Ll': ['a', 'b', 'c', ...]
}
'''
categories = defaultdict(list)
for row in parse_unicode_data():
categories[row.category].append(wide_unichr(unicode_to_integer(row.code)))
return dict(categories)
def get_unicode_combining_classes():
'''
Build dict of unicode combining classes e.g.
{
'0': ['\x00', '\x01', '\x02', ...]
}
'''
combining_classes = defaultdict(list)
for row in parse_unicode_data():
combining_classes[int(row.combining)].append(wide_unichr(unicode_to_integer(row.code)))
return dict(combining_classes)
unicode_category_aliases = {
'letter': 'L',
'lower': 'Ll',
'lowercase': 'Ll',
'lowercaseletter': 'Ll',
'upper': 'Lu',
'uppercase': 'Lu',
'uppercaseletter': 'Lu',
'title': 'Lt',
'nonspacing mark': 'Mn',
'mark': 'M',
}
COMBINING_CLASS_PROP = 'canonical_combining_class'
BLOCK_PROP = 'block'
GENERAL_CATEGORY_PROP = 'general_category'
SCRIPT_PROP = 'script'
WORD_BREAK_PROP = 'word_break'
def init_unicode_categories():
'''
Initialize module-level dictionaries
'''
global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks
unicode_categories.update(get_unicode_categories())
unicode_combining_classes.update(get_unicode_combining_classes())
for key in unicode_categories.keys():
unicode_general_categories[key[0]].extend(unicode_categories[key])
script_chars = get_chars_by_script()
for i, script in enumerate(script_chars):
if script:
unicode_scripts[script.lower()].append(wide_unichr(i))
unicode_scripts = dict(unicode_scripts)
unicode_script_ids.update(build_master_scripts_list(script_chars))
unicode_blocks.update(get_unicode_blocks())
unicode_properties.update(get_unicode_properties())
unicode_property_aliases.update(get_property_aliases())
unicode_word_breaks.update(get_word_break_properties())
for key, value in get_property_value_aliases().iteritems():
key = unicode_property_aliases.get(key, key)
if key == GENERAL_CATEGORY_PROP:
for k, v in value.iteritems():
k = k.lower()
unicode_category_aliases[k] = v
if '_' in k:
unicode_category_aliases[k.replace('_', '')] = v
unicode_property_value_aliases[key] = value
regex_chars = re.compile('([\[\]\{\}\-\^])')
def replace_regex_chars(s):
return regex_chars.sub(r'\\\1', s)
def format_regex_char(i):
c = wide_unichr(i)
return replace_regex_chars(c.encode('unicode-escape'))
def make_char_set_regex(chars):
'''
Build a regex character set from a list of characters
'''
group_start = None
group_end = None
last_ord = -2
ords = map(wide_ord, chars)
ords.sort()
ords.append(None)
groups = []
for i, o in enumerate(ords):
if o is not None and o == last_ord + 1:
group_end = o
elif group_start is not None and group_end is not None:
groups.append('-'.join((format_regex_char(group_start), format_regex_char(group_end))))
group_end = None
group_start = o
elif group_start is not None and group_end is None:
groups.append(format_regex_char(group_start))
group_start = o
else:
group_start = o
last_ord = o
return u'[{}]'.format(u''.join(groups))
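# A quick sanity check with a contiguous run plus an isolated character (illustrative only):
#   >>> make_char_set_regex([u'a', u'b', u'c', u'x'])
#   u'[a-cx]'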
name_category = [
('control_chars', 'Cc'),
('other_format_chars', 'Cf'),
('other_not_assigned_chars', 'Cn'),
('other_private_use_chars', 'Co'),
('other_surrogate_chars', 'Cs'),
('letter_lower_chars', 'Ll'),
('letter_modifier_chars', 'Lm'),
('letter_other_chars', 'Lo'),
('letter_title_chars', 'Lt'),
('letter_upper_chars', 'Lu'),
('mark_spacing_combining_chars', 'Mc'),
('mark_enclosing_chars', 'Me'),
('mark_nonspacing_chars', 'Mn'),
('number_or_digit_chars', 'Nd'),
('number_letter_chars', 'Nl'),
('number_other_chars', 'No'),
('punct_connector_chars', 'Pc'),
('punct_dash_chars', 'Pd'),
('punct_close_chars', 'Pe'),
('punct_final_quote_chars', 'Pf'),
('punct_initial_quote_chars', 'Pi'),
('punct_other_chars', 'Po'),
('punct_open_chars', 'Ps'),
('currency_symbol_chars', 'Sc'),
('symbol_modifier_chars', 'Sk'),
('symbol_math_chars', 'Sm'),
('symbol_other_chars', 'So'),
('separator_line_chars', 'Zl'),
('separator_paragraph_chars', 'Zp'),
('space', 'Zs'),
]
def main():
init_unicode_categories()
for name, cat in name_category:
if cat not in unicode_categories:
continue
chars = unicode_categories[cat]
print u'{} = {};'.format(name, make_char_set_regex(chars))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,11 @@
import os
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources')
UNICODE_DATA_DIR = os.path.join(DATA_DIR, 'unicode')
CLDR_DIR = os.path.join(UNICODE_DATA_DIR, 'cldr')

View File

@@ -0,0 +1,463 @@
'''
scripts.py
This code uses the latest copy of Scripts.txt from unicode.org
to generate a C file (and header) defining which script every character
belongs to.
'''
import csv
import os
import requests
import re
import sys
import tempfile
import requests
import subprocess
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
from itertools import islice
from lxml import etree
from operator import itemgetter
from zipfile import ZipFile
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr
from cldr_languages import *
from download_cldr import download_cldr
from languages import get_country_languages
from unicode_paths import UNICODE_DATA_DIR
from word_breaks import script_regex, regex_char_range
SRC_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src')
SCRIPTS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'scripts')
LOCAL_SCRIPTS_FILE = os.path.join(SCRIPTS_DATA_DIR, 'Scripts.txt')
LOCAL_ISO_15924_FILE = os.path.join(SCRIPTS_DATA_DIR, 'iso15924.txt')
BLOCKS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'blocks')
LOCAL_BLOCKS_FILE = os.path.join(BLOCKS_DATA_DIR, 'Blocks.txt')
PROPS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'props')
LOCAL_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'PropList.txt')
LOCAL_PROP_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyAliases.txt')
LOCAL_PROP_VALUE_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyValueAliases.txt')
LOCAL_DERIVED_CORE_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'DerivedCoreProperties.txt')
WORD_BREAKS_DIR = os.path.join(UNICODE_DATA_DIR, 'word_breaks')
LOCAL_WORD_BREAKS_FILE = os.path.join(WORD_BREAKS_DIR, 'WordBreakProperty.txt')
SCRIPTS_HEADER = 'unicode_script_types.h'
SCRIPTS_DATA_FILENAME = 'unicode_scripts_data.c'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
BLOCKS_URL = 'http://unicode.org/Public/UNIDATA/Blocks.txt'
PROPS_URL = 'http://unicode.org/Public/UNIDATA/PropList.txt'
PROP_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyAliases.txt'
PROP_VALUE_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyValueAliases.txt'
DERIVED_CORE_PROPS_URL = 'http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt'
WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt'
ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'
scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
#define UNICODE_SCRIPT_TYPES_H
#include <stdlib.h>
#define NUM_CODEPOINTS {num_codepoints}
#define MAX_LANGS {max_langs}
typedef enum {{
{script_enum}
NUM_SCRIPTS
}} script_t;
#endif
'''
scripts_c_data_template = u'''
script_t char_scripts[] = {{
{char_scripts}
}};
script_code_t script_codes[] = {{
{script_codes}
}};
script_languages_t script_languages[] = {{
{script_languages}
}};
'''
script_code_template = '{{SCRIPT_{name}, "{code}"}}'
script_language_template = '{{{num_langs}, {languages}}}'
def unicode_to_integer(u):
return int('0x{}'.format(u), 16)
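# e.g. unicode_to_integer('005A') == 90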
def script_name_constant(i, u):
return u'SCRIPT_{} = {}'.format(u.upper(), i)
UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
def parse_char_range(r):
return [unicode_to_integer(u) for u in r.split('..')]
def get_chars_by_script():
scripts_file = open(LOCAL_SCRIPTS_FILE)
scripts = [None] * NUM_CODEPOINTS
# Lines look like:
# 0041..005A ; Latin # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
for char_range, script, char_class in script_regex.findall(scripts_file.read()):
script_range = parse_char_range(char_range)
if len(script_range) == 2:
for i in xrange(script_range[0], script_range[1] + 1):
scripts[i] = script
elif script_range:
scripts[script_range[0]] = script
return scripts
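# e.g. get_chars_by_script()[0x0041] == 'Latin' (assuming Scripts.txt has been downloaded)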
COMMENT_CHAR = '#'
DELIMITER_CHAR = ';'
def parse_file(f):
for line in f:
line = line.split(COMMENT_CHAR)[0].strip()
if not line:
continue
tokens = line.split(DELIMITER_CHAR)
if tokens:
yield [t.strip() for t in tokens]
def get_property_aliases():
prop_aliases_file = open(LOCAL_PROP_ALIASES_FILE)
aliases = {}
for line in parse_file(prop_aliases_file):
prop = line[1]
prop_aliases = [line[0]] + line[2:]
for alias in prop_aliases:
aliases[alias.lower()] = prop.lower()
return aliases
def get_property_value_aliases():
prop_value_aliases_file = open(LOCAL_PROP_VALUE_ALIASES_FILE)
value_aliases = defaultdict(dict)
for line in parse_file(prop_value_aliases_file):
prop = line[0]
if prop not in ('ccc', 'gc'):
value = line[2]
aliases = [line[1]] + line[3:]
else:
value = line[1]
aliases = line[2:]
for alias in aliases:
value_aliases[prop.lower()][alias] = value
return dict(value_aliases)
def get_unicode_blocks():
blocks_file = open(LOCAL_BLOCKS_FILE)
blocks = defaultdict(list)
for line in parse_file(blocks_file):
char_range, block = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
blocks[block.lower()].append(wide_unichr(i))
elif char_range:
blocks[block.lower()].append(wide_unichr(char_range[0]))
return dict(blocks)
def get_unicode_properties():
props_file = open(LOCAL_PROPS_FILE)
props = defaultdict(list)
for line in parse_file(props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(wide_unichr(char_range[0]))
derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE)
for line in parse_file(derived_props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(wide_unichr(char_range[0]))
return dict(props)
def get_word_break_properties():
props_file = open(LOCAL_WORD_BREAKS_FILE)
props = defaultdict(list)
for line in parse_file(props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop].append(wide_unichr(i))
elif char_range:
props[prop].append(wide_unichr(char_range[0]))
return dict(props)
def build_master_scripts_list(chars):
all_scripts = OrderedDict.fromkeys(filter(bool, chars))
for i, script in enumerate(all_scripts.keys()):
all_scripts[script] = i + 1
# Unknown script for all characters not covered
all_scripts[UNKNOWN_SCRIPT] = 0
return all_scripts
SCRIPT_ALIASES_SUPPLEMENTAL = {
'Hant': 'Han',
'Hans': 'Han'
}
def get_script_codes(all_scripts):
if not os.path.exists(LOCAL_ISO_15924_FILE):
temp_dir = tempfile.gettempdir()
script_codes_filename = os.path.join(temp_dir, ISO_15924_URL.rsplit('/')[-1])
# This comes as a .zip
script_codes_response = requests.get(ISO_15924_URL)
zf = ZipFile(StringIO(script_codes_response.content))
iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]
# Strip out the comments, etc.
temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n')
if line.strip() and not line.strip().startswith('#')])
f = open(LOCAL_ISO_15924_FILE, 'w')
f.write(safe_encode(temp_iso15924_file))
f.close()
script_codes_file = open(LOCAL_ISO_15924_FILE)
script_codes = {}
seen_scripts = set()
# Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
if name in all_scripts:
script_codes[code] = name
seen_scripts.add(name)
else:
normalized_name = name.split('(')[0].strip()
if normalized_name in all_scripts and normalized_name not in seen_scripts:
script_codes[code] = normalized_name
seen_scripts.add(normalized_name)
value_aliases = get_property_value_aliases()
script_aliases = value_aliases['sc']
for code, script in script_aliases.iteritems():
if code not in script_codes and script in all_scripts:
script_codes[code] = script
script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)
return script_codes
SCRIPT_CODE_ALIASES = {
'Jpan': ['Hani', 'Hira', 'Kana'],
'Kore': ['Hang', 'Han']
}
def extract_language_scripts(xml):
language_scripts = defaultdict(list)
for lang in xml.xpath('//languageData/language'):
language_code = lang.attrib['type'].lower()
scripts = lang.get('scripts')
if not scripts:
continue
for script in scripts.split():
script_aliases = SCRIPT_CODE_ALIASES.get(script)
if not script_aliases:
language_scripts[language_code].append(script)
else:
language_scripts[language_code].extend(script_aliases)
return language_scripts
def batch_iter(iterable, batch_size):
source_iter = iter(iterable)
while True:
batch = list(islice(source_iter, batch_size))
if len(batch) > 0:
yield batch
else:
return
def get_script_languages():
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
# to identify the language. We keep track of those single language scripts to inform
# the language classifier
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
cldr_xml = etree.parse(cldr_supplemental_data)
language_scripts = extract_language_scripts(cldr_xml)
country_languages_path = os.path.join(DEFAULT_LANGUAGES_DIR, COUNTRY_LANGUAGES_FILENAME)
if not os.path.exists(country_languages_path):
fetch_cldr_languages(DEFAULT_LANGUAGES_DIR)
country_language_file = open(country_languages_path)
country_language_reader = csv.reader(country_language_file, delimiter='\t')
countries = set([country for country, lang, script, pct, is_official
in country_language_reader])
spoken_languages = set.union(*(set(get_country_languages(country)) for country in countries))
script_code_languages = defaultdict(list)
for language, scripts in language_scripts.iteritems():
if language not in spoken_languages:
continue
for script in scripts:
script_code_languages[script].append(language)
script_languages = defaultdict(list)
for script_code, script_name in script_codes.iteritems():
langs = script_code_languages.get(script_code, [])
script_languages[script_name].extend(langs)
for name in all_scripts.iterkeys():
script_languages.setdefault(name, [])
return script_languages
def main(out_dir=SRC_DIR):
# Output is a C header and data file, see templates
out_file = open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w')
out_header = open(os.path.join(out_dir, SCRIPTS_HEADER), 'w')
download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
download_file(PROPS_URL, LOCAL_PROPS_FILE)
download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
download_cldr()
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
script_languages = get_script_languages()
max_langs = 0
for script, langs in script_languages.iteritems():
num_langs = len(langs)
if num_langs > max_langs:
max_langs = num_langs
# Generate C header and constants
script_enum = u'''
'''.join(['SCRIPT_{} = {},'.format(s.upper(), i) for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))])
out_header.write(scripts_header_template.format(num_codepoints=NUM_CODEPOINTS,
max_langs=max_langs,
script_enum=script_enum))
out_header.close()
# Generate C data file
char_scripts_data = u''',
'''.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch]) for batch in batch_iter(chars, 25)])
script_codes_data = u''',
'''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()])
sorted_lang_scripts = [script_languages[s] for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))]
script_language_data = u''',
'''.join([script_language_template.format(num_langs=len(langs),
languages='{{{}}}'.format(', '.join(['"{}"'.format(l) for l in langs]) if langs else 'NULL'))
for langs in sorted_lang_scripts])
out_file.write(scripts_c_data_template.format(header_name=SCRIPTS_HEADER,
char_scripts=char_scripts_data,
script_codes=script_codes_data,
script_languages=script_language_data))
out_file.close()
if __name__ == '__main__':
main(*sys.argv[1:])

View File

@@ -0,0 +1,140 @@
'''
word_breaks.py
This script is used to automatically build ranges of unicode characters
from the unicode spec's word break properties. These ranges help us
build a tokenizer that does the right thing in every language with regard
to word segmentation. The lines outputted by this script can be pasted
into scanner.re before compilation.
'''
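# Each printed line has the shape below (the exact character ranges are illustrative):
#   numeric_chars = [\u0030-\u0039\u0660-\u0669];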
import requests
from collections import defaultdict
import re
# Operate on WordBreakProperty.txt file
hebrew_letter_regex = re.compile('^([^\s]+)[\s]+; Hebrew_Letter ')
format_regex = re.compile('^([^\s]+)[\s]+; Format ')
extend_regex = re.compile('^([^\s]+)[\s]+; Extend ')
katakana_regex = re.compile('^([^\s]+)[\s]+; Katakana ')
other_alpha_letter_regex = re.compile('^([^\s]+)[\s]+; ALetter # Lo (?!.*(?:HANGUL|TIBETAN|JAVANESE|BALINESE|YI) )')
mid_letter_regex = re.compile('^([^\s]+)[\s]+; MidLetter')
mid_number_regex = re.compile('^([^\s]+)[\s]+; MidNum ')
mid_num_letter_regex = re.compile('^([^\s]+)[\s]+; MidNumLet ')
numeric_regex = re.compile('^([^\s]+)[\s]+; Numeric ')
extend_num_letter_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
# Operate on Scripts.txt file
other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
ideographic_scripts = set([
'han',
'hiragana',
'hangul',
'tibetan',
'thai',
'lao',
'javanese',
'balinese',
'yi',
])
def regex_char_range(match):
r = match.split('..')
# Wide version
return u'-'.join([('\u{}'.format(c.lower()) if len(c) < 5 else '\U{}'.format(c.lower().rjust(8, '0'))) for c in r])
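# e.g. regex_char_range('0041..005A') returns the literal text u'\u0041-\u005a'
# (backslash escapes intended for pasting into scanner.re, not the decoded characters)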
def get_letter_range(text, *regexes):
char_ranges = []
for line in text.split('\n'):
for regex in regexes:
m = regex.match(line)
if m:
char_ranges.append(regex_char_range(m.group(1)))
return char_ranges
def get_letter_ranges_for_scripts(text, scripts, char_class_regex):
char_ranges = []
for char_range, script, char_class in script_regex.findall(text):
if script.lower() in scripts and char_class_regex.match(char_class):
char_ranges.append(regex_char_range(char_range))
return char_ranges
def get_char_class(text, char_class_regex):
char_ranges = []
for char_range, script, char_class in script_regex.findall(text):
if char_class_regex.match(char_class):
char_ranges.append(regex_char_range(char_range))
return char_ranges
hangul_syllable_type_regex = re.compile('^([^\s]+)[\s]+; ([A-Z]+)')
def get_hangul_syllable_ranges(text):
char_ranges = defaultdict(list)
for line in text.split('\n'):
m = hangul_syllable_type_regex.match(line)
if m:
char_ranges[m.group(2)].append(regex_char_range(m.group(1)))
return dict(char_ranges)
name_funcs = [
('hebrew_letter_chars', hebrew_letter_regex),
('format_chars', format_regex),
('extend_chars', extend_regex),
('katakana_chars', katakana_regex),
('letter_other_alpha_chars', other_alpha_letter_regex),
('mid_letter_chars', mid_letter_regex),
('mid_number_chars', mid_number_regex),
('mid_num_letter_chars', mid_num_letter_regex),
('numeric_chars', numeric_regex),
('extend_num_letter_chars', extend_num_letter_regex),
]
IDEOGRAPHIC_CHARS = 'ideographic_chars'
IDEOGRAPHIC_NUMERIC_CHARS = 'ideographic_numeric_chars'
numbers_regex = re.compile('N[ol]', re.I)
letters_regex = re.compile('L', re.I)  # match char classes that start with L (letters)
def main():
''' Insert these lines into scanner.re '''
response = requests.get(WORD_BREAK_PROPERTIES_URL)
if response.ok:
for name, reg in name_funcs:
s = get_letter_range(response.content, reg)
print '{} = [{}];'.format(name, ''.join(s))
response = requests.get(HANGUL_SYLLABLE_TYPES_URL)
if response.ok:
syllable_ranges = get_hangul_syllable_ranges(response.content)
for name, ranges in syllable_ranges.iteritems():
print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))
response = requests.get(SCRIPTS_URL)
if response.ok:
s = ''.join(get_char_class(response.content, numbers_regex))
print '{} = [{}];'.format(IDEOGRAPHIC_NUMERIC_CHARS, ''.join(s))
s = ''.join(get_letter_ranges_for_scripts(response.content, ideographic_scripts, letters_regex))
print '{} = [{}];'.format(IDEOGRAPHIC_CHARS, ''.join(s))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,18 @@
from collections import namedtuple
from geodata.addresses.config import address_config
from geodata.math.sampling import weighted_choice
IntersectionQuery = namedtuple('IntersectionQuery', 'road1, intersection_phrase, road2')
NULL_INTERSECTION_QUERY = IntersectionQuery(None, None, None)
class Intersection(object):
@classmethod
def phrase(cls, language, country=None):
values, probs = address_config.alternative_probabilities('cross_streets.intersection', language, country=country)
if not values:
return None
phrase, props = weighted_choice(values, probs)
return phrase
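# Usage sketch (assumes a 'cross_streets.intersection' entry exists in the address config
# for the requested language; the return value below is purely illustrative):
#   >>> Intersection.phrase('en', country='us')
#   u'and'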

View File

View File

@@ -0,0 +1,100 @@
import argparse
import logging
import os
import subprocess
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.osm.osm_address_training_data import WAYS_LANGUAGE_DATA_FILENAME, ADDRESS_LANGUAGE_DATA_FILENAME, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME, TOPONYM_LANGUAGE_DATA_FILENAME
LANGUAGES_ALL_FILE = 'languages.all'
LANGUAGES_RANDOM_FILE = 'languages.random'
LANGUAGES_TRAIN_FILE = 'languages.train'
LANGUAGES_CV_FILE = 'languages.cv'
LANGUAGES_TEST_FILE = 'languages.test'
def create_language_training_data(osm_dir, split_data=True, train_split=0.8, cv_split=0.1):
language_all_path = os.path.join(osm_dir, LANGUAGES_ALL_FILE)
ways_path = os.path.join(osm_dir, WAYS_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', ways_path, '>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(ways_path))
addresses_path = os.path.join(osm_dir, ADDRESS_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', addresses_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(addresses_path))
formatted_path = os.path.join(osm_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME)
if os.system(' '.join(['cat', formatted_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(formatted_path))
toponyms_path = os.path.join(osm_dir, TOPONYM_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', toponyms_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(toponyms_path))
languages_random_path = os.path.join(osm_dir, LANGUAGES_RANDOM_FILE)
if os.system(u' '.join(['shuf', '--random-source=/dev/urandom', language_all_path, '>', languages_random_path])) != 0:
raise SystemError('shuffle failed')
languages_train_path = os.path.join(osm_dir, LANGUAGES_TRAIN_FILE)
if split_data:
languages_test_path = os.path.join(osm_dir, LANGUAGES_TEST_FILE)
num_lines = sum((1 for line in open(languages_random_path)))
train_lines = int(train_split * num_lines)
test_lines = num_lines - train_lines
cv_lines = int(test_lines * (cv_split / (1.0 - train_split))) + 1
subprocess.check_call(['split', '-l', str(train_lines), languages_random_path, os.path.join(osm_dir, 'language-split-')])
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-aa'), languages_train_path])
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-ab'), languages_test_path])
languages_cv_path = os.path.join(osm_dir, LANGUAGES_CV_FILE)
subprocess.check_call(['split', '-l', str(cv_lines), languages_test_path, os.path.join(osm_dir, 'language-split-')])
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-aa'), languages_cv_path])
subprocess.check_call(['mv', os.path.join(osm_dir, 'language-split-ab'), languages_test_path])
else:
subprocess.check_call(['mv', languages_random_path, languages_train_path])
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-n', '--no-split',
action='store_false',
default=True,
help='Do not split data into train/cv/test')
parser.add_argument('-t', '--train-split',
type=float,
default=0.8,
help='Train split percentage as a float (default 0.8)')
parser.add_argument('-c', '--cv-split',
type=float,
default=0.1,
help='Cross-validation split percentage as a float (default 0.1)')
parser.add_argument('-o', '--osm-dir',
default=os.getcwd(),
help='OSM directory')
args = parser.parse_args()
if args.train_split + args.cv_split >= 1.0:
raise ValueError('Train split + cross-validation split must be less than 1.0')
if not os.path.exists(args.osm_dir):
raise ValueError('OSM directory does not exist')
create_language_training_data(args.osm_dir, split_data=args.no_split, train_split=args.train_split, cv_split=args.cv_split)

View File

@@ -0,0 +1,176 @@
import os
import six
import sys
from collections import defaultdict, OrderedDict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.string_utils import wide_iter, wide_ord
from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages
from geodata.text.normalize import normalized_tokens, normalize_string
from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types
WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
# For toponyms, we want to limit the countries we consider to those where
# the place names can themselves be considered training examples of the language
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
'fr': set(['fr']),
'it': set(['it']),
'de': set(['de', 'at']),
'nl': set(['nl']),
'es': set(['es', 'ar', 'mx', 'cl', 'co', 'pe', 'ec', 'pr', 'uy',
've', 'cu', 'do', 'bo', 'gt', 'cr', 'py', 'sv', 'pa',
'ni', 'hn']),
'pt': set(['pt', 'br']),
}
char_scripts = get_chars_by_script()
script_languages = {script: set(langs) for script, langs in six.iteritems(get_script_languages())}
lang_scripts = defaultdict(set)
for script, langs in six.iteritems(script_languages):
for lang in langs:
lang_scripts[lang].add(script)
lang_scripts = dict(lang_scripts)
UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
MAX_ASCII = 127
def get_string_script(s):
s = safe_decode(s)
str_len = len(s)
script = last_script = UNKNOWN_SCRIPT
is_ascii = True
script_len = 0
for c in wide_iter(s):
script = char_scripts[wide_ord(c)]
if script == COMMON_SCRIPT and last_script != UNKNOWN_SCRIPT:
script = last_script
if last_script != script and last_script != UNKNOWN_SCRIPT and last_script != COMMON_SCRIPT:
if (script_len < str_len):
for c in reversed(list(wide_iter(s[:script_len]))):
if char_scripts[wide_ord(c)] == COMMON_SCRIPT:
script_len -= 1
break
is_ascii = is_ascii and ord(c) <= MAX_ASCII
script_len += 1
if script != UNKNOWN_SCRIPT:
last_script = script
return (last_script, script_len, is_ascii)
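# Hedged usage sketch: the function reports the script of the leading run of
# characters, how many characters that run covers, and whether the run was pure
# ASCII. Script names come from get_chars_by_script(), so the outputs below are
# illustrative assumptions rather than guaranteed values:
#
#   get_string_script(u'Main Street')      # -> ('Latin', 11, True)
#   get_string_script(u'Тверская улица')   # -> ('Cyrillic', 14, False)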
LATIN_SCRIPT = 'Latin'
UNKNOWN_LANGUAGE = 'unk'
AMBIGUOUS_LANGUAGE = 'xxx'
def disambiguate_language_script(text, languages):
script_langs = {}
read_len = 0
while read_len < len(text):
script, script_len, is_ascii = get_string_script(text[read_len:])
if script != LATIN_SCRIPT:
script_valid = [l for l, d in languages if l in script_languages.get(script, [])]
script_langs[script] = set(script_valid)
if script_len == len(text) and len(script_valid) == 1:
return script_valid[0], script_langs
read_len += script_len
return UNKNOWN_LANGUAGE, script_langs
LATIN_TRANSLITERATED_SCRIPTS = {'Arabic', 'Cyrillic'}
def has_non_latin_script(languages):
for lang, is_default in languages:
scripts = lang_scripts.get(lang, set())
if LATIN_SCRIPT not in scripts or scripts & LATIN_TRANSLITERATED_SCRIPTS:
return True
return False
def disambiguate_language(text, languages, scripts_only=False):
text = safe_decode(text)
valid_languages = OrderedDict(languages)
language_script, script_langs = disambiguate_language_script(text, languages)
if language_script != UNKNOWN_LANGUAGE:
return language_script
num_defaults = sum((1 for lang, default in valid_languages.iteritems() if default))
tokens = normalized_tokens(text)
current_lang = None
possible_lang = None
seen_languages = set()
for t, c, l, data in street_types_gazetteer.filter(tokens):
if c == token_types.PHRASE:
valid = OrderedDict()
data = [safe_decode(d).split(u'|') for d in data]
potentials = set([l for l, d, i, c in data if l in valid_languages])
potential_defaults = set([l for l in potentials if valid_languages[l]])
phrase_len = sum((len(t_i[0]) for t_i in t))
for lang, dictionary, is_canonical, canonical in data:
is_canonical = int(is_canonical)
is_stopword = dictionary == 'stopword'
if lang not in valid_languages or (is_stopword and len(potentials) > 1):
continue
is_default = valid_languages[lang]
lang_valid = is_default or not seen_languages or lang in seen_languages
if lang_valid and phrase_len > 1 and ((is_canonical and not is_stopword) or (is_default and (len(potentials) == 1 or len(potential_defaults) == 1))):
valid[lang] = 1
elif is_default and num_defaults > 1 and current_lang is not None and current_lang != lang:
return AMBIGUOUS_LANGUAGE
elif is_stopword and is_canonical and not is_default and lang in seen_languages:
valid[lang] = 1
elif not seen_languages and len(potentials) == 1 and phrase_len > 1:
possible_lang = lang if possible_lang is None or possible_lang == lang else None
if seen_languages and valid and not any((l in seen_languages for l in valid)) and \
(not any((valid_languages.get(l) for l in valid)) or any((valid_languages.get(l) for l in seen_languages))):
return AMBIGUOUS_LANGUAGE
valid = valid.keys()
if len(valid) == 1:
current_lang = valid[0]
else:
valid_default = [l for l in valid if valid_languages.get(l)]
if len(valid_default) == 1 and current_lang is not None and valid_default[0] != current_lang:
return AMBIGUOUS_LANGUAGE
elif len(valid_default) == 1:
current_lang = valid_default[0]
if any((current_lang not in langs for script, langs in script_langs.iteritems())):
return AMBIGUOUS_LANGUAGE
seen_languages.update(valid)
if current_lang is not None:
return current_lang
elif possible_lang is not None:
if not any((possible_lang not in langs for script, langs in script_langs.iteritems())):
return possible_lang
else:
return AMBIGUOUS_LANGUAGE
return UNKNOWN_LANGUAGE
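# A hedged sketch of the intended behaviour; the inputs and outputs below are
# assumptions for illustration and depend on the loaded gazetteers and on which
# languages OSM marks as default for the area:
#
#   languages = [('de', True), ('fr', True)]                 # e.g. a bilingual area
#   disambiguate_language(u'Bahnhofstrasse 10', languages)   # -> 'de' (assumed)
#   disambiguate_language(u'Rue de la Gare 10', languages)   # -> 'fr' (assumed)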

View File

@@ -0,0 +1,53 @@
import random
import bisect
from collections import OrderedDict
'''
Top languages on the Interwebs. Not a probability distribution
as it doesn't sum to 1 and websites can be in more than one
language. Reference:
https://en.wikipedia.org/wiki/Languages_used_on_the_Internet#Content_languages_for_websites
'''
INTERNET_LANGUAGE_DISTRIBUTION = OrderedDict([
('en', 0.555),
('ru', 0.059),
('de', 0.058),
('ja', 0.05),
('es', 0.046),
('fr', 0.04),
('zh', 0.028),
('pt', 0.025),
('it', 0.019),
('pl', 0.017),
('tr', 0.015),
('nl', 0.013),
('fa', 0.009),
('ar', 0.008),
('ko', 0.007),
])
def cdf(probs):
total = float(sum(probs))
result = []
cumulative = 0.0
for w in probs:
cumulative += w
result.append(cumulative / total)
return result
MOST_COMMON_INTERNET_LANGUAGES = INTERNET_LANGUAGE_DISTRIBUTION.keys()
INTERNET_LANGUAGES_CDF = cdf(INTERNET_LANGUAGE_DISTRIBUTION.values())
def sample_random_language(keys=MOST_COMMON_INTERNET_LANGUAGES,
cdf=INTERNET_LANGUAGES_CDF):
assert len(keys) == len(cdf)
sample = random.random()
idx = bisect.bisect(cdf, sample)
return keys[idx]
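# Illustrative check (a sketch): with the CDF above, 'en' should be drawn a
# little over half the time; exact counts vary from run to run.
if __name__ == '__main__':
    from collections import Counter
    counts = Counter(sample_random_language() for _ in range(10000))
    print(counts.most_common(5))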

scripts/geodata/log.py Normal file
View File

@@ -0,0 +1,10 @@
import logging
import sys
def log_to_file(f, level=logging.INFO):
handler = logging.StreamHandler(f)
formatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]: %(message)s')
handler.setFormatter(formatter)
logging.root.addHandler(handler)
logging.root.setLevel(level)

View File

View File

@@ -0,0 +1,5 @@
FLOAT_EPSILON = 1e-09
def isclose(a, b, rel_tol=FLOAT_EPSILON, abs_tol=0.0):
return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

View File

@@ -0,0 +1,42 @@
import bisect
import random
import sys
from geodata.math.floats import isclose, FLOAT_EPSILON
def weighted_choice(values, cdf):
"""Pick one of n values given a discrete cumulative distribution"""
assert values and cdf, 'values and probabilities cannot be empty/None'
assert len(values) == len(cdf), 'len(values) != len(probs)'
assert all(p >= 0.0 and p <= (1.0 + FLOAT_EPSILON) for p in cdf), 'Probabilities not valid: {}'.format(cdf)
x = random.random()
i = bisect.bisect(cdf, x)
return values[i]
def check_probability_distribution(probs):
cumulative = 0.0
for p in probs:
assert p >= 0.0, 'Probabilities cannot be negative'
assert p <= 1.0, 'Probabilities cannot be > 1.0'
cumulative += p
assert isclose(cumulative, 1.0), 'Probabilities must sum to 1: probs={}, cumulative={}'.format(probs, cumulative)
def cdf(probs):
total = 0.0
cumulative = [0.0] * len(probs)
for i, p in enumerate(probs):
total += p
cumulative[i] = total
return cumulative
def zipfian_distribution(n, b=1.0):
"""Distribution where the ith item's frequency is proportional to its rank"""
frequencies = [1. / (i ** b) for i in xrange(1, n + 1)]
total = sum(frequencies)
return [f / total for f in frequencies]
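# Illustrative usage (a sketch; the variant strings are made up): build a
# Zipfian distribution over a few spellings, convert it to a CDF and sample,
# so the first variant is chosen most often.
if __name__ == '__main__':
    variants = ['St', 'St.', 'Str', 'Street']
    probs = zipfian_distribution(len(variants))
    check_probability_distribution(probs)
    samples = [weighted_choice(variants, cdf(probs)) for _ in xrange(10)]
    print(samples)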

View File

@@ -0,0 +1,52 @@
import argparse
import logging
import os
import sys
import six
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.address_expansions.abbreviations import abbreviate
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.math.floats import isclose
from geodata.osm.extract import parse_osm
from geodata.places.reverse_geocode import PlaceReverseGeocoder
from geodata.encoding import safe_decode
class MetroStationReverseGeocoder(PlaceReverseGeocoder):
GEOHASH_PRECISION = 7
include_property_patterns = PlaceReverseGeocoder.include_property_patterns | set([
'operator',
'network',
'station',
])
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--osm-metro-stations-file',
help='Path to OSM metro stations file')
parser.add_argument('-p', '--precision',
type=int,
default=MetroStationReverseGeocoder.GEOHASH_PRECISION,
help='Geohash precision')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
help='Output directory')
logging.basicConfig(level=logging.INFO)
args = parser.parse_args()
if args.osm_metro_stations_file:
index = MetroStationReverseGeocoder.create_from_osm_file(args.osm_metro_stations_file, args.out_dir, precision=args.precision)
else:
parser.error('Must specify metro stations file')
index.save()

View File

View File

@@ -0,0 +1,102 @@
from geodata.text.normalize import *
from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity
from collections import Counter
class NameDeduper(object):
'''
Base class for deduping geographic entity names e.g. for matching names
from different databases (concordances).
By default uses Soft TFIDF similarity (see geodata.names.similarity)
for non-ideographic names and Jaccard similarity with word frequencies
for ideographic names.
See class attributes for options.
'''
stopwords = set()
'''Set of words which should not be considered in similarity'''
discriminative_words = set()
'''Set of words which break similarity e.g. North, Heights'''
discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
'''Set of categories which, if not contained in both sets, break similarity'''
content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
'''Set of categories representing content tokens (default setting ignores punctuation)'''
replacements = {}
'''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''
dupe_threshold = 0.9
'''Similarity threshold above which entities are considered dupes'''
ignore_parentheticals = True
'''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''
@classmethod
def tokenize(cls, s):
return normalized_tokens(s)
@classmethod
def content_tokens(cls, s):
tokens = cls.tokenize(s)
if cls.ignore_parentheticals:
tokens = remove_parens(tokens)
return [(cls.replacements.get(t, t), c)
for t, c in tokens
if c in cls.content_categories and
t not in cls.stopwords]
@classmethod
def possible_match(cls, tokens1, tokens2):
if not cls.discriminative_categories and not cls.discriminative_words:
return True
intersection = set([t for t, c in tokens1]) & set([t for t, c in tokens2])
invalid = any((True for t, c in tokens1 + tokens2
if t not in intersection and
(c in cls.discriminative_categories or t in cls.discriminative_words)
))
return not invalid
@classmethod
def compare_ideographs(cls, s1, s2):
tokens1 = cls.content_tokens(s1)
tokens2 = cls.content_tokens(s2)
if not cls.possible_match(tokens1, tokens2):
return 0.0
tokens1_only = [t for t, c in tokens1]
tokens2_only = [t for t, c in tokens2]
if u''.join(tokens1_only) == u''.join(tokens2_only):
return 1.0
else:
# Many Han/Hangul characters are common, shouldn't use IDF
return jaccard_similarity(tokens1_only, tokens2_only)
@classmethod
def compare(cls, s1, s2, idf):
tokens1 = cls.content_tokens(s1)
tokens2 = cls.content_tokens(s2)
if not cls.possible_match(tokens1, tokens2):
return 0.0
tokens1_only = [t for t, c in tokens1]
tokens2_only = [t for t, c in tokens2]
# Test exact equality, also handles things like Cabbage Town == Cabbagetown
if u''.join(tokens1_only) == u''.join(tokens2_only):
return 1.0
else:
return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)
@classmethod
def is_dupe(cls, sim):
return sim >= cls.dupe_threshold
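# A hedged sketch of how this base class is meant to be specialized; the
# attribute values and names below are invented for illustration:
#
#   class StreetNameDeduper(NameDeduper):
#       stopwords = set([u'the'])
#       replacements = {u'saint': u'st'}
#
#   # idf is an IDFIndex built over the names being matched
#   sim = StreetNameDeduper.compare(u'Saint Marks Ave', u'St Marks Ave', idf)
#   StreetNameDeduper.is_dupe(sim)  # True only if sim >= dupe_threshold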

View File

@@ -0,0 +1,119 @@
import os
import re
import six
import yaml
from geodata.encoding import safe_decode
this_dir = os.path.realpath(os.path.dirname(__file__))
AFFIX_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'boundaries', 'names', 'languages')
class NameAffixes(object):
def __init__(self, config_dir=AFFIX_CONFIG_DIR):
self.config_dir = config_dir
self.language_prefixes = {}
self.language_suffixes = {}
self.language_prefix_regexes = {}
self.language_suffix_regexes = {}
self.language_prefix_sim_only_regexes = {}
self.language_suffix_sim_only_regexes = {}
for filename in os.listdir(config_dir):
if not filename.endswith('.yaml'):
continue
lang = filename.rsplit('.yaml')[0]
conf = yaml.load(open(os.path.join(config_dir, filename)))
self.add_affixes(lang, conf)
for country, country_conf in six.iteritems(conf.get('countries', {})):
country_lang = (country, lang)
self.add_affixes(country_lang, country_conf)
def add_affixes(self, lang, *confs):
prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]
self.language_prefixes[lang] = prefixes + prefixes_no_whitespace
suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]
self.language_suffixes[lang] = suffixes + suffixes_no_whitespace
whitespace_phrase = six.u('[ \-]')
all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
if all_prefixes:
prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
if all_suffixes:
suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]
if sim_only_prefixes:
sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)
sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]
if sim_only_suffixes:
sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
def replace_prefixes(self, name, lang, country=None, sim_only=False):
name = safe_decode(name).strip()
if not sim_only or lang not in self.language_prefix_sim_only_regexes:
d = self.language_prefix_regexes
else:
d = self.language_prefix_sim_only_regexes
regex = None
if country is not None:
regex = d.get((country, lang))
if regex:
name = regex.sub(six.u(''), name)
regex = d.get(lang)
if not regex:
return name
return regex.sub(six.u(''), name)
def replace_suffixes(self, name, lang, country=None, sim_only=False):
name = safe_decode(name).strip()
if not sim_only or lang not in self.language_suffix_sim_only_regexes:
d = self.language_suffix_regexes
else:
d = self.language_suffix_sim_only_regexes
regex = None
if country is not None:
regex = d.get((country, lang))
if regex:
name = regex.sub(six.u(''), name)
regex = d.get(lang)
if not regex:
return name
return regex.sub(six.u(''), name)
def replace_affixes(self, name, lang, country=None, sim_only=False):
return self.replace_prefixes(self.replace_suffixes(name, lang, country=country, sim_only=sim_only), lang, country=country, sim_only=sim_only)
name_affixes = NameAffixes()
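# Illustrative calls (what actually gets stripped depends entirely on the
# per-language YAML configs under resources/boundaries/names/languages, so the
# outputs shown are assumptions):
#
#   name_affixes.replace_suffixes(u'Friedrichshain-Kreuzberg Bezirk', 'de')
#   # -> u'Friedrichshain-Kreuzberg', assuming 'bezirk' is listed as a German suffix
#   name_affixes.replace_prefixes(u'Lake Rotorua', 'en', sim_only=True)
#   # -> u'Rotorua', only if 'lake' is configured as a similarity-only prefix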

View File

@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
import Levenshtein
from collections import OrderedDict
def ordered_word_count(tokens):
counts = OrderedDict()
for k in tokens:
counts[k] = counts.get(k, 0) + 1
return counts
def soft_tfidf_similarity(tokens1, tokens2, idf,
sim_func=Levenshtein.jaro_winkler, theta=0.95,
common_word_threshold=100):
'''
Soft TFIDF is a hybrid distance function using both global statistics
(inverse document frequency) and local similarity (Jaro-Winkler).
For each token t1 in the first string, find the token t2 which is most
similar to t1 in terms of the local distance function.
The SoftTFIDF similarity is the dot product of the max token similarities
and the cosine similarity of the TF-IDF vectors for all tokens where
the max similarity is >= a given threshold theta.
sim_func should return a number in the range [0, 1] and theta
should be in the same range, i.e. this would _not_ work for a metric like
basic Levenshtein or Damerau-Levenshtein distance where we'd want the
value to be below the threshold. Those metrics can be transformed into
a [0, 1] similarity first.
@param tokens1: normalized tokens of string 1 (list of strings only)
@param tokens2: normalized tokens of string 2 (list of strings only)
@param idf: IDFIndex from geodata.statistics.tf_idf
@param sim_func: similarity function which takes 2 strings and returns
a number between 0 and 1
@param theta: token-level threshold on sim_func's return value at
which point two tokens are considered "close"
Reference:
https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
'''
token1_counts = ordered_word_count(tokens1)
token2_counts = ordered_word_count(tokens2)
tfidf1 = idf.tfidf_vector(token1_counts)
tfidf2 = idf.tfidf_vector(token2_counts)
total_sim = 0.0
t1_len = len(token1_counts)
t2_len = len(token2_counts)
if t2_len < t1_len:
token1_counts, token2_counts = token2_counts, token1_counts
tfidf1, tfidf2 = tfidf2, tfidf1
for i, t1 in enumerate(token1_counts):
sim, j = max([(sim_func(t1, t2), j) for j, t2 in enumerate(token2_counts)])
if sim >= theta:
total_sim += sim * tfidf1[i] * tfidf2[j]
return total_sim
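# A worked sketch of the loop above with invented tokens and IDF weights:
# comparing [u'st', u'marks', u'ave'] against [u'saint', u'marks', u'avenue'],
# each left token keeps only its best Jaro-Winkler match on the right.
# u'marks' ~ u'marks' scores 1.0 and passes theta, while u'ave' ~ u'avenue'
# scores roughly 0.88 and is dropped at the default theta of 0.95, so only the
# surviving pair contributes sim * tfidf1[i] * tfidf2[j] to the total.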
def jaccard_similarity(tokens1, tokens2):
'''
Traditionally Jaccard similarity is defined for two sets:
Jaccard(A, B) = |A ∩ B| / |A ∪ B|
Using this for tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
would be 1.0, which is not ideal for entity name matching.
In this implementation the cardinality of the set intersections/unions
are weighted by term frequencies so Jaccard(['a', 'a', 'b'], ['a', 'b']) = 0.67
'''
token1_counts = ordered_word_count(tokens1)
token2_counts = ordered_word_count(tokens2)
intersection = sum((min(v, token2_counts[k]) for k, v in token1_counts.iteritems() if k in token2_counts))
return float(intersection) / (sum(token1_counts.values()) + sum(token2_counts.values()) - intersection)
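# Checking the weighted-count example from the docstring: the intersection of
# ['a', 'a', 'b'] and ['a', 'b'] counts min(2, 1) + min(1, 1) = 2 and the union
# counts 3 + 2 - 2 = 3, giving 2/3.
if __name__ == '__main__':
    print(jaccard_similarity([u'a', u'a', u'b'], [u'a', u'b']))  # 0.666...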

View File

@@ -0,0 +1,622 @@
# -*- coding: utf-8 -*-
import argparse
import fnmatch
import logging
import operator
import os
import re
import six
import subprocess
import sys
import yaml
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.address_formatting.formatter import AddressFormatter
from geodata.coordinates.conversion import latlon_to_decimal
from geodata.encoding import safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.i18n.unicode_properties import get_chars_by_script
from geodata.i18n.word_breaks import ideographic_scripts
from geodata.names.deduping import NameDeduper
from geodata.osm.admin_boundaries import OSMNeighborhoodPolygonReader
from geodata.osm.components import osm_address_components
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.polygons.index import *
from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.statistics.tf_idf import IDFIndex
class NeighborhoodDeduper(NameDeduper):
# Lossless conversions only
replacements = {
u'saint': u'st',
u'and': u'&',
u'〇': u'0',
u'一': u'1',
u'二': u'2',
u'三': u'3',
u'四': u'4',
u'五': u'5',
u'六': u'6',
u'七': u'7',
u'八': u'8',
u'九': u'9',
u'十': u'10',
}
discriminative_words = set([
# Han numbers
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
u'', u'',
# Roman numerals
u'i', u'ii',
u'iii', u'iv',
u'v', u'vi',
u'vii', u'viii',
u'ix', u'x',
u'xi', u'xii',
u'xiii', u'xiv',
u'xv', u'xvi',
u'xvii', u'xviii',
u'xix', u'xx',
# English directionals
u'north', u'south',
u'east', u'west',
u'northeast', u'northwest',
u'southeast', u'southwest',
# Spanish, Portuguese and Italian directionals
u'norte', u'nord', u'sur', u'sul', u'sud',
u'est', u'este', u'leste', u'oeste', u'ovest',
# New in various languages
u'new',
u'nova',
u'novo',
u'nuevo',
u'nueva',
u'nuovo',
u'nuova',
# Qualifiers
u'heights',
u'hills',
u'upper', u'lower',
u'little', u'great',
u'park',
u'parque',
u'village',
])
stopwords = set([
u'cp',
u'de',
u'la',
u'urbanizacion',
u'do',
u'da',
u'dos',
u'del',
u'community',
u'bairro',
u'barrio',
u'le',
u'el',
u'mah',
u'раион',
u'vila',
u'villa',
u'kampung',
u'ahupua`a',
])
class ClickThatHoodReverseGeocoder(GeohashPolygonIndex):
persistent_polygons = False
cache_size = 0
SCRATCH_DIR = '/tmp'
# Contains accurate boundaries for neighborhoods sans weird GeoPlanet names like "Adelphi" or "Crown Heights South"
NEIGHBORHOODS_REPO = 'https://github.com/codeforamerica/click_that_hood'
config_path = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'neighborhoods', 'click_that_hood.yaml')
config = yaml.load(open(config_path))
@classmethod
def clone_repo(cls, path):
subprocess.check_call(['rm', '-rf', path])
subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path])
@classmethod
def create_neighborhoods_index(cls):
scratch_dir = cls.SCRATCH_DIR
repo_path = os.path.join(scratch_dir, 'click_that_hood')
cls.clone_repo(repo_path)
data_path = os.path.join(repo_path, 'public', 'data')
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods')
ensure_dir(neighborhoods_dir)
index = cls(save_dir=neighborhoods_dir)
for c in cls.config['files']:
filename = c['filename']
component = c['component']
path = os.path.join(data_path, filename)
features = json.load(open(path))['features']
for f in features:
f['properties']['component'] = component
try:
index.add_geojson_like_file(features)
except ValueError:
continue
return index
class OSMNeighborhoodReverseGeocoder(OSMReverseGeocoder):
persistent_polygons = False
cache_size = 10000
simplify_polygons = False
polygon_reader = OSMNeighborhoodPolygonReader
include_property_patterns = OSMReverseGeocoder.include_property_patterns | set(['postal_code'])
cache_size = 0
SCRATCH_DIR = '/tmp'
@classmethod
def create_neighborhoods_index(cls, osm_neighborhoods_file):
scratch_dir = cls.SCRATCH_DIR
neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index')
ensure_dir(neighborhoods_dir)
return cls.create_from_osm_file(osm_neighborhoods_file, output_dir=neighborhoods_dir)
class NeighborhoodReverseGeocoder(RTreePolygonIndex):
'''
Neighborhoods are very important in cities like NYC, SF, Chicago, London
and many others. We want the address parser to be trained with addresses
that sufficiently capture variations in address patterns, including
neighborhoods. Quattroshapes neighborhood data (in the US at least)
is not great in terms of names, mostly because GeoPlanet has so many
incorrect names. The neighborhoods project, also known as ClickThatHood
has very accurate polygons with correct names, but only for a handful
of cities. OSM usually lists neighborhoods and some other local admin
areas like boroughs as points rather than polygons.
This index merges all of the above data sets in prioritized order
(OSM > ClickThatHood > WhosOnFirst) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source, but each
source has at least a "name" key, which in practice is what we care about.
Quattroshapes data is no longer accessible and has been replaced by
WhosOnFirst.
'''
PRIORITIES_FILENAME = 'priorities.json'
DUPE_THRESHOLD = 0.9
persistent_polygons = True
cache_size = 100000
source_priorities = {
'osm': 0, # Best names/polygons, same coordinate system
'osm_cth': 1, # Prefer the OSM names if possible
'clickthathood': 2, # Better names/polygons than WhosOnFirst
'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon
'wof': 4, # Replacement of Quattroshapes
}
level_priorities = {
'neighborhood': 0,
'local_admin': 1,
}
regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''),
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
]
quattroshapes_city_district_patterns = [
six.u('Praha [\d]+'),
]
quattroshapes_city_district_regex = re.compile('|'.join([six.u('^\s*{}\s*$').format(p) for p in quattroshapes_city_district_patterns]), re.I | re.U)
@classmethod
def count_words(cls, s):
doc = defaultdict(int)
for t, c in NeighborhoodDeduper.content_tokens(s):
doc[t] += 1
return doc
@classmethod
def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
'''
Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries)
and their dependencies, create an R-tree index for coarse-grained
reverse geocoding.
Note: the input file is expected to have been created using
osmfilter. Use fetch_osm_address_data.sh for planet or copy the
admin borders commands if using other geometries.
'''
index = cls(save_dir=output_dir)
logger = logging.getLogger('neighborhoods')
logger.info('Creating ClickThatHood neighborhoods')
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
logger.info('Creating OSM neighborhoods')
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
logger.info('Creating WhosOnFirst neighborhoods')
wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods"))
country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
osm_admin_rtree.cache_size = 1000
logger.info('Creating IDF index')
idf = IDFIndex()
char_scripts = get_chars_by_script()
for idx in (cth, wof, osmn):
for i in xrange(idx.i):
props = idx.get_properties(i)
name = props.get('name')
if name is not None:
doc = cls.count_words(name)
idf.update(doc)
for key, attrs, deps in parse_osm(filename):
for k, v in six.iteritems(attrs):
if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)):
doc = cls.count_words(v)
idf.update(doc)
for i in six.moves.xrange(osmn.i):
props = osmn.get_properties(i)
poly = osmn.get_polygon(i)
props['source'] = 'osm'
props['component'] = AddressFormatter.SUBURB
props['polygon_type'] = 'neighborhood'
index.index_polygon(poly.context)
index.add_polygon(poly.context, props)
wof.matched = [False] * wof.i
cth.matched = [False] * cth.i
logger.info('Matching OSM points to neighborhood polygons')
# Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons
num_polys = 0
for element_id, attrs, deps in parse_osm(filename):
try:
lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon'])
except ValueError:
continue
osm_name = attrs.get('name')
if not osm_name:
continue
id_type, element_id = element_id.split(':')
element_id = long(element_id)
attrs['type'] = id_type
attrs['id'] = element_id
possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.EXTENDED_NEIGHBORHOOD)
is_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)
country, candidate_languages = country_rtree.country_and_languages(lat, lon)
component_name = None
component_name = osm_address_components.component_from_properties(country, attrs)
ranks = []
osm_names = []
for key in OSM_NAME_TAGS:
name = attrs.get(key)
if name:
osm_names.append(name)
for name_key in OSM_NAME_TAGS:
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
for idx in (cth, wof):
candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
if candidates:
max_sim = 0.0
arg_max = None
normalized_wof_names = {}
for osm_name in osm_names:
contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts
for c in safe_decode(osm_name)))
for i in candidates:
props = idx.get_properties(i)
name = normalized_wof_names.get(i)
if not name:
name = props.get('name')
if not name:
continue
for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name)
normalized_wof_names[i] = name
if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood':
continue
if not contains_ideographs:
sim = NeighborhoodDeduper.compare(osm_name, name, idf)
else:
# Many Han/Hangul characters are common, shouldn't use IDF
sim = NeighborhoodDeduper.compare_ideographs(osm_name, name)
if sim > max_sim:
max_sim = sim
poly = idx.get_polygon(i)
arg_max = (max_sim, props, poly.context, idx, i)
if arg_max:
ranks.append(arg_max)
ranks.sort(key=operator.itemgetter(0), reverse=True)
if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD:
score, props, poly, idx, i = ranks[0]
existing_osm_boundaries = osm_admin_rtree.point_in_poly(lat, lon, return_all=True)
existing_neighborhood_boundaries = osmn.point_in_poly(lat, lon, return_all=True)
skip_node = False
for boundaries in (existing_osm_boundaries, existing_neighborhood_boundaries):
for poly_index, osm_props in enumerate(boundaries):
containing_component = None
name = osm_props.get('name')
# Only exact name matches here since we're comparing OSM to OSM
if name and name.lower() != attrs.get('name', '').lower():
continue
if boundaries is existing_neighborhood_boundaries:
containing_component = AddressFormatter.SUBURB
skip_node = True
break
else:
containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries[poly_index + 1:]]
containing_component = osm_address_components.component_from_properties(country, osm_props, containing=containing_ids)
if containing_component and containing_component != component_name and AddressFormatter.component_order[containing_component] <= AddressFormatter.component_order[AddressFormatter.CITY]:
skip_node = True
break
if skip_node:
break
# Skip this element
if skip_node:
continue
if idx is cth:
if props['component'] == AddressFormatter.SUBURB:
attrs['polygon_type'] = 'neighborhood'
elif props['component'] == AddressFormatter.CITY_DISTRICT:
attrs['polygon_type'] = 'local_admin'
else:
continue
source = 'osm_cth'
else:
level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None)
source = 'osm_wof'
if level == 'neighborhood':
attrs['polygon_type'] = 'neighborhood'
else:
attrs['polygon_type'] = 'local_admin'
containing_ids = [(boundary['type'], boundary['id']) for boundary in existing_osm_boundaries]
component = osm_address_components.component_from_properties(country, attrs, containing=containing_ids)
attrs['component'] = component
attrs['source'] = source
index.index_polygon(poly)
index.add_polygon(poly, attrs)
idx.matched[i] = True
num_polys += 1
if num_polys % 1000 == 0 and num_polys > 0:
logger.info('did {} neighborhoods'.format(num_polys))
for idx, source in ((cth, 'clickthathood'), (wof, 'wof')):
for i in xrange(idx.i):
props = idx.get_properties(i)
poly = idx.get_polygon(i)
if idx.matched[i]:
continue
props['source'] = source
if idx is cth:
component = props['component']
if component == AddressFormatter.SUBURB:
props['polygon_type'] = 'neighborhood'
elif component == AddressFormatter.CITY_DISTRICT:
props['polygon_type'] = 'local_admin'
else:
continue
elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood':
component = AddressFormatter.SUBURB
name = props.get('name')
if not name:
continue
for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name)
props['name'] = name
if cls.quattroshapes_city_district_regex.match(name):
component = AddressFormatter.CITY_DISTRICT
props['component'] = component
props['polygon_type'] = 'neighborhood'
else:
# We don't actually care about local admin polygons unless they match OSM
continue
index.index_polygon(poly.context)
index.add_polygon(poly.context, props)
return index
def setup(self):
self.priorities = []
def index_polygon_properties(self, properties):
self.priorities.append((self.level_priorities[properties['polygon_type']], self.source_priorities[properties['source']]))
def load_polygon_properties(self, d):
self.priorities = [tuple(p) for p in json.load(open(os.path.join(d, self.PRIORITIES_FILENAME)))]
def save_polygon_properties(self, d):
json.dump(self.priorities, open(os.path.join(d, self.PRIORITIES_FILENAME), 'w'))
def priority(self, i):
return self.priorities[i]
def get_candidate_polygons(self, lat, lon):
candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon)
return sorted(candidates, key=self.priority)
class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex):
persistent_polygons = False
cache_size = None
NAME = "wof:name"
ASCII_NAME = "gn:asciiname"
LEVEL = "wof:placetype"
GEONAMES_ID = "gn:geonameid"
SUPERSEDED = "wof:superseded_by"
NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"}
POLYGON_TYPES = {"Polygon", "MultiPolygon"}
@classmethod
def is_valid_neighbourhood(cls, geojson):
validity = not geojson["properties"].get(cls.SUPERSEDED)
for field in {cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID}:
validity = validity and bool(geojson["properties"].get(field))
return validity and geojson["properties"].get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES
@classmethod
def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None):
index = cls(save_dir=output_dir, index_filename=index_filename)
for root, dirnames, filenames in os.walk(wof_dir):
for fname in fnmatch.filter(filenames, "*.geojson"):
with open(os.path.join(root, fname)) as f:
geojson = json.load(f)
if cls.is_valid_neighbourhood(geojson):
properties = {
"name": safe_decode(geojson["properties"].get(cls.NAME)),
"name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)),
"qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)),
"gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID))
}
poly_type = geojson['geometry']['type']
if poly_type == 'Polygon':
poly = cls.to_polygon(geojson['geometry']['coordinates'][0])
index.index_polygon(poly)
poly = index.simplify_polygon(poly)
index.add_polygon(poly, dict(geojson['properties']))
elif poly_type == 'MultiPolygon':
polys = []
for coords in geojson['geometry']['coordinates']:
poly = cls.to_polygon(coords[0])
polys.append(poly)
index.index_polygon(poly)
multi_poly = index.simplify_polygon(MultiPolygon(polys))
index.add_polygon(multi_poly, dict(geojson['properties']))
return index
if __name__ == '__main__':
# Handle argument parsing here
parser = argparse.ArgumentParser()
parser.add_argument('-w', '--wof-dir',
help='Path to WhosOnFirst dir')
parser.add_argument('-a', '--osm-admin-rtree-dir',
help='Path to OSM admin rtree dir')
parser.add_argument('-c', '--country-rtree-dir',
help='Path to country rtree dir')
parser.add_argument('-b', '--osm-neighborhood-borders-file',
help='Path to OSM neighborhood borders file (with dependencies, .osm format)')
parser.add_argument('-n', '--osm-neighborhoods-file',
help='Path to OSM neighborhoods file (no dependencies, .osm format)')
parser.add_argument('-o', '--out-dir',
default=os.getcwd(),
help='Output directory')
logging.basicConfig(level=logging.INFO)
args = parser.parse_args()
if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_wof(
args.osm_neighborhoods_file,
args.wof_dir,
args.country_rtree_dir,
args.osm_admin_rtree_dir,
args.osm_neighborhood_borders_file,
args.out_dir
)
else:
parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file')
index.save()

View File

View File

@@ -0,0 +1,219 @@
import os
import sys
import yaml
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_encode
from geodata.i18n.unicode_paths import DATA_DIR
class InvalidNumexRuleException(Exception):
pass
NUMEX_DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'numex')
NUMEX_RULES_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src', 'numex_data.c')
GENDER_MASCULINE = 'GENDER_MASCULINE'
GENDER_FEMININE = 'GENDER_FEMININE'
GENDER_NEUTER = 'GENDER_NEUTER'
GENDER_NONE = 'GENDER_NONE'
gender_map = {
'm': GENDER_MASCULINE,
'f': GENDER_FEMININE,
'n': GENDER_NEUTER,
None: GENDER_NONE,
}
CATEGORY_PLURAL = 'CATEGORY_PLURAL'
CATEGORY_DEFAULT = 'CATEGORY_DEFAULT'
valid_numex_keys = set(['name', 'value', 'type', 'left', 'right', 'gender', 'category', 'radix',
'multiply_gte', 'exact_multiple_only', 'left_separator', 'right_separator'])
valid_ordinal_keys = set(['suffixes', 'gender', 'category'])
category_map = {
'plural': CATEGORY_PLURAL,
None: CATEGORY_DEFAULT
}
LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER'
LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
left_context_map = {
'add': LEFT_CONTEXT_ADD,
'multiply': LEFT_CONTEXT_MULTIPLY,
'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER,
None: LEFT_CONTEXT_NONE,
}
RIGHT_CONTEXT_MULTIPLY = 'NUMEX_RIGHT_CONTEXT_MULTIPLY'
RIGHT_CONTEXT_ADD = 'NUMEX_RIGHT_CONTEXT_ADD'
RIGHT_CONTEXT_NONE = 'NUMEX_RIGHT_CONTEXT_NONE'
right_context_map = {
'add': RIGHT_CONTEXT_ADD,
'multiply': RIGHT_CONTEXT_MULTIPLY,
None: RIGHT_CONTEXT_NONE,
}
CARDINAL = 'NUMEX_CARDINAL_RULE'
ORDINAL = 'NUMEX_ORDINAL_RULE'
ORDINAL_INDICATOR = 'NUMEX_ORDINAL_INDICATOR_RULE'
rule_type_map = {
'cardinal': CARDINAL,
'ordinal': ORDINAL,
'ordinal_indicator': ORDINAL_INDICATOR,
}
numex_key_template = u'"{key}"'
numex_rule_template = u'{{{left_context_type}, {right_context_type}, {rule_type}, {gender}, {category}, {radix}, {value}LL}}'
stopword_rule = u'NUMEX_STOPWORD_RULE'
ordinal_indicator_template = u'{{"{key}", {gender}, {category}, "{value}"}}'
stopwords_template = u'"{word}"'
language_template = u'{{"{language}", {whole_words_only}, {rule_index}, {num_rules}, {ordinal_indicator_index}, {num_ordinal_indicators}}}'
numex_rules_data_template = u'''
char *numex_keys[] = {{
{numex_keys}
}};
numex_rule_t numex_rules[] = {{
{numex_rules}
}};
ordinal_indicator_t ordinal_indicator_rules[] = {{
{ordinal_indicator_rules}
}};
numex_language_source_t numex_languages[] = {{
{languages}
}};
'''
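# For reference, one rendered entry from each template. The choice of an
# English cardinal "twenty" with a right-add context is hypothetical; real
# entries come from the YAML files parsed below.
#
#   numex_key_template.format(key='twenty')
#   # -> u'"twenty"'
#   numex_rule_template.format(left_context_type=LEFT_CONTEXT_NONE,
#                              right_context_type=RIGHT_CONTEXT_ADD,
#                              rule_type=CARDINAL, gender=GENDER_NONE,
#                              category=CATEGORY_DEFAULT, radix=10, value=20)
#   # -> u'{NUMEX_LEFT_CONTEXT_NONE, NUMEX_RIGHT_CONTEXT_ADD, NUMEX_CARDINAL_RULE,
#   #      GENDER_NONE, CATEGORY_DEFAULT, 10, 20LL}'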
def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
all_keys = []
all_rules = []
all_ordinal_indicators = []
all_stopwords = []
all_languages = []
out = open(outfile, 'w')
for filename in os.listdir(dirname):
path = os.path.join(dirname, filename)
if not os.path.isfile(path) or not filename.endswith('.yaml'):
continue
language = filename.split('.yaml', 1)[0]
data = yaml.load(open(path))
whole_words_only = data.get('whole_words_only', False)
rules = data.get('rules', [])
rule_index = len(all_rules)
for rule in rules:
invalid_keys = set(rule.keys()) - valid_numex_keys
if invalid_keys:
raise InvalidNumexRuleException(u'Invalid keys: ({}) for language {}, rule: {}'.format(u','.join(invalid_keys), language, rule))
gender = gender_map[rule.get('gender')]
rule_type = rule_type_map[rule['type']]
key = rule['name']
value = rule['value']
radix = rule.get('radix', 10)
rule_category = rule.get('category')
category = category_map.get(rule_category)
if category is None:
continue
left_context_type = left_context_map[rule.get('left')]
right_context_type = right_context_map[rule.get('right')]
all_keys.append(unicode(numex_key_template.format(key=key)))
all_rules.append(unicode(numex_rule_template.format(
language=language,
rule_type=rule_type,
gender=gender,
category=category,
left_context_type=left_context_type,
right_context_type=right_context_type,
value=value,
radix=radix
)))
ordinal_indicator_index = len(all_ordinal_indicators)
ordinal_indicators = data.get('ordinal_indicators', [])
num_ordinal_indicators = 0
for rule in ordinal_indicators:
gender = gender_map[rule.get('gender')]
category = category_map[rule.get('category')]
invalid_ordinal_keys = set(rule.keys()) - valid_ordinal_keys
if invalid_ordinal_keys:
raise InvalidNumexRuleException(u'Invalid keys ({}) in ordinal rule for language {}, rule: {}'.format(u','.join(invalid_ordinal_keys), language, rule))
for key, suffixes in rule['suffixes'].iteritems():
for suffix in suffixes:
all_ordinal_indicators.append(unicode(ordinal_indicator_template.format(
key=key,
value=suffix,
gender=gender,
category=category
)))
num_ordinal_indicators += len(suffixes)
stopwords = data.get('stopwords', [])
stopword_index = len(all_stopwords)
num_stopwords = len(stopwords)
for stopword in stopwords:
all_keys.append(numex_key_template.format(key=unicode(stopword)))
all_rules.append(stopword_rule)
num_rules = len(rules) + len(stopwords)
all_languages.append(unicode(language_template.format(
language=language,
whole_words_only=int(whole_words_only),
rule_index=rule_index,
num_rules=num_rules,
ordinal_indicator_index=ordinal_indicator_index,
num_ordinal_indicators=num_ordinal_indicators
)))
out.write(safe_encode(numex_rules_data_template.format(
numex_keys=u''',
'''.join(all_keys),
numex_rules=u''',
'''.join(all_rules),
ordinal_indicator_rules=u''',
'''.join(all_ordinal_indicators),
stopwords=u''',
'''.join(all_stopwords),
languages=u''',
'''.join(all_languages),
)))
out.close()
if __name__ == '__main__':
parse_numex_rules(*sys.argv[1:])

View File

@@ -0,0 +1,108 @@
import bisect
import math
import os
import operator
import random
import six
import sys
import yaml
from collections import defaultdict
from marisa_trie import BytesTrie
from geodata.text.phrases import PhraseFilter
from geodata.encoding import safe_encode, safe_decode
from geodata.i18n.unicode_paths import DATA_DIR
from geodata.numbers.numex import NUMEX_DATA_DIR
class OrdinalSuffixTrie(PhraseFilter):
def __init__(self, ordinal_rules):
self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8')) for k, v in six.iteritems(ordinal_rules)])
self.configured = True
def search_substring(self, s):
if len(s) == 0:
return None, 0
for i in xrange(len(s) + 1):
if not self.trie.has_keys_with_prefix(s[:i]):
i -= 1
break
if i > 0:
return (self.trie.get(s[:i]), i)
else:
return None, 0
def search_suffix(self, token):
suffix_search, suffix_len = self.search_substring(safe_decode(token[::-1]))
if suffix_search:
return suffix_search[0].split('|')
else:
return None
class OrdinalExpressions(object):
def __init__(self, base_dir=NUMEX_DATA_DIR):
self.cardinal_rules = {}
self.cardinal_rules_ones = {}
self.ordinal_rules = {}
self.ordinal_suffix_rules = {}
for filename in os.listdir(base_dir):
if filename.endswith('.yaml'):
lang = filename.split('.yaml')[0]
f = open(os.path.join(base_dir, filename))
data = yaml.load(f)
rules = data.get('rules')
if rules is not None and hasattr(rules, '__getslice__'):
cardinals = []
ordinals = defaultdict(list)
for rule in rules:
name = rule.get('name')
value = rule.get('value')
rule_type = rule.get('type')
if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
continue
gender = rule.get('gender', None)
category = rule.get('category', None)
if rule_type == 'ordinal':
ordinals[(value, gender, category)].append(name)
else:
cardinals.append(rule)
if value == 1:
self.cardinal_rules_ones[(lang, gender, category)] = name
self.cardinal_rules[lang] = cardinals
self.ordinal_rules[lang] = ordinals
ordinal_indicators = data.get('ordinal_indicators')
if ordinal_indicators is not None and hasattr(ordinal_indicators, '__getslice__'):
for rule_set in ordinal_indicators:
gender = rule_set.get('gender', None)
category = rule_set.get('category', None)
self.ordinal_suffix_rules[(lang, gender, category)] = OrdinalSuffixTrie(rule_set['suffixes'])
def get_suffixes(self, num, lang, gender=None, category=None):
trie = self.ordinal_suffix_rules.get((lang, gender, category))
if not trie:
return None
return trie.search_suffix(str(num))
def get_suffix(self, num, lang, gender=None, category=None):
suffixes = self.get_suffixes(num, lang, gender=gender, category=category)
if not suffixes:
return None
return random.choice(suffixes)
def suffixed_number(self, num, lang, gender=None, category=None):
suffix = self.get_suffix(num, lang, gender=gender, category=category)
if not suffix:
return None
return six.u('{}{}').format(safe_decode(num), safe_decode(suffix))
ordinal_expressions = OrdinalExpressions()
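# Illustrative calls; the suffixes come from the per-language YAML rules under
# resources/numex, so the outputs shown are assumptions rather than guarantees:
#
#   ordinal_expressions.get_suffixes(2, 'en')     # -> e.g. [u'nd']
#   ordinal_expressions.suffixed_number(2, 'en')  # -> e.g. u'2nd'
#   ordinal_expressions.suffixed_number(1, 'es', gender='f')  # -> e.g. u'1ª'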

View File

@@ -0,0 +1,449 @@
import bisect
import math
import os
import random
import six
import yaml
from collections import defaultdict
from geodata.numbers.numex import NUMEX_DATA_DIR
class NumericExpressions(object):
default_separator = ' '
def __init__(self, base_dir=NUMEX_DATA_DIR):
self.cardinal_rules = {}
self.cardinal_rules_sorted = {}
self.cardinal_rules_ones = defaultdict(dict)
self.cardinal_rules_ones_sorted = {}
self.default_separators = {}
self.ordinal_rules = {}
self.ordinal_suffix_rules = {}
for filename in os.listdir(base_dir):
if filename.endswith('.yaml'):
lang = filename.split('.yaml')[0]
f = open(os.path.join(base_dir, filename))
data = yaml.load(f)
default_separator = data.get('default_separator')
if default_separator is not None:
self.default_separators[lang] = default_separator
rules = data.get('rules')
if rules is not None and hasattr(rules, '__getslice__'):
cardinals = defaultdict(list)
ordinals = defaultdict(list)
for rule in rules:
name = rule.get('name')
value = rule.get('value')
rule_type = rule.get('type')
if not name or type(value) not in (int, float) or rule_type not in ('cardinal', 'ordinal'):
continue
gender = rule.get('gender', None)
category = rule.get('category', None)
if rule_type == 'ordinal':
ordinals[(value, gender, category)].append(rule)
else:
cardinals[(value, gender, category)].append(rule)
if value == 1 and 'multiply_gte' in rule:
self.cardinal_rules_ones[lang][rule['multiply_gte']] = rule
self.cardinal_rules[lang] = cardinals
self.ordinal_rules[lang] = ordinals
self.cardinal_rules_sorted[lang] = sorted(set([v for v, g, c in cardinals]))
self.cardinal_rules_ones_sorted[lang] = sorted(self.cardinal_rules_ones[lang].keys())
self.cardinal_rules_ones = dict(self.cardinal_rules_ones)
def spellout_cardinal(self, num, lang, gender=None, category=None, random_choice_cardinals=False):
num = int(num)
remainder = 0
if lang not in self.cardinal_rules:
return None
rules = self.cardinal_rules.get(lang)
cardinals = self.cardinal_rules_sorted.get(lang)
if not rules or not cardinals:
return None
default_separator = self.default_separators.get(lang, self.default_separator)
if num == 0:
cardinal = rules.get((num, gender, category))
if cardinal:
if not random_choice_cardinals:
cardinal = cardinal[0]
else:
cardinal = random.choice(cardinal)
return cardinal['name']
else:
return None
cardinal_part = []
last_rule = {}
left_multiply_rules = []
while num:
i = bisect.bisect_left(cardinals, num)
if i > len(cardinals) - 1:
return None
if i > 0 and cardinals[i] > num:
val = cardinals[i - 1]
else:
val = cardinals[i]
multiple = num // val
if val == num:
cardinal = rules.get((num, gender, category))
else:
cardinal = rules.get((val, None, None), [])
multiple_rule = None
if multiple > 1:
multiple_val = rules.get((multiple, None, None))
if multiple_val:
if not random_choice_cardinals:
multiple_rule = multiple_val[0]
else:
multiple_rule = random.choice(multiple_val)
elif multiple == 1 and lang in self.cardinal_rules_ones_sorted:
ones_rules = self.cardinal_rules_ones_sorted[lang]
j = bisect.bisect_right(ones_rules, val)
if j > 0 and ones_rules[j - 1] <= num:
multiple_rule = self.cardinal_rules_ones[lang][ones_rules[j - 1]]
use_multiple = multiple > 1
is_left_multiply = False
did_left_multiply = False
if not use_multiple:
rule = None
if cardinal and not random_choice_cardinals:
rule = cardinal[0]
elif cardinal:
rule = random.choice(cardinal)
else:
for rule in cardinal:
left_multiply = rule.get('left') == 'multiply'
if left_multiply:
if not multiple_rule:
left_multiply_rules.append(rule)
is_left_multiply = True
last_rule = rule
rule = None
break
else:
rule = None
if rule is not None:
left_add = last_rule.get('left') == 'add'
right_add = last_rule.get('right') == 'add'
if multiple_rule:
if right_add and cardinal_part:
cardinal_part.append(last_rule.get('left_separator', default_separator))
cardinal_part.append(multiple_rule['name'])
cardinal_part.append(rule.get('left_separator', default_separator))
if right_add:
if not multiple_rule and cardinal_part:
right_separator = last_rule.get('right_separator', default_separator)
cardinal_part.append(right_separator)
cardinal_part.append(rule['name'])
elif left_add and cardinal_part:
last = cardinal_part.pop()
cardinal_part.append(rule['name'])
left_separator = last_rule.get('left_separator', default_separator)
cardinal_part.append(left_separator)
cardinal_part.append(last)
elif not left_add and not right_add:
cardinal_part.append(rule['name'])
last_rule = rule
if left_multiply_rules and 'right' not in rule and 'left' not in rule:
left_multiply_rule = left_multiply_rules.pop()
left_separator = left_multiply_rule.get('left_separator', default_separator)
cardinal_part.append(left_separator)
cardinal_part.append(left_multiply_rule['name'])
did_left_multiply = True
last_rule = left_multiply_rule
if not is_left_multiply and not did_left_multiply:
num -= (multiple * val)
elif not did_left_multiply:
remainder = num % val
num /= val
else:
num = remainder
did_left_multiply = False
return six.u('').join(cardinal_part)
def roman_numeral(self, num):
numeral = self.spellout_cardinal(num, 'la')
if numeral is None:
return None
return numeral.upper()
def spellout_ordinal(self, num, lang, gender=None, category=None,
random_choice_cardinals=False, random_choice_ordinals=False):
num = int(num)
remainder = 0
if lang not in self.cardinal_rules:
return None
rules = self.ordinal_rules.get(lang)
cardinal_rules = self.cardinal_rules.get(lang)
cardinals = self.cardinal_rules_sorted.get(lang)
if not rules or not cardinal_rules or not cardinals:
return None
default_separator = self.default_separators.get(lang, self.default_separator)
expression = []
last_rule = {}
left_multiply_rules = []
if num == 0 or (num, gender, category) in rules:
ordinals = rules.get((num, gender, category))
if ordinals:
if not random_choice_ordinals:
ordinal = ordinals[0]
else:
ordinal = random.choice(ordinals)
return ordinal['name']
else:
return None
while num:
i = bisect.bisect_left(cardinals, num)
if i > len(cardinals) - 1:
return None
if i > 0 and cardinals[i] > num:
val = cardinals[i - 1]
else:
val = cardinals[i]
if val == num and not remainder:
if last_rule.get('right') == 'add':
ordinals = rules.get((num, gender, category))
if ordinals:
if not random_choice_ordinals:
ordinal = ordinals[0]
else:
ordinal = random.choice(ordinals)
right_separator = last_rule.get('right_separator', default_separator)
return right_separator.join([six.u('').join(expression), ordinal['name']])
else:
return None
elif last_rule.get('left') == 'add':
last_num = last_rule['value']
ordinals = rules.get((last_num, gender, category))
if ordinals:
if not random_choice_ordinals:
ordinal = ordinals[0]
else:
ordinal = random.choice(ordinals)
last_rule = ordinal
expression.pop()
cardinals = cardinal_rules.get((num, None, None))
if cardinals:
if not random_choice_cardinals:
rule = cardinals[0]
else:
rule = random.choice(cardinals)
expression.append(rule['name'])
else:
return None
last = ordinal['name']
left_separator = last_rule.get('left_separator', default_separator)
return left_separator.join([six.u('').join(expression), ordinal['name']])
else:
return None
else:
return None
else:
ordinal = rules.get((val, None, None), [])
cardinal = cardinal_rules.get((val, None, None), [])
multiple = num // val
multiple_rule = None
if multiple > 1:
multiple_val = cardinal_rules.get((multiple, None, None))
if multiple_val:
if not random_choice_cardinals:
multiple_rule = multiple_val[0]
else:
multiple_rule = random.choice(multiple_val)
elif multiple == 1 and lang in self.cardinal_rules_ones_sorted:
ones_rules = self.cardinal_rules_ones_sorted[lang]
j = bisect.bisect_right(ones_rules, val)
if j > 0 and ones_rules[j - 1] <= num:
multiple_rule = self.cardinal_rules_ones[lang][ones_rules[j - 1]]
use_multiple = multiple > 1
is_left_multiply = False
did_left_multiply = False
if not use_multiple:
rule = None
if ordinal and not remainder:
for rule in ordinal:
if rule.get('right') == 'add':
break
else:
rule = None
if not rule and cardinal and not random_choice_cardinals:
rule = cardinal[0]
elif not rule and cardinal:
rule = random.choice(cardinal)
else:
rule = None
have_ordinal = False
if ordinal:
for rule in ordinal:
left_multiply = rule.get('left') == 'multiply'
if left_multiply and rule.get('right') == 'add':
if not multiple_rule:
left_multiply_rules.append(rule)
is_left_multiply = True
last_rule = rule
rule = None
have_ordinal = True
break
else:
rule = None
if not have_ordinal:
for rule in cardinal:
left_multiply = rule.get('left') == 'multiply'
if left_multiply:
if not multiple_rule:
left_multiply_rules.append(rule)
is_left_multiply = True
last_rule = rule
rule = None
break
else:
rule = None
if rule is not None:
left_add = last_rule.get('left') == 'add'
right_add = last_rule.get('right') == 'add'
if multiple_rule:
if right_add and expression:
expression.append(last_rule.get('left_separator', default_separator))
expression.append(multiple_rule['name'])
expression.append(rule.get('left_separator', default_separator))
if right_add:
if not multiple_rule and expression:
right_separator = last_rule.get('right_separator', default_separator)
expression.append(right_separator)
expression.append(rule['name'])
elif left_add and expression:
last = expression.pop()
expression.append(rule['name'])
left_separator = last_rule.get('left_separator', default_separator)
expression.append(left_separator)
expression.append(last)
elif not left_add and not right_add:
expression.append(rule['name'])
last_rule = rule
if left_multiply_rules and 'right' not in rule and 'left' not in rule:
left_multiply_rule = left_multiply_rules.pop()
left_separator = left_multiply_rule.get('left_separator', default_separator)
expression.append(left_separator)
expression.append(left_multiply_rule['name'])
did_left_multiply = True
last_rule = left_multiply_rule
if not is_left_multiply and not did_left_multiply:
num -= (multiple * val)
elif not did_left_multiply:
remainder = num % val
num /= val
else:
num = remainder
remainder = 0
did_left_multiply = False
def spellout_cardinal_hundreds(self, num, lang, gender=None, category=None, splitter=six.u(' ')):
if num % 100 >= 10:
first_hundred = self.spellout_cardinal(num % 100, lang, gender=gender, category=category)
elif num % 100 == 0:
rules = self.cardinal_rules.get(lang)
if not rules:
return None
cardinals = rules.get((100, gender, category))
if not cardinals:
return None
for rule in cardinals:
if rule.get('left') == 'multiply' and not rule.get('exact_multiple_only'):
break
else:
rule = None
if not rule:
return None
first_hundred = rule['name']
else:
rules = self.cardinal_rules.get(lang)
if not rules:
return None
ones_place = num % 10
zero_rules = rules.get((0, gender, category))
if not zero_rules:
return None
ones_place_rules = rules.get((ones_place, gender, category))
if not ones_place_rules:
return None
zero_rule = random.choice(zero_rules)
ones_rule = random.choice(ones_place_rules)
first_hundred = splitter.join([zero_rule['name'], ones_rule['name']])
if not first_hundred:
return None
parts = [first_hundred]
for i in xrange(1, int(math.ceil(math.log(num, 100)))):
part = self.spellout_cardinal(num / 100 ** i, lang, gender=gender, category=category)
if not part:
return None
parts.append(part)
return splitter.join(reversed(parts))
numeric_expressions = NumericExpressions()
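# Illustrative calls; spellings are driven entirely by the per-language YAML
# rules, so the outputs shown are assumptions rather than guarantees:
#
#   numeric_expressions.spellout_cardinal(21, 'en')  # -> e.g. u'twenty one'
#   numeric_expressions.spellout_ordinal(3, 'en')    # -> e.g. u'third'
#   numeric_expressions.roman_numeral(9)             # -> e.g. u'IX'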

View File

@@ -0,0 +1,33 @@
import os
import six
import yaml
this_dir = os.path.realpath(os.path.dirname(__file__))
OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'parser', 'data_sets', 'openaddresses.yaml')
class OpenAddressesConfig(object):
def __init__(self, path=OPENADDRESSES_PARSER_DATA_CONFIG):
self.path = path
config = yaml.load(open(path))
self.config = config['global']
self.country_configs = config['countries']
@property
def sources(self):
for country, config in six.iteritems(self.country_configs):
for file_config in config.get('files', []):
filename = file_config['filename'].rsplit('.', 1)[0]
yield country, filename
for subdir, subdir_config in six.iteritems(config.get('subdirs', {})):
for file_config in subdir_config.get('files', []):
filename = file_config['filename'].rsplit('.', 1)[0]
yield country, subdir, filename
openaddresses_config = OpenAddressesConfig()
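# Illustrative usage (assumed, not part of the original file):
#
#     for path in openaddresses_config.sources:
#         # path is (country, filename) for top-level files, or
#         # (country, subdir, filename) for files under a configured subdirectory
#         print('/'.join(path))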

View File

@@ -0,0 +1,114 @@
import argparse
import os
import requests
import six
import subprocess
import sys
import tempfile
import yaml
from six.moves.urllib_parse import urljoin, quote_plus, unquote_plus
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.openaddresses.config import openaddresses_config
from geodata.csv_utils import unicode_csv_reader
from geodata.file_utils import ensure_dir, download_file, unzip_file, cd, remove_file
from geodata.encoding import safe_encode, safe_decode
BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'
OPENADDRESSES_LATEST_DIR = urljoin(BASE_OPENADDRESSES_DATA_URL, 'latest/run/')
OPENADDRESSES_STATE_FILE_NAME = 'state.txt'
OPENADDRESSES_STATE_URL = urljoin(BASE_OPENADDRESSES_DATA_URL, OPENADDRESSES_STATE_FILE_NAME)
def download_and_unzip_file(url, out_dir):
zip_filename = url.rsplit('/', 1)[-1].strip()
zip_local_path = os.path.join(out_dir, zip_filename)
success = download_file(url, zip_local_path) and unzip_file(zip_local_path, out_dir)
if os.path.exists(zip_local_path):
remove_file(zip_local_path)
return success
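# Note: the downloaded archive is removed after every attempt, even when
# extraction fails, so a retry starts from a fresh download.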
def download_pre_release_downloads(out_dir):
for url in openaddresses_config.config.get('pre_release_downloads', []):
print(six.u('doing pre_release {}').format(safe_decode(url)))
success = download_and_unzip_file(url, out_dir)
if not success:
print(six.u('ERR: could not download {}').format(safe_decode(url)))
return False
return True
def openaddresses_download_all_files(out_dir):
temp_dir = tempfile.gettempdir()
local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
sys.exit('Could not download state.txt file')
reader = unicode_csv_reader(open(local_state_file_path), delimiter='\t')
headers = reader.next()
source_index = headers.index('source')
url_index = headers.index('processed')
download_pre_release_downloads(out_dir)
for row in reader:
source = row[source_index].rsplit('.')[0]
processed = row[url_index]
if not processed or not processed.strip():
continue
print(six.u('doing {}').format(source))
success = download_and_unzip_file(processed, out_dir)
if not success:
print(six.u('ERR: could not download {}').format(source))
remove_file(local_state_file_path)
def openaddresses_download_configured_files(out_dir):
# Fetch any configured pre-release archives once, up front, rather than once per source
download_pre_release_downloads(out_dir)
for path in openaddresses_config.sources:
source = six.b('/').join([safe_encode(p) for p in path])
filename = safe_encode(path[-1]) + six.b('.zip')
zip_url_path = six.b('/').join([safe_encode(p) for p in path[:-1]] + [quote_plus(filename)])
url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path)
print(six.u('doing {}').format(safe_decode(source)))
success = download_and_unzip_file(url, out_dir)
if not success:
print(six.u('ERR: could not download {}').format(source))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--out-dir',
required=True,
help='Output directory')
parser.add_argument('--all', action='store_true',
default=False, help='Download all completed OpenAddresses files')
args = parser.parse_args()
ensure_dir(args.out_dir)
if args.all:
openaddresses_download_all_files(args.out_dir)
else:
openaddresses_download_configured_files(args.out_dir)
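# Example invocations (illustrative; the script filename is an assumption):
#
#     python download_openaddresses.py --out-dir /data/openaddresses        # configured sources only
#     python download_openaddresses.py --out-dir /data/openaddresses --all  # every completed OpenAddresses source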

View File

@@ -0,0 +1,698 @@
# -*- coding: utf-8 -*-
import csv
import ftfy
import itertools
import os
import random
import re
import six
import yaml
from geodata.addresses.units import Unit
from geodata.address_expansions.abbreviations import abbreviate
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_types_gazetteer, toponym_abbreviations_gazetteer
from geodata.address_formatting.formatter import AddressFormatter
from geodata.addresses.components import AddressComponents
from geodata.countries.constants import Countries
from geodata.countries.names import country_names
from geodata.encoding import safe_decode, safe_encode
from geodata.i18n.languages import get_country_languages
from geodata.i18n.word_breaks import ideographic_scripts
from geodata.language_id.disambiguation import UNKNOWN_LANGUAGE, get_string_script
from geodata.math.sampling import cdf, weighted_choice
from geodata.openaddresses.config import openaddresses_config
from geodata.places.config import place_config
from geodata.postal_codes.phrases import PostalCodes
from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types
from geodata.text.utils import is_numeric, is_numeric_strict
from geodata.csv_utils import tsv_string, unicode_csv_reader
OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
OPENADDRESSES_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
null_regex = re.compile('^\s*(?:null|none)\s*$', re.I)
unknown_regex = re.compile(r'\bunknown\b', re.I)
not_applicable_regex = re.compile('^\s*n\.?\s*/?\s*a\.?\s*$', re.I)
sin_numero_regex = re.compile('^\s*s\s*/\s*n\s*$', re.I)
russian_number_regex_str = safe_decode(r'(?:№\s*)?(?:(?:[\d]+\w?(?:[\-/](?:(?:[\d]+\w?)|\w))*)|(?:[\d]+\s*\w?)|(?:\b\w\b))')
dom_korpus_stroyeniye_regex = re.compile(safe_decode('(?:(?:дом(?=\s)|д\.?)\s*)?{}(?:(?:\s*,|\s+)\s*(?:(?:корпус(?=\s)|к\.?)\s*){})?(?:(?:\s*,|\s+)\s*(?:(?:строение(?=\s)|с\.?)\s*){})?\s*$').format(russian_number_regex_str, russian_number_regex_str, russian_number_regex_str), re.I | re.U)
uchastok_regex = re.compile(safe_decode('{}\s*(?:,?\s*участок\s+{}\s*)?$').format(russian_number_regex_str, russian_number_regex_str), re.I | re.U)
bea_nomera_regex = re.compile(safe_decode('^\s*б\s*/\s*н\s*$'), re.I)
fraction_regex = re.compile('^\s*[\d]+[\s]*/[\s]*(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)[\s]*$', re.I)
number_space_letter_regex = re.compile('^[\d]+\s+[a-z]$', re.I)
number_slash_number_regex = re.compile('^(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)[\s]*/[\s]*(?:[\d]+|[a-z]|[\d]+[a-z]|[a-z][\d]+)$', re.I)
number_fraction_regex = re.compile('^(?:[\d]+\s+)?(?:1[\s]*/[\s]*[234]|2[\s]*/[\s]*3)$')
colombian_standard_house_number_regex = re.compile('^(\d+[\s]*[a-z]?)\s+([a-z]?[\d]+[\s]*[a-z]?)?', re.I)
dutch_house_number_regex = re.compile('([\d]+)( [a-z])?( [\d]+)?', re.I)
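# Illustrative matches for the patterns above (not exhaustive):
#   fraction_regex:                u'1/2', u'3/A'
#   number_space_letter_regex:     u'12 B'
#   number_slash_number_regex:     u'12/3', u'4B/7'
#   number_fraction_regex:         u'12 1/2', u'2/3'
#   colombian_standard_house_number_regex: u'12A 34' (cross street + building number)
#   dutch_house_number_regex:      u'12 a 3'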
SPANISH = 'es'
PORTUGUESE = 'pt'
RUSSIAN = 'ru'
CHINESE = 'zh'
class OpenAddressesFormatter(object):
field_regex_replacements = {
# All fields
None: [
(re.compile('<\s*null\s*>', re.I), u''),
(re.compile('[\s]{2,}'), six.u(' ')),
(re.compile('\`'), u"'"),
(re.compile('\-?\*'), u""),
],
AddressFormatter.HOUSE_NUMBER: [
# Most of the house numbers in Montreal start with "#"
(re.compile('^#', re.UNICODE), u''),
# Some house numbers have multiple hyphens
(re.compile('[\-]{2,}'), u'-'),
# Some house number ranges are split up like "12 -14"
(re.compile('[\s]*\-[\s]*'), u'-'),
]
}
unit_type_regexes = {}
for (lang, dictionary_type), values in six.iteritems(address_phrase_dictionaries.phrases):
if dictionary_type == 'unit_types_numbered':
unit_phrases = [safe_encode(p) for p in itertools.chain(*values) if len(p) > 2]
pattern = re.compile(r'\b(?:{})\s+(?:#?\s*)(?:[\d]+|[a-z]|[a-z]\-?[\d]+|[\d]+\-?[a-z])\s*$'.format(safe_encode('|').join(unit_phrases)),
re.I | re.UNICODE)
unit_type_regexes[lang] = pattern
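# Illustrative: for a language whose unit_types_numbered dictionary includes
# phrases like "apt"/"unit"/"suite", the compiled pattern matches trailing
# numbered-unit phrases such as u'apt 2b', u'unit # 3' or u'suite 401' at the
# end of a street string (the exact phrases are data-dependent).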
def __init__(self, components, country_rtree, debug=False):
self.components = components
self.country_rtree = country_rtree
self.debug = debug
self.formatter = AddressFormatter()
class validators:
@classmethod
def validate_postcode(cls, postcode):
'''
Postcodes that are all zeros are improperly-formatted NULL values
'''
return not all((c in ('0', '-', '.', ' ', ',') for c in postcode))
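# Illustrative: u'00000', u'0-0' and u'00.00' are rejected as NULL-like;
# any postcode containing a nonzero digit or a letter passes.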
@classmethod
def validate_street(cls, street):
'''
Streets should not be simple numbers. If they are it's probably a
copy/paste error and should be the house number.
'''
return not is_numeric(street)
@classmethod
def validate_house_number(cls, house_number):
'''
House numbers don't necessarily have to be numeric, but in some of the
OpenAddresses data sets the house number field is equal to the capitalized
street name, so this check at least guards against nonsensical values at the
cost of possibly missing a few houses legitimately numbered "A", etc.
OpenAddresses also comes primarily from county GIS servers and similar sources
with a variety of database schemas that don't always handle NULLs well. While
a single zero can be a valid house number, in OpenAddresses it is more likely
an error, and a value of more than one zero is almost certainly one.
'''
try:
house_number = int(house_number.strip())
return house_number > 0
except (ValueError, TypeError):
house_number = house_number.strip()
return house_number and (is_numeric(house_number) or fraction_regex.match(house_number) or number_space_letter_regex.match(house_number) or
number_slash_number_regex.match(house_number) or number_fraction_regex.match(house_number)) and not all((c == '0' for c in house_number if c.isdigit()))
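# Illustrative outcomes: u'123', u'12 B', u'1/2' validate; u'0', u'000' and
# non-numeric values like u'MAIN ST' are rejected (falsy return).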
@classmethod
def validate_house_number_sin_numero(cls, house_number):
if sin_numero_regex.match(house_number):
return True
return cls.validate_house_number(house_number)
@classmethod
def validate_russian_house_number(cls, house_number):
if dom_korpus_stroyeniye_regex.match(house_number):
return True
elif uchastok_regex.match(house_number):
return True
elif bea_nomera_regex.match(house_number):
return True
return cls.validate_house_number(house_number)
@classmethod
def validate_colombian_house_number(cls, house_number):
return True
@classmethod
def validate_chinese_house_number(cls, house_number):
if not house_number:
return False
tokens = tokenize(house_number)
# The literal characters below were lost in encoding; they are assumed to be
# the common house-number/building markers 号, 栋 and 楼
if all((c in token_types.NUMERIC_TOKEN_TYPES or t in (u'号', u'栋', u'楼')) for t, c in tokens):
return True
return cls.validate_house_number(house_number)
component_validators = {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
AddressFormatter.ROAD: validators.validate_street,
AddressFormatter.POSTCODE: validators.validate_postcode,
}
language_validators = {
SPANISH: {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
},
PORTUGUESE: {
AddressFormatter.HOUSE_NUMBER: validators.validate_house_number_sin_numero,
},
RUSSIAN: {
AddressFormatter.HOUSE_NUMBER: validators.validate_russian_house_number,
},
CHINESE: {
AddressFormatter.HOUSE_NUMBER: validators.validate_chinese_house_number,
}
}
country_validators = {
Countries.COLOMBIA: {
AddressFormatter.HOUSE_NUMBER: validators.validate_colombian_house_number
}
}
chinese_annex_regex = re.compile(u'([\d]+)(?![\d号栋])', re.U)
@classmethod
def format_chinese_house_number(cls, house_number):
if not house_number:
return house_number
return cls.chinese_annex_regex.sub(u'\\1号', house_number)
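# Illustrative: u'12' becomes u'12号'; values already ending in 号 or 栋 are
# left unchanged by the negative lookahead.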
@classmethod
def format_colombian_house_number(cls, house_number):
house_number = house_number.strip()
match = colombian_standard_house_number_regex.match(house_number)
if match:
separator = random.choice((u'-', u' - ', u' '))
cross_street, building_number = match.groups()
numbers = []
if cross_street and u' ' in cross_street and random.choice((True, False)):
cross_street = cross_street.replace(u' ', u'')
if cross_street:
numbers.append(cross_street)
if building_number and u' ' in building_number and random.choice((True, False)):
building_number = building_number.replace(u' ', u'')
if building_number:
numbers.append(building_number)
if numbers:
house_number = separator.join(numbers)
house_number_prefixes = (u'#', u'no.', u'no', u'')
if random.choice((True, False)) and not any((house_number.lower().startswith(p) for p in house_number_prefixes)):
house_number = u' '.join([random.choice(house_number_prefixes), house_number])
return house_number
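# Illustrative: u'12A 34' may be emitted as u'12A-34', u'# 12A - 34',
# u'no. 12A 34', etc., depending on the random separator/prefix choices above.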
def get_property(self, key, *configs):
for config in configs:
value = config.get(key, None)
if value is not None:
return value
return None
def cldr_country_name(self, country_code, language, configs):
cldr_country_prob = float(self.get_property('cldr_country_probability', *configs))
country_name = None
if random.random() < cldr_country_prob:
localized, iso_3166, alpha2, alpha3 = values = range(4)
localized_prob = float(self.get_property('localized_name_probability', *configs))
iso_3166_prob = float(self.get_property('iso_3166_name_probability', *configs))
alpha2_prob = float(self.get_property('iso_alpha_2_code_probability', *configs))
alpha3_prob = float(self.get_property('iso_alpha_3_code_probability', *configs))
probs = cdf([localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob])
country_type = weighted_choice(values, probs)
country_name = country_code.upper()
if country_type == localized:
country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
elif country_type == iso_3166:
country_name = country_names.iso3166_name(country_code)
elif country_type == alpha3:
country_name = country_names.alpha3_code(country_code) or country_name
return country_name
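# Note: no explicit alpha2 branch is needed above; country_name is initialized
# to country_code.upper(), which is already the ISO alpha-2 code, so that case
# falls through to the default.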
@classmethod
def cleanup_number(cls, num, strip_commas=False):
num = num.strip()
if strip_commas:
num = num.replace(six.u(','), six.u(''))
try:
num_int = int(num)
except (ValueError, TypeError):
try:
num_float = float(num)
leading_zeros = 0
for c in num:
if c == six.u('0'):
leading_zeros += 1
else:
break
num = safe_decode(int(num_float))
if leading_zeros:
num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
except (ValueError, TypeError):
pass
return num
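# Illustrative: cleanup_number(u'1,234', strip_commas=True) -> u'1234';
# cleanup_number(u'00123.0') -> u'00123' (the float is truncated to an int and
# leading zeros are preserved); values that parse as neither are returned
# stripped but otherwise unchanged.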
@classmethod
def fix_component_encodings(cls, components):
return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)}
def formatted_addresses(self, country_dir, path, configs, tag_components=True):
abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
abbreviate_toponym_prob = float(self.get_property('abbreviate_toponym_probability', *configs))
add_osm_boundaries = bool(self.get_property('add_osm_boundaries', *configs) or False)
add_osm_neighborhoods = bool(self.get_property('add_osm_neighborhoods', *configs) or False)
osm_neighborhood_overrides_city = self.get_property('osm_neighborhood_overrides_city', *configs)
non_numeric_units = bool(self.get_property('non_numeric_units', *configs) or False)
house_number_strip_commas = bool(self.get_property('house_number_strip_commas', *configs) or False)
numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)
address_only_probability = float(self.get_property('address_only_probability', *configs))
place_only_probability = float(self.get_property('place_only_probability', *configs))
place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))
city_replacements = self.get_property('city_replacements', *configs)
override_country_dir = self.get_property('override_country_dir', *configs)
postcode_length = int(self.get_property('postcode_length', *configs) or 0)
drop_address_probability = place_only_probability + place_and_postcode_probability
ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])
ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE)
for field, value in six.iteritems(dict(self.get_property('ignore_fields_containing', *configs) or {}))}
alias_fields_containing = {field: [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
for field, value in six.iteritems(dict(self.get_property('alias_fields_containing', *configs) or {}))}
config_language = self.get_property('language', *configs)
add_components = self.get_property('add', *configs)
fields = self.get_property('fields', *configs)
if not fields:
return
field_map = {field_name: f['component'] for field_name, f in six.iteritems(fields)}
mapped_values = {f['component']: f['value_map'] for f in six.itervalues(fields) if hasattr(f.get('value_map'), 'get')}
f = open(path)
reader = unicode_csv_reader(f)
headers = reader.next()
header_indices = {i: field_map[k] for i, k in enumerate(headers) if k in field_map}
latitude_index = headers.index('LAT')
longitude_index = headers.index('LON')
# Clear cached polygons
self.components.osm_admin_rtree.clear_cache()
self.components.neighborhoods_rtree.clear_cache()
for row in reader:
try:
latitude = float(row[latitude_index])
longitude = float(row[longitude_index])
except (ValueError, TypeError):
continue
language = config_language
components = {}
skip_record = False
for i, key in six.iteritems(header_indices):
value = row[i].strip()
if not value and key in ignore_rows_missing_fields:
skip_record = True
break
elif not value:
continue
if key in mapped_values:
value = mapped_values[key].get(value, value)
if key == AddressFormatter.ROAD and language == SPANISH:
value = self.components.spanish_street_name(value)
if key == AddressFormatter.POSTCODE:
value = self.cleanup_number(value)
if postcode_strip_non_digit_chars:
value = six.u('').join((c for c in value if c.isdigit()))
if value and not is_numeric(value) and numeric_postcodes_only:
continue
else:
if postcode_length:
value = value.zfill(postcode_length)[:postcode_length]
if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
if add_osm_boundaries:
continue
value = self.components.cleaned_name(value, first_comma_delimited_phrase=True)
if value and ((len(value) < 2 and not get_string_script(value)[0].lower() in ideographic_scripts) or is_numeric(value)):
continue
if not_applicable_regex.match(value) or null_regex.match(value) or unknown_regex.match(value):
continue
for exp, sub_val in self.field_regex_replacements.get(key, []):
value = exp.sub(sub_val, value)
for exp, sub_val in self.field_regex_replacements.get(None, []):
value = exp.sub(sub_val, value)
value = value.strip(', -')
validator = self.country_validators.get(country_dir, {}).get(key, self.language_validators.get(language, {}).get(key, self.component_validators.get(key, None)))
if validator is not None and not validator(value):
continue
if key in ignore_fields_containing and ignore_fields_containing[key].search(value):
continue
for (pattern, alias) in alias_fields_containing.get(key, []):
if pattern.search(value):
if 'component' in alias:
key = alias['component']
if value:
components[key] = value
if skip_record:
continue
if components:
country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages) or (country != country_dir and not override_country_dir):
country = country_dir
candidate_languages = get_country_languages(country)
if not candidate_languages:
continue
candidate_languages = candidate_languages.items()
components = self.fix_component_encodings(components)
if language is None:
language = AddressComponents.address_language(components, candidate_languages)
street = components.get(AddressFormatter.ROAD, None)
if street is not None:
street = street.strip()
street = AddressComponents.cleaned_name(street)
if language == UNKNOWN_LANGUAGE:
strip_unit_language = candidate_languages[0][0] if candidate_languages else None
else:
strip_unit_language = language
street = self.components.strip_unit_phrases_for_language(street, strip_unit_language)
street = abbreviate(street_types_gazetteer, street, language,
abbreviate_prob=abbreviate_street_prob,
separate_prob=separate_street_prob)
components[AddressFormatter.ROAD] = street
house_number = components.get(AddressFormatter.HOUSE_NUMBER, None)
if house_number:
house_number = self.cleanup_number(house_number, strip_commas=house_number_strip_commas)
if language == CHINESE:
house_number = self.format_chinese_house_number(house_number)
if country_dir == Countries.COLOMBIA:
house_number = self.format_colombian_house_number(house_number)
if house_number is not None:
components[AddressFormatter.HOUSE_NUMBER] = house_number
unit = components.get(AddressFormatter.UNIT, None)
street_required = country not in (Countries.JAPAN, Countries.CZECH_REPUBLIC) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES
postcode = components.get(AddressFormatter.POSTCODE, None)
if postcode:
components[AddressFormatter.POSTCODE] = PostalCodes.add_country_code(postcode, country)
# If there's a postcode, we can still use just the city/state/postcode, otherwise discard
if (not street and street_required) or (street and house_number and (street.lower() == house_number.lower())) or (unit and street and street.lower() == unit.lower()):
if not postcode:
continue
components = self.components.drop_address(components)
# Now that validation checks are complete, fetch the unit, add phrases, abbreviate, etc.
unit = components.get(AddressFormatter.UNIT, None)
if unit is not None:
if is_numeric_strict(unit):
unit = Unit.phrase(unit, language, country=country)
elif non_numeric_units:
unit = abbreviate(unit_types_gazetteer, unit, language,
abbreviate_prob=abbreviate_unit_prob,
separate_prob=separate_unit_prob)
else:
unit = None
if unit is not None:
components[AddressFormatter.UNIT] = unit
else:
components.pop(AddressFormatter.UNIT)
unit = None
# CLDR country name
country_name = self.cldr_country_name(country, language, configs)
if country_name:
components[AddressFormatter.COUNTRY] = country_name
for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
component = components.get(component_key, None)
if component is not None:
component = abbreviate(toponym_abbreviations_gazetteer, component, language,
abbreviate_prob=abbreviate_toponym_prob)
component = self.components.name_hyphens(component)
components[component_key] = component
# Any components specified to be added by the config (usually state)
if add_components:
for k, v in six.iteritems(add_components):
if k not in components:
components[k] = v
# Get named states occasionally, added component is usually a state code
address_state = self.components.state_name(components, country, language)
if address_state:
components[AddressFormatter.STATE] = address_state
state = components.get(AddressFormatter.STATE)
if state:
state = self.components.abbreviated_state(state, country, language)
if state:
components[AddressFormatter.STATE] = state
# This is expensive, so only turn on for files that don't supply their own city names
# or for which those names are flawed
osm_components = []
# Using population=0 instead of None means if there's no known population or
# we don't need to add OSM components, we assume the population of the town is
# very small and the place name shouldn't be used unqualified (i.e. needs information
# like state name to disambiguate it)
population = 0
unambiguous_city = False
if add_osm_boundaries or AddressFormatter.CITY not in components:
osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude)
categorized = self.components.categorized_osm_components(country, osm_components)
for component, label in categorized:
if label == AddressFormatter.CITY:
unambiguous_city = self.components.unambiguous_wikipedia(component, language)
if 'population' in component:
population = component['population']
break
if AddressFormatter.CITY not in components and city_replacements:
components.update({k: v for k, v in six.iteritems(city_replacements) if k not in components})
# The neighborhood index is cheaper so can turn on for whole countries
neighborhood_components = []
if add_osm_neighborhoods:
neighborhood_components = self.components.neighborhood_components(latitude, longitude)
self.components.add_neighborhoods(components, neighborhood_components, country, language, replace_city=osm_neighborhood_overrides_city)
self.components.cleanup_boundary_names(components)
self.components.country_specific_cleanup(components, country)
self.components.replace_name_affixes(components, language, country=country)
self.components.replace_names(components)
self.components.prune_duplicate_names(components)
self.components.remove_numeric_boundary_names(components)
self.components.add_house_number_phrase(components, language, country=country)
self.components.add_postcode_phrase(components, language, country=country)
# Component dropout
all_osm_components = osm_components + neighborhood_components
components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)
self.components.add_genitives(components, language)
formatted = self.formatter.format_address(components, country, language=language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
if random.random() < address_only_probability and street:
address_only_components = self.components.drop_places(components)
address_only_components = self.components.drop_postcode(address_only_components)
formatted = self.formatter.format_address(address_only_components, country, language=language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
rand_val = random.random()
if street and house_number and rand_val < drop_address_probability:
components = self.components.drop_address(components)
if rand_val < place_and_postcode_probability:
components = self.components.drop_postcode(components)
if components and (len(components) > 1 or add_osm_boundaries):
formatted = self.formatter.format_address(components, country, language=language,
minimal_only=False, tag_components=tag_components)
yield (language, country, formatted)
def build_training_data(self, base_dir, out_dir, tag_components=True, sources_only=None):
all_sources_valid = sources_only is None
valid_sources = set()
if not all_sources_valid:
for source in sources_only:
if source.startswith(base_dir):
source = os.path.relpath(source, base_dir)
parts = source.strip('/ ').split('/')
if len(parts) > 3:
raise AssertionError('Sources may only have at maximum 3 parts')
valid_sources.add(tuple(parts))
if tag_components:
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
else:
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
i = 0
for country_dir in sorted(openaddresses_config.country_configs.keys()):
country_config = openaddresses_config.country_configs[country_dir]
# Clear country cache for each new country
self.country_rtree.clear_cache()
for file_config in country_config.get('files', []):
filename = file_config['filename']
if not all_sources_valid and not ((country_dir, filename) in valid_sources or (country_dir,) in valid_sources):
continue
print(six.u('doing {}/{}').format(country_dir, filename))
path = os.path.join(base_dir, country_dir, filename)
configs = (file_config, country_config, openaddresses_config.config)
for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components):
if not formatted_address or not formatted_address.strip():
continue
formatted_address = tsv_string(formatted_address)
if not formatted_address or not formatted_address.strip():
continue
if tag_components:
row = (language, country, formatted_address)
else:
row = (formatted_address,)
writer.writerow(row)
i += 1
if i % 1000 == 0 and i > 0:
print('did {} formatted addresses'.format(i))
if self.debug:
break
for subdir in sorted(country_config.get('subdirs', {}).keys()):
subdir_config = country_config['subdirs'][subdir]
subdir = safe_decode(subdir)
for file_config in subdir_config.get('files', []):
filename = file_config['filename']
if not all_sources_valid and not ((country_dir, subdir, filename) in valid_sources or (country_dir, subdir) in valid_sources or (country_dir,) in valid_sources):
continue
print(six.u('doing {}/{}/{}').format(country_dir, subdir, filename))
path = os.path.join(base_dir, country_dir, subdir, filename)
configs = (file_config, subdir_config, country_config, openaddresses_config.config)
for language, country, formatted_address in self.formatted_addresses(country_dir, path, configs, tag_components=tag_components):
if not formatted_address or not formatted_address.strip():
continue
formatted_address = tsv_string(formatted_address)
if not formatted_address or not formatted_address.strip():
continue
if tag_components:
row = (language, country, formatted_address)
else:
row = (formatted_address,)
writer.writerow(row)
i += 1
if i % 1000 == 0 and i > 0:
print('did {} formatted addresses'.format(i))
if self.debug:
break
