Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions
--- a/scripts/geodata/geonames/init.py
+++ b/scripts/geodata/geonames/init.py
--- a/scripts/geodata/geonames/create_geonames_tsv.py
+++ b/scripts/geodata/geonames/create_geonames_tsv.py
@@ -0,0 +1,688 @@
+'''
+create_geonames_tsv.py
+----------------------
+
+This script formats the open GeoNames database (as well as
+its accompanying postal codes data set) into a schema'd
+tab-separated value file.
+
+It generates a C header which uses an enum for the field names.
+This way if new fields are added or there's a typo, etc. the
+error will show up at compile-time.
+
+The relevant C modules which operate on this data are:
+    geodb_builder.c
+    geonames.c
+
+As well as the generated headers:
+    geonames_fields.h
+    postal_fields.h
+'''
+
+import argparse
+import csv
+import logging
+import operator
+import os
+import re
+import sqlite3
+import subprocess
+import sys
+
+import pycountry
+
+import unicodedata
+
+import urllib
+import urlparse
+
+from collections import defaultdict, OrderedDict
+from lxml import etree
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
+
+from geodata.csv_utils import *
+from geodata.file_utils import *
+from geodata.countries.country_names import *
+from geodata.encoding import safe_encode, safe_decode
+from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
+from geodata.i18n.languages import *
+from geodata.i18n.unicode_paths import CLDR_DIR
+from geodata.log import log_to_file
+
+# Matches any run of whitespace characters. Written as a raw string so the
+# backslash escape reaches the regex engine untouched.
+multispace_regex = re.compile(r'[\s]+')
+
+
+def encode_field(value):
+    '''
+    Prepare a single value for output: None is treated as the empty
+    string, the value is passed through safe_encode, and every run of
+    whitespace is collapsed to a single space.
+    '''
+    if value is None:
+        value = ''
+    encoded = safe_encode(value)
+    return multispace_regex.sub(' ', encoded)
+
+# Logging helper from geodata.log, pointed at stderr
+# (presumably installs a handler on the root logger - TODO confirm)
+log_to_file(sys.stderr)
+
+# Default output directory: three levels above this script's directory,
+# then data/geonames
+DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
+                                os.path.pardir, 'data', 'geonames')
+
+# GeoNames feature codes, grouped by the kind of place they describe
+COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
+CONTINENT_FEATURE_CODES = ('CONT',)
+
+ADMIN_1_FEATURE_CODES = ('ADM1',)
+ADMIN_2_FEATURE_CODES = ('ADM2',)
+ADMIN_3_FEATURE_CODES = ('ADM3',)
+ADMIN_4_FEATURE_CODES = ('ADM4',)
+# NOTE(review): OTHER_ADMIN_FEATURE_CODES (ADM5) and CONTINENT_FEATURE_CODES
+# are not referenced by geonames_admin_dictionaries in this file; only
+# ADMIN_OTHER_FEATURE_CODES (ADMD) is mapped to boundary_types.ADMIN_OTHER.
+OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
+ADMIN_OTHER_FEATURE_CODES = ('ADMD', )
+
+POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
+                                 'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
+                                 'PPLR', 'PPLS', 'STLMT')
+NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
+
+class boundary_types:
+    '''
+    Integer codes written to the boundary type column of geonames.tsv,
+    numbered from the largest boundary (country = 0) to the smallest
+    (neighborhood = 7).
+    '''
+    (COUNTRY, ADMIN1, ADMIN2, ADMIN3, ADMIN4,
+     ADMIN_OTHER, LOCALITY, NEIGHBORHOOD) = range(8)
+
+# Boundary type -> GeoNames feature codes belonging to it. The OrderedDict
+# keeps the iteration order used by create_geonames_tsv (country first,
+# neighborhood last).
+geonames_admin_dictionaries = OrderedDict([
+    (boundary_types.COUNTRY, COUNTRY_FEATURE_CODES),
+    (boundary_types.ADMIN1, ADMIN_1_FEATURE_CODES),
+    (boundary_types.ADMIN2, ADMIN_2_FEATURE_CODES),
+    (boundary_types.ADMIN3, ADMIN_3_FEATURE_CODES),
+    (boundary_types.ADMIN4, ADMIN_4_FEATURE_CODES),
+    (boundary_types.ADMIN_OTHER, ADMIN_OTHER_FEATURE_CODES),
+    (boundary_types.LOCALITY, POPULATED_PLACE_FEATURE_CODES),
+    (boundary_types.NEIGHBORHOOD, NEIGHBORHOOD_FEATURE_CODES),
+])
+
+# Inserted post-query
+# (placeholder select expressions: the query returns a constant for these
+# columns and create_geonames_tsv overwrites it for each row)
+DUMMY_BOUNDARY_TYPE = '-1 as type'
+DUMMY_HAS_WIKIPEDIA_ENTRY = '0 as has_wikipedia_entry'
+DUMMY_LANGUAGE_PRIORITY = '0 as language_priority'
+
+
+class GeonamesField(object):
+    '''
+    One output column of a generated TSV file.
+
+    name:       SQL select expression for the column
+    c_constant: name of the matching constant in the generated C enum
+    default:    SQL expression selected in place of the column (aliased
+                to name) in the first half of base_geonames_query, or
+                None to select name itself
+    is_dummy:   flag marking columns whose value is filled in after the
+                query has run
+    '''
+    def __init__(self, name, c_constant, default=None, is_dummy=False):
+        self.name, self.c_constant = name, c_constant
+        self.default, self.is_dummy = default, is_dummy
+
+# Columns of geonames.tsv, in output order. The position of each entry is
+# also the value of its constant in the generated geonames_fields.h enum.
+geonames_fields = [
+    # Field if alternate_names present, default field name if not, C header constant
+    GeonamesField('alternate_name', 'GEONAMES_NAME', default='gn.name'),
+    GeonamesField('gn.geonames_id as geonames_id', 'GEONAMES_ID'),
+    GeonamesField('gn.name as canonical', 'GEONAMES_CANONICAL'),
+    GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True),
+    GeonamesField(DUMMY_HAS_WIKIPEDIA_ENTRY, 'GEONAMES_HAS_WIKIPEDIA_ENTRY', is_dummy=True),
+    GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"),
+    GeonamesField(DUMMY_LANGUAGE_PRIORITY, 'GEONAMES_LANGUAGE_PRIORITY', is_dummy=True),
+    GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'),
+    GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'),
+    GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'),
+    GeonamesField('is_historic', 'GEONAMES_IS_HISTORICAL', default='0'),
+    GeonamesField('gn.population', 'GEONAMES_POPULATION'),
+    GeonamesField('gn.latitude', 'GEONAMES_LATITUDE'),
+    GeonamesField('gn.longitude', 'GEONAMES_LONGITUDE'),
+    GeonamesField('gn.feature_code', 'GEONAMES_FEATURE_CODE'),
+    GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'),
+    GeonamesField('c.geonames_id as country_gn_id', 'GEONAMES_COUNTRY_ID'),
+    GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'),
+    GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'),
+    GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'),
+    GeonamesField('a2.geonames_id as a2_gn_id', 'GEONAMES_ADMIN2_ID'),
+    GeonamesField('gn.admin3_code as admin3_code', 'GEONAMES_ADMIN3_CODE'),
+    GeonamesField('a3.geonames_id as a3_gn_id', 'GEONAMES_ADMIN3_ID'),
+    GeonamesField('gn.admin4_code as admin4_code', 'GEONAMES_ADMIN4_CODE'),
+    GeonamesField('a4.geonames_id as a4_gn_id', 'GEONAMES_ADMIN4_ID'),
+]
+
+def geonames_field_index(s):
+    '''
+    Return the position in geonames_fields of the first field whose C
+    constant equals s, or None when no field matches.
+    '''
+    positions = (idx for idx, field in enumerate(geonames_fields)
+                 if field.c_constant == s)
+    return next(positions, None)
+
+
+# Column positions within a geonames query row / TSV line, looked up once
+# by C constant name so the row-processing code can index rows directly.
+DUMMY_BOUNDARY_TYPE_INDEX = geonames_field_index('GEONAMES_BOUNDARY_TYPE')
+DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX = geonames_field_index('GEONAMES_HAS_WIKIPEDIA_ENTRY')
+
+GEONAMES_ID_INDEX = geonames_field_index('GEONAMES_ID')
+LANGUAGE_INDEX = geonames_field_index('GEONAMES_ISO_LANGUAGE')
+
+DUMMY_LANGUAGE_PRIORITY_INDEX = geonames_field_index('GEONAMES_LANGUAGE_PRIORITY')
+
+CANONICAL_NAME_INDEX = geonames_field_index('GEONAMES_CANONICAL')
+
+NAME_INDEX = geonames_field_index('GEONAMES_NAME')
+
+COUNTRY_CODE_INDEX = geonames_field_index('GEONAMES_COUNTRY_CODE')
+
+POPULATION_INDEX = geonames_field_index('GEONAMES_POPULATION')
+
+PREFERRED_INDEX = geonames_field_index('GEONAMES_IS_PREFERRED_NAME')
+
+HISTORICAL_INDEX = geonames_field_index('GEONAMES_IS_HISTORICAL')
+
+
+# Left joins resolving a row's admin1-4 codes to rows of the admin*_codes
+# tables. Each level matches on its own code plus all parent codes and the
+# country code.
+geonames_admin_joins = '''
+left join admin1_codes a1
+    on a1.code = gn.admin1_code
+    and a1.country_code = gn.country_code
+left join admin2_codes a2
+    on a2.code = gn.admin2_code
+    and a2.admin1_code = gn.admin1_code
+    and a2.country_code = gn.country_code
+left join admin3_codes a3
+    on a3.code = gn.admin3_code
+    and a3.admin1_code = gn.admin1_code
+    and a3.admin2_code = gn.admin2_code
+    and a3.country_code = gn.country_code
+left join admin4_codes a4
+    on a4.code = gn.admin4_code
+    and a4.admin1_code = gn.admin1_code
+    and a4.admin2_code = gn.admin2_code
+    and a4.admin3_code = gn.admin3_code
+    and a4.country_code = gn.country_code
+'''
+
+# Canonical names are stored in the geonames table with alternates
+# stored in a separate table. UNION ALL query will capture them all.
+#
+# The template is formatted in two stages. The .format() call below fills
+# in the column lists and the admin joins; the doubled braces leave a
+# literal {predicate} placeholder, which create_geonames_tsv fills in for
+# each boundary type. In the first select, fields that have a default are
+# emitted as "<default> as <name>", so both halves of the union produce
+# the same columns.
+
+base_geonames_query = '''
+select {geonames_fields}
+from geonames gn
+join countries c
+    on gn.country_code = c.country_code
+{admin_joins}
+{{predicate}}
+union all
+select {alt_name_fields}
+from geonames gn
+join countries c
+    on gn.country_code = c.country_code
+join alternate_names an
+    on an.geonames_id = gn.geonames_id
+    and iso_language not in ('doi','faac','iata',
+                             'icao','link','post','tcid')
+    and an.alternate_name != gn.name
+{admin_joins}
+{{predicate}}
+'''.format(
+    geonames_fields=', '.join((f.name if f.default is None else
+                               '{} as {}'.format(f.default, f.name)
+                               for f in geonames_fields)),
+    alt_name_fields=', '.join((f.name for f in geonames_fields)),
+    admin_joins=geonames_admin_joins
+)
+
+# Country codes whose rows are excluded from postal_codes_query
+IGNORE_COUNTRY_POSTAL_CODES = set([
+    'AR',   # GeoNames has pre-1999 postal codes
+])
+
+# Columns of postal_codes.tsv, in output order. The position of each entry
+# is also the value of its constant in the generated postal_fields.h enum.
+postal_code_fields = [
+    GeonamesField('postal_code', 'GN_POSTAL_CODE'),
+    GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'),
+    GeonamesField('c.geonames_id as country_geonames_id', 'GN_POSTAL_COUNTRY_GEONAMES_ID'),
+    GeonamesField('c.population as country_population', 'GN_POSTAL_COUNTRY_POPULATION'),
+    GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'),
+    # The admin id columns are comma-joined lists (SQLite group_concat)
+    GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'),
+    GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'),
+    GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'),
+]
+
+def postal_code_field_index(s):
+    '''
+    Return the position in postal_code_fields of the first field whose C
+    constant equals s, or None when no field matches.
+    '''
+    positions = (idx for idx, field in enumerate(postal_code_fields)
+                 if field.c_constant == s)
+    return next(positions, None)
+
+# Column positions used as sort keys in create_postal_codes_tsv
+POSTAL_CODE_INDEX = postal_code_field_index('GN_POSTAL_CODE')
+POSTAL_CODE_POP_INDEX = postal_code_field_index('GN_POSTAL_COUNTRY_POPULATION')
+
+# One row per (postal_code, country_code). Each postal code is joined to
+# its country, optionally to a geonames entry that lists the same code as
+# an alternate name with iso_language 'post' in the same country
+# (subquery n), and optionally to the admin1-3 tables. Countries in
+# IGNORE_COUNTRY_POSTAL_CODES are filtered out.
+# NOTE(review): n.geonames_id is neither aggregated nor part of the
+# group by, so when several geonames entries match a postal code the one
+# returned is whichever row SQLite picks - confirm this is acceptable.
+postal_codes_query = '''
+select
+{fields}
+from postal_codes p
+join countries c
+    on p.country_code = c.country_code
+left join (
+    select
+    gn.geonames_id,
+    alternate_name,
+    country_code,
+    gn.name
+    from alternate_names an
+    join geonames gn
+        on an.geonames_id = gn.geonames_id
+    where iso_language = 'post'
+) as n
+on p.postal_code = n.alternate_name
+and p.country_code = n.country_code
+left join admin1_codes a1
+    on a1.code = p.admin1_code
+    and p.country_code = a1.country_code
+left join admin2_codes a2
+    on a2.code = p.admin2_code
+    and a2.admin1_code = p.admin1_code
+    and a2.country_code = p.country_code
+left join admin3_codes a3
+    on a3.code = p.admin3_code
+    and a3.admin1_code = p.admin1_code
+    and a3.admin2_code = p.admin2_code
+    and a3.country_code = p.country_code
+where p.country_code not in ({exclude_country_codes})
+group by postal_code, p.country_code
+'''.format(
+    fields=','.join([f.name for f in postal_code_fields]),
+    exclude_country_codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
+
+
+# Alternate-name rows of type 'link' that point at English Wikipedia.
+# This string is not %-formatted anywhere in this file, so the doubled
+# percent signs reach SQLite as written (two adjacent LIKE wildcards).
+wikipedia_query = '''
+select alternate_name, geonames_id, is_preferred_name
+from alternate_names
+where iso_language = 'link'
+and alternate_name like '%%en.wikipedia%%'
+order by alternate_name, is_preferred_name
+'''
+
+# Number of rows fetched from a cursor per fetchmany() call
+BATCH_SIZE = 2000
+
+
+# Captures the text before a parenthesized suffix, e.g. "Name (qualifier)".
+# NOTE(review): not referenced anywhere else in this file.
+wiki_paren_regex = re.compile('(.*)[\s]*\(.*?\)[\s]*')
+
+
+def normalize_wikipedia_title(title):
+    '''
+    Decode a Wikipedia page title with safe_decode and replace its
+    underscores with spaces.
+    '''
+    decoded = safe_decode(title)
+    return decoded.replace(u'_', u' ')
+
+
+def normalize_wikipedia_url(url):
+    '''
+    Extract a normalized article title from a Wikipedia URL.
+
+    Two URL shapes are handled: a "title" query parameter
+    (e.g. .../index.php?title=Some_Page) and a title in the last path
+    segment (e.g. .../wiki/Some_Page).
+
+    Returns the title as produced by normalize_wikipedia_title, or None
+    when the last path segment is index.php / index.html and no title
+    parameter is present.
+    '''
+    # Split the URL before percent-decoding anything. Decoding first would
+    # turn an encoded "?", "&", "#" or "/" inside a title (%3F, %26, %23,
+    # %2F) into a URL delimiter and truncate the title.
+    parsed = urlparse.urlsplit(url)
+    if parsed.query:
+        # parse_qs percent-decodes the parameter values itself
+        params = urlparse.parse_qs(parsed.query)
+        if 'title' in params:
+            return normalize_wikipedia_title(params['title'][0])
+
+    # Take the last path segment while it is still encoded, then decode it
+    title = urllib.unquote_plus(parsed.path.rsplit('/', 1)[-1])
+    if title not in ('index.php', 'index.html'):
+        return normalize_wikipedia_title(title)
+
+    return None
+
+
+def normalize_name(name):
+    '''
+    Normalize punctuation in a place name for lookup: "&" becomes "and",
+    while hyphens and commas become spaces.
+    '''
+    # Order matters: ', ' has to be replaced before the bare ','
+    substitutions = (
+        ('&', 'and'),
+        ('-', ' '),
+        (', ', ' '),
+        (',', ' '),
+    )
+    for needle, replacement in substitutions:
+        name = name.replace(needle, replacement)
+    return name
+
+
+# (substring, replacement) pairs tried in order by create_geonames_tsv on
+# lowercased country names when no Wikipedia title matched the name as is.
+# NOTE(review): they are applied with str.replace, so the plain 'st' pair
+# also rewrites 'st' inside longer words - confirm that is intended.
+saint_replacements = [
+    ('st.', 'saint'),
+    ('st.', 'st'),
+    ('st', 'saint')
+]
+
+
+# The word "St", optionally followed by a period
+abbreviated_saint_regex = re.compile(r'\bSt(\.|\b)')
+
+
+def normalize_display_name(name):
+    '''
+    Return name with the abbreviation "St" / "St." expanded to "Saint"
+    and every "&" replaced by "and".
+    '''
+    expanded = abbreviated_saint_regex.sub('Saint', name)
+    return expanded.replace('&', 'and')
+
+
+def utf8_normalize(s, form='NFD'):
+    '''
+    Return the unicode string s in the given Unicode normalization form
+    (NFD, i.e. canonical decomposition, unless another form is passed).
+    '''
+    normalized = unicodedata.normalize(form, s)
+    return normalized
+
+
+def get_wikipedia_titles(db):
+    '''
+    Build a lookup of English Wikipedia titles linked from GeoNames.
+
+    Runs wikipedia_query against db and returns a dict-of-dicts:
+    normalized, lowercased title -> {geonames_id: is_preferred (0 or 1)}.
+    Links from which no non-blank title can be extracted are skipped.
+    '''
+    titles = defaultdict(dict)
+    cursor = db.execute(wikipedia_query)
+
+    # fetchmany returns an empty list once the cursor is exhausted
+    for batch in iter(lambda: cursor.fetchmany(BATCH_SIZE), []):
+        for url, geonames_id, is_preferred in batch:
+            title = normalize_wikipedia_url(safe_encode(url))
+            if title is None or not title.strip():
+                continue
+            key = utf8_normalize(normalize_name(title)).lower()
+            titles[key][geonames_id] = int(is_preferred or 0)
+
+    return titles
+
+
+def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
+    '''
+    Writes geonames.tsv using the specified db to the specified data directory
+
+    Rows are first written unsorted to geonames.tsv.tmp, one boundary type
+    at a time, and then sorted into geonames.tsv with the external `sort`
+    command; the temp file is deleted afterwards.
+
+    db:      open sqlite3 connection to the GeoNames database
+    out_dir: directory that receives geonames.tsv
+
+    Raises ValueError when no localized display name is found for a
+    country, and subprocess.CalledProcessError when `sort` fails.
+    '''
+    filename = os.path.join(out_dir, 'geonames.tsv')
+    temp_filename = filename + '.tmp'
+
+    # NOTE(review): the file is not closed if an exception is raised
+    # before f.close() below.
+    f = open(temp_filename, 'w')
+
+    # 'tsv_no_quote' is not a built-in csv dialect; presumably registered
+    # by geodata.csv_utils - TODO confirm
+    writer = csv.writer(f, 'tsv_no_quote')
+
+    # Helpers star-imported from geodata.i18n.languages and
+    # geodata.countries.country_names (presumably load the tables behind
+    # get_country_languages / country_localized_display_name)
+    init_languages()
+
+    init_country_names()
+
+    wiki_titles = get_wikipedia_titles(db)
+    logging.info('Fetched Wikipedia titles')
+
+    # Iterate over GeoNames boundary types from largest (country) to smallest (neighborhood)
+    for boundary_type, codes in geonames_admin_dictionaries.iteritems():
+        if boundary_type != boundary_types.COUNTRY:
+            predicate = 'where gn.feature_code in ({codes})'.format(
+                codes=','.join(['"{}"'.format(c) for c in codes])
+            )
+        else:
+            # The query for countries in GeoNames is somewhat non-trivial
+            predicate = 'where gn.geonames_id in (select geonames_id from countries)'
+
+        # Fills the {predicate} placeholder in both halves of the union
+        query = base_geonames_query.format(
+            predicate=predicate
+        )
+
+        cursor = db.execute(query)
+        i = 1
+        while True:
+            # Fetch rows in batches to save memory
+            batch = cursor.fetchmany(BATCH_SIZE)
+            if not batch:
+                break
+            rows = []
+            for row in batch:
+                row = list(row)
+                # Replace the -1 placeholder selected by the query
+                row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
+
+                language = row[LANGUAGE_INDEX]
+
+                country_code = row[COUNTRY_CODE_INDEX]
+
+                is_preferred = int(row[PREFERRED_INDEX] or 0)
+                is_historical = int(row[HISTORICAL_INDEX] or 0)
+
+                # lang_spoken is the value stored for this language among the
+                # country's languages (None when absent); lang_official is
+                # True only when the default lookup stores 1 for it
+                lang_spoken = get_country_languages(country_code.lower(), official=False).get(language, None)
+                lang_official = get_country_languages(country_code.lower()).get(language, None) == 1
+                null_language = not language.strip()
+
+                is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
+
+                alpha2_code = None
+                is_orig_name = False
+
+                if boundary_type == boundary_types.COUNTRY:
+                    alpha2_code = row[COUNTRY_CODE_INDEX]
+
+                    # True only for the row whose name is the GeoNames
+                    # canonical name and which has no language set
+                    is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
+                    # Set the canonical for countries to the local name, see country_official_name in country_names.py
+                    country_canonical = country_localized_display_name(alpha2_code.lower())
+                    if not country_canonical or not country_canonical.strip():
+                        raise ValueError('Could not get local canonical name for country code={}'.format(alpha2_code))
+                    row[CANONICAL_NAME_INDEX] = country_canonical
+
+                geonames_id = row[GEONAMES_ID_INDEX]
+
+                name = utf8_normalize(safe_decode(row[NAME_INDEX]))
+
+                # For non-postal codes, don't count
+                # (names consisting only of digits are skipped entirely)
+                if name.isdigit():
+                    continue
+
+                # Look up Wikipedia titles by the lowercased name first, then
+                # by its punctuation-normalized form
+                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), {}))
+
+                row[NAME_INDEX] = name
+
+                if boundary_type == boundary_types.COUNTRY:
+                    # For countries, also try the st./saint spelling variants
+                    # until one of them yields a match
+                    norm_name = normalize_name(name.lower())
+                    for s, repl in saint_replacements:
+                        if not wikipedia_entries:
+                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
+
+                wiki_row = []
+
+                # The title only counts when it links to this geonames id
+                have_wikipedia = geonames_id in wikipedia_entries
+                wiki_preferred = wikipedia_entries.get(geonames_id, 0)
+
+                '''
+                The following set of heuristics assigns a numerical value to a given name
+                alternative, such that in the case of ambiguous names, this value can be
+                used as part of the ranking function (as indeed it will be during sort).
+                The higher the value, the more likely the given entity resolution.
+                '''
+                # NOTE(review): the chain has no final else. If no branch
+                # matches (e.g. a non-preferred name whose lang_spoken value
+                # is 0 and which is not official), language_priority keeps
+                # the value from the previous row, or is unbound on the first
+                # row. Confirm the intended fallback.
+                if is_historical:
+                    # Historical names, unlikely to be used
+                    language_priority = 0
+                elif not null_language and language != 'abbr' and lang_spoken is None:
+                    # Name of a place in language not widely spoken e.g. Japanese name for a US toponym
+                    language_priority = 1
+                elif null_language and not is_preferred and not is_canonical:
+                    # Null-language alternate names not marked as preferred, dubious
+                    language_priority = 2
+                elif language == 'abbr' and not is_preferred:
+                    # Abbreviation, not preferred
+                    language_priority = 3
+                elif language == 'abbr' and is_preferred:
+                    # Abbreviation, preferred e.g. NYC, UAE
+                    language_priority = 4
+                elif lang_spoken and not lang_official and not is_preferred:
+                    # Non-preferred name but in a spoken (non-official) language
+                    language_priority = 5
+                elif lang_official == 1 and not is_preferred:
+                    # Name in an official language, not preferred
+                    language_priority = 6
+                elif null_language and not is_preferred and is_canonical:
+                    # Canonical name, may be overly official e.g. Islamic Republic of Pakistan
+                    language_priority = 7
+                elif is_preferred and not lang_official:
+                    # Preferred names, not an official language
+                    language_priority = 8
+                elif is_preferred and lang_official:
+                    # Official language preferred
+                    language_priority = 9
+
+                row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
+
+                if have_wikipedia:
+                    # Emit the row flagged with 1 (link exists) or 2 (link is
+                    # marked preferred). This copy is emitted instead of the
+                    # plain row (see "if not have_wikipedia" below).
+                    # NOTE(review): the copy is taken before population is
+                    # coerced to int below, so a NULL population is written
+                    # as an empty field here rather than 0.
+                    wiki_row = row[:]
+                    wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
+                    rows.append(map(encode_field, wiki_row))
+
+                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
+                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
+
+                # NOTE(review): have_normalized is assigned but never read
+                have_normalized = False
+
+                if is_orig_name:
+                    # Only set for countries: also emit the display form of
+                    # the name (St -> Saint, & -> and) when it differs
+                    canonical_row = wiki_row[:] if have_wikipedia else row[:]
+
+                    canonical_row_name = normalize_display_name(name)
+                    if canonical_row_name != name:
+                        canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
+                        have_normalized = True
+                        rows.append(map(encode_field, canonical_row))
+
+                if not have_wikipedia:
+                    rows.append(map(encode_field, row))
+
+                # Country names have more specialized logic
+                if boundary_type == boundary_types.COUNTRY:
+                    # From here on the lookups use the localized canonical
+                    # name rather than the row's own name
+                    wikipedia_entries = wiki_titles.get(canonical.lower(), {})
+
+                    canonical_row_name = normalize_display_name(canonical)
+
+                    canonical_row = row[:]
+
+                    if is_orig_name:
+                        canonical = safe_decode(canonical)
+                        canonical_row[NAME_INDEX] = safe_encode(canonical)
+
+                        # Same st./saint fallback as above, first on the
+                        # canonical name and then on its display form
+                        norm_name = normalize_name(canonical.lower())
+                        for s, repl in saint_replacements:
+                            if not wikipedia_entries:
+                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
+
+                        if not wikipedia_entries:
+                            norm_name = normalize_name(canonical_row_name.lower())
+                            for s, repl in saint_replacements:
+                                if not wikipedia_entries:
+                                    wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
+
+                        have_wikipedia = geonames_id in wikipedia_entries
+                        wiki_preferred = wikipedia_entries.get(geonames_id, 0)
+
+                        if have_wikipedia:
+                            canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
+
+                        # Emit the localized canonical name as its own row
+                        # unless it equals the name already written
+                        if (name != canonical):
+                            rows.append(map(encode_field, canonical_row))
+
+                    if canonical_row_name != canonical and canonical_row_name != name:
+                        canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
+                        rows.append(map(encode_field, canonical_row))
+
+                    # The country codes themselves are emitted as names with
+                    # priority 10, above every value assigned by the
+                    # heuristics
+                    if alpha2_code and is_orig_name:
+                        alpha2_row = row[:]
+                        alpha2_row[NAME_INDEX] = alpha2_code
+                        alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
+                        rows.append(map(encode_field, alpha2_row))
+
+                    # NOTE(review): the membership test uses country_alpha3_map
+                    # but the lookup uses country_code_alpha3_map. Neither is
+                    # defined in this file (both would have to come from a
+                    # star import); verify that both names exist.
+                    if alpha2_code.lower() in country_alpha3_map and is_orig_name:
+                        alpha3_row = row[:]
+                        alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code.lower()]
+                        alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
+                        rows.append(map(encode_field, alpha3_row))
+
+            writer.writerows(rows)
+            logging.info('Did {} batches'.format(i))
+            i += 1
+
+        cursor.close()
+        f.flush()
+
+    f.close()
+
+    logging.info('Sorting...')
+
+    # Run sort in the C locale
+    env = os.environ.copy()
+    env['LC_ALL'] = 'C'
+
+    # sort key positions are 1-based, hence the "+ 1" on every index;
+    # -u keeps one line per distinct combination of the keys listed here
+    command = ['sort', '-t\t', '-u', '--ignore-case',
+               '-k{0},{0}'.format(NAME_INDEX + 1),
+               # If there's a Wikipedia link to this name for the given id, sort first
+               '-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
+               # Language priority rules as above
+               '-k{0},{0}nr'.format(DUMMY_LANGUAGE_PRIORITY_INDEX + 1),
+               # Sort descending by population (basic proxy for relevance)
+               '-k{0},{0}nr'.format(POPULATION_INDEX + 1),
+               # group rows for the same geonames ID together
+               '-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
+               # preferred names come first within that grouping
+               '-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
+               # since uniquing is done on the sort key, add language
+               '-k{0},{0}'.format(LANGUAGE_INDEX + 1),
+               '-o', filename, temp_filename]
+
+    p = subprocess.Popen(command, env=env)
+
+    return_code = p.wait()
+    if return_code != 0:
+        raise subprocess.CalledProcessError(return_code, command)
+
+    os.unlink(temp_filename)
+
+
+def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR):
+    '''
+    Writes postal_codes.tsv using the specified db to the specified data directory
+
+    The rows returned by postal_codes_query are written unsorted to
+    postal_codes.tsv.tmp, sorted into postal_codes.tsv with the external
+    `sort` command (by postal code, then by descending country
+    population), and the temp file is deleted.
+
+    db:      open sqlite3 connection to the GeoNames database
+    out_dir: directory that receives postal_codes.tsv
+
+    Raises subprocess.CalledProcessError when `sort` fails.
+    '''
+    filename = os.path.join(out_dir, 'postal_codes.tsv')
+    temp_filename = filename + '.tmp'
+
+    # The context manager closes (and thereby flushes) the temp file before
+    # sort reads it, and also when a query or write raises.
+    with open(temp_filename, 'w') as f:
+        writer = csv.writer(f, 'tsv_no_quote')
+
+        cursor = db.execute(postal_codes_query)
+        try:
+            i = 1
+            while True:
+                # Fetch rows in batches to save memory
+                batch = cursor.fetchmany(BATCH_SIZE)
+                if not batch:
+                    break
+                rows = [
+                    map(encode_field, row)
+                    for row in batch
+                ]
+                writer.writerows(rows)
+                logging.info('Did {} batches'.format(i))
+                i += 1
+        finally:
+            cursor.close()
+
+    logging.info('Sorting...')
+
+    # sort key positions are 1-based, hence the "+ 1" on the indices
+    subprocess.check_call([
+        'sort', '-t\t', '--ignore-case',
+        '-k{0},{0}'.format(POSTAL_CODE_INDEX + 1),
+        '-k{0},{0}nr'.format(POSTAL_CODE_POP_INDEX + 1),
+        '-o', filename,
+        temp_filename
+    ])
+    os.unlink(temp_filename)
+
+# Generates a C header telling us the order of the fields as written
+# Path of the generated header: src/geonames_fields.h, three levels above
+# this script's directory
+GEONAMES_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                      'src', 'geonames_fields.h')
+
+# Header text: one "CONSTANT=index" enumerator per entry of geonames_fields,
+# followed by NUM_GEONAMES_FIELDS (the doubled braces are literal braces)
+GEONAMES_FIELDS_HEADER_FILE = '''enum geonames_fields {{
+    {fields},
+    NUM_GEONAMES_FIELDS
+}};
+'''.format(fields=''',
+    '''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(geonames_fields)]))
+
+
+def write_geonames_fields_header(filename=GEONAMES_FIELDS_HEADER):
+    '''
+    Write the generated geonames_fields enum to filename, replacing any
+    existing file.
+    '''
+    with open(filename, 'w') as header:
+        header.write(GEONAMES_FIELDS_HEADER_FILE)
+
+# Path of the generated header: src/postal_fields.h, three levels above
+# this script's directory
+POSTAL_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                    'src', 'postal_fields.h')
+
+# Header text: one "CONSTANT=index" enumerator per entry of
+# postal_code_fields, followed by NUM_POSTAL_FIELDS
+POSTAL_FIELDS_HEADER_FILE = '''enum gn_postal_fields {{
+    {fields},
+    NUM_POSTAL_FIELDS
+}};
+'''.format(fields=''',
+    '''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(postal_code_fields)]))
+
+
+def write_postal_fields_header(filename=POSTAL_FIELDS_HEADER):
+    '''
+    Write the generated gn_postal_fields enum to filename, replacing any
+    existing file.
+    '''
+    with open(filename, 'w') as header:
+        header.write(POSTAL_FIELDS_HEADER_FILE)
+
+
+if __name__ == '__main__':
+    # Handle argument parsing here
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--db',
+                        default=DEFAULT_GEONAMES_DB_PATH,
+                        help='SQLite db file')
+    parser.add_argument('-o', '--out',
+                        default=DEFAULT_DATA_DIR, help='output directory')
+    args = parser.parse_args()
+    db = sqlite3.connect(args.db)
+
+    # Close the connection even when one of the export steps raises
+    try:
+        create_geonames_tsv(db, args.out)
+        create_postal_codes_tsv(db, args.out)
+        write_geonames_fields_header()
+        write_postal_fields_header()
+    finally:
+        db.close()
--- a/scripts/geodata/geonames/db.py
+++ b/scripts/geodata/geonames/db.py
@@ -0,0 +1,30 @@
+import sqlite3
+from collections import defaultdict
+
+
+class GeoNamesDB(object):
+    '''
+    Small wrapper around a GeoNames SQLite database file.
+    '''
+
+    # Alternate names for one geonames_id, excluding historic and colloquial
+    # names and postal codes; ordered by language, preferred names first
+    names_query = '''
+    select iso_language, alternate_name,
+    is_preferred_name, is_short_name
+    from alternate_names
+    where geonames_id = ?
+    and is_historic != '1'
+    and is_colloquial != '1'
+    and iso_language != 'post'
+    order by iso_language, cast(is_preferred_name as integer) desc, cast(is_short_name as integer)
+    '''
+
+    def __init__(self, filename):
+        '''Open a connection to the SQLite database at filename.'''
+        self.db = sqlite3.connect(filename)
+
+    def query(self, query, *params):
+        '''
+        Execute the given SQL with params bound to its "?" placeholders
+        and return the resulting cursor.
+        '''
+        # Run the statement that was passed in. Previously names_query was
+        # executed no matter which query the caller supplied.
+        return self.db.execute(query, params)
+
+    def get_alternate_names(self, geonames_id):
+        '''
+        Return the alternate names of geonames_id as a dict mapping
+        iso_language to a list of (name, is_preferred, is_short) tuples,
+        with the two flags converted to ints (empty/NULL becomes 0).
+        '''
+        cursor = self.query(self.names_query, geonames_id)
+        language_names = defaultdict(list)
+        for language, name, is_preferred, is_short in cursor:
+            language_names[language].append((name,
+                                             int(is_preferred or 0),
+                                             int(is_short or 0)))
+        return dict(language_names)
--- a/scripts/geodata/geonames/geonames_sqlite.py
+++ b/scripts/geodata/geonames/geonames_sqlite.py
@@ -0,0 +1,333 @@
+import os
+import shutil
+import sqlite3
+
+import tempfile
+import urlparse
+import urllib2
+import subprocess
+
+import logging
+
+import argparse
+
+import csv
+import sys
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
+
+from geodata.encoding import safe_decode
+from geodata.geonames.paths import *
+
+from geodata.file_utils import *
+from geodata.log import *
+
+from itertools import islice, chain
+
+# Logging helper from geodata.log, pointed at stderr
+# (presumably installs a handler - TODO confirm)
+log_to_file(sys.stderr)
+logger = logging.getLogger('geonames.sqlite')
+
+# Root of the GeoNames export server
+BASE_URL = 'http://download.geonames.org/export/'
+
+# Gazetteer dumps live under dump/
+DUMP_URL = urlparse.urljoin(BASE_URL, 'dump/')
+ALL_COUNTRIES_ZIP_FILE = 'allCountries.zip'
+HIERARCHY_ZIP_FILE = 'hierarchy.zip'
+ALTERNATE_NAMES_ZIP_FILE = 'alternateNames.zip'
+
+# Postal code data lives under zip/ and uses the same allCountries.zip name
+ZIP_URL = urlparse.urljoin(BASE_URL, 'zip/')
+
+GEONAMES_DUMP_FILES = (ALL_COUNTRIES_ZIP_FILE,
+                       HIERARCHY_ZIP_FILE,
+                       ALTERNATE_NAMES_ZIP_FILE)
+
+# base_url, local_dir, is_gzipped, local_filename
+# (one tuple per file; the postal codes archive goes into the 'zip'
+# subdirectory, which keeps it apart from the gazetteer allCountries.zip)
+
+
+GEONAMES_FILES = [(DUMP_URL, '', True, ALL_COUNTRIES_ZIP_FILE),
+                  (DUMP_URL, '', True, HIERARCHY_ZIP_FILE),
+                  (DUMP_URL, '', True, ALTERNATE_NAMES_ZIP_FILE),
+                  (ZIP_URL, 'zip', True, ALL_COUNTRIES_ZIP_FILE),
+                  ]
+
+
+def download_file(url, dest):
+    '''
+    Download url to the local path dest by invoking wget.
+
+    Raises subprocess.CalledProcessError when wget exits non-zero.
+    '''
+    logger.info('Downloading file from {}'.format(url))
+    wget_command = ['wget', url, '-O', dest]
+    subprocess.check_call(wget_command)
+
+
+def admin_ddl(admin_level):
+    '''
+    Build the DDL statements for the admin{N}_codes table of one level.
+
+    The table has geonames_id, code and name columns, a country_code
+    column, and one admin{i}_code column for every parent level
+    1 .. admin_level - 1.
+
+    Returns a 3-tuple of SQL strings: the CREATE TABLE statement followed
+    by CREATE INDEX statements on code and on geonames_id.
+    '''
+    # range (rather than the Python 2-only xrange) produces the same
+    # columns and also works on Python 3
+    columns = ['country_code TEXT'] + \
+              ['admin{}_code TEXT'.format(i)
+               for i in range(1, admin_level)]
+
+    create = '''
+    CREATE TABLE admin{level}_codes (
+    geonames_id INT,
+    code TEXT,
+    name TEXT,
+    {fields}
+    )'''.format(level=admin_level,
+                fields=''',
+    '''.join(columns))
+
+    indices = (
+        '''CREATE INDEX admin{}_code_index ON
+        admin{}_codes (code)'''.format(admin_level, admin_level),
+        '''CREATE INDEX admin{}_gn_id_index ON
+        admin{}_codes (geonames_id)'''.format(admin_level, admin_level),
+    )
+
+    return (create, ) + indices
+
+# Table name -> tuple of DDL statements: the CREATE TABLE statement first,
+# followed by the CREATE INDEX statements for that table
+geonames_ddl = {
+    'geonames': (
+        '''CREATE TABLE geonames (
+        geonames_id INT PRIMARY KEY,
+        name TEXT,
+        ascii_name TEXT,
+        alternate_names TEXT,
+        latitude DOUBLE,
+        longitude DOUBLE,
+        feature_class TEXT,
+        feature_code TEXT,
+        country_code TEXT,
+        cc2 TEXT,
+        admin1_code TEXT,
+        admin2_code TEXT,
+        admin3_code TEXT,
+        admin4_code TEXT,
+        population LONG DEFAULT 0,
+        elevation INT,
+        dem INT,
+        timezone TEXT,
+        modification_date TEXT)''',
+        '''CREATE INDEX feature_code ON
+        geonames (feature_code)''',
+        '''CREATE INDEX country_code ON
+        geonames (country_code)''',
+        '''CREATE INDEX admin_codes ON
+        geonames (country_code, admin1_code, admin2_code, admin3_code, admin4_code)''',
+    ),
+
+    'alternate_names': (
+        '''CREATE TABLE alternate_names (
+        alternate_name_id INT PRIMARY KEY,
+        geonames_id INT,
+        iso_language TEXT,
+        alternate_name TEXT,
+        is_preferred_name BOOLEAN DEFAULT 0,
+        is_short_name BOOLEAN DEFAULT 0,
+        is_colloquial BOOLEAN DEFAULT 0,
+        is_historic BOOLEAN DEFAULT 0)''',
+        '''CREATE INDEX geonames_id_index ON
+        alternate_names (geonames_id)''',
+        '''CREATE INDEX geonames_id_alt_name_index ON
+        alternate_names(geonames_id, alternate_name)''',
+    ),
+
+    'hierarchy': (
+        '''CREATE TABLE hierarchy (
+        parent_id INT,
+        child_id INT,
+        type TEXT
+        );''',
+        '''CREATE INDEX parent_child_index ON
+        hierarchy (parent_id, child_id)''',
+        '''CREATE INDEX child_parent_index ON
+        hierarchy (child_id, parent_id)''',
+    ),
+
+    'postal_codes': (
+        '''CREATE TABLE postal_codes (
+        country_code TEXT,
+        postal_code TEXT,
+        place_name TEXT,
+        admin1 TEXT,
+        admin1_code TEXT,
+        admin2 TEXT,
+        admin2_code TEXT,
+        admin3 TEXT,
+        admin3_code TEXT,
+        latitude DOUBLE,
+        longitude DOUBLE,
+        accuracy INT
+        )''',
+        '''CREATE INDEX post_code_index ON
+        postal_codes (country_code, postal_code)''',
+        '''CREATE INDEX postal_code_admins ON
+        postal_codes (country_code, admin1_code, admin2_code, admin3_code)''',
+    ),
+    # The admin code tables share one generated layout, see admin_ddl
+    'admin1_codes': admin_ddl(1),
+    'admin2_codes': admin_ddl(2),
+    'admin3_codes': admin_ddl(3),
+    'admin4_codes': admin_ddl(4),
+
+}
+
+# (local_dir, filename) -> name of the table that file's rows belong to.
+# The directory is part of the key because the gazetteer and the postal
+# code archives are both called allCountries.zip.
+geonames_file_table_map = {
+    ('', ALL_COUNTRIES_ZIP_FILE): 'geonames',
+    ('', ALTERNATE_NAMES_ZIP_FILE): 'alternate_names',
+    ('', HIERARCHY_ZIP_FILE): 'hierarchy',
+    ('zip', ALL_COUNTRIES_ZIP_FILE): 'postal_codes',
+}
+
+
+country_codes_create_table = (
+    'drop table if exists country_codes',
+    '''
+    create table country_codes as
+    select distinct country_code from geonames
+    where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS', 'TERR')
+    ''',
+)
+
+proper_countries_create_table = (
+    'drop table if exists proper_countries',
+    '''
+    create table proper_countries as
+    select * from geonames
+    where feature_code in ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
+    and country_code in (select country_code from country_codes)
+    ''',
+)
+
+territories_create_table = (
+    'drop table if exists territories',
+    '''
+    create table territories as
+    select * from geonames where feature_code = 'TERR'
+    and country_code not in (select country_code from proper_countries);
+    ''',
+)
+
+countries_create_table = (
+    'drop table if exists countries',
+    '''
+    create table countries as
+    select * from proper_countries
+    union
+    select * from territories;
+    ''',
+    'create index country_geonames_id on countries (geonames_id)',
+    'create index conntry_country_code on countries (country_code)',
+)
+
+country_alises_create_table = (
+    'drop table if exists country_aliases',
+    '''
+    create table country_aliases as
+    select name, country_code
+    from countries
+    union
+    select alternate_name, country_code
+    from alternate_names an
+    join countries c
+        on c.geonames_id = an.geonames_id
+    where alternate_name != ''
+    and iso_language not in ('doi','faac','iata',
+                             'icao','link','post','tcid')
+    '''
+)
+
+country_table_create_statements = list(chain(country_codes_create_table,
+                                             proper_countries_create_table,
+                                             territories_create_table,
+                                             countries_create_table,
+                                             country_alises_create_table))
+
+
+def create_table(conn, table):
+    cursor = conn.cursor()
+    create_statements = geonames_ddl[table]
+    cursor.execute('DROP TABLE IF EXISTS {}'.format(table))
+    for statement in create_statements:
+        cursor.execute(statement)
+    conn.commit()
+
+
+def batch_iter(iterable, batch_size):
+    source_iter = iter(iterable)
+    while True:
+        batch = list(islice(source_iter, batch_size))
+        if len(batch) > 0:
+            yield batch
+        else:
+            return
+
+
+def populate_admin_table(conn, admin_level):
+    logging.info('Doing admin level {}'.format(admin_level))
+
+    columns = ['geonames_id',
+               'admin{}_code'.format(admin_level),
+               'name',
+               'country_code']
+    columns.extend(['admin{}_code'.format(i)
+                    for i in xrange(1, admin_level)])
+
+    admin_insert_statement = '''
+    insert into "admin{}_codes"
+    select {}
+    from geonames
+    where feature_code = "ADM{}"
+    '''.format(admin_level, ','.join(columns), admin_level)
+
+    conn.execute(admin_insert_statement)
+    conn.commit()
+
+    logging.info('Done with admin level {}'.format(admin_level))
+
+
+def import_geonames_table(conn, table, f, batch_size=2000):
+    # escape the brackets around the values format string so we can use later
+    statement = 'INSERT INTO "{}" VALUES ({{}})'.format(table)
+    cursor = conn.cursor()
+    for i, batch in enumerate(batch_iter(f, batch_size)):
+        num_cols = len(batch[0])
+        cursor.executemany(statement.format(','.join(['?'] * num_cols)), batch)
+        conn.commit()
+        cursor = conn.cursor()
+        logging.info('imported {} batches ({} records)'.format(i + 1, (i + 1) * batch_size))
+    cursor.close()
+
+
+def create_geonames_sqlite_db(temp_dir, db_file=DEFAULT_GEONAMES_DB_PATH):
+    conn = sqlite3.connect(db_file)
+    logging.info('Created database at {}'.format(db_file))
+    for url, directory, is_gzipped, filename in GEONAMES_FILES:
+        table = geonames_file_table_map[(directory, filename)]
+        create_table(conn, table)
+        full_url = urlparse.urljoin(url, filename)
+        dest_dir = os.path.join(temp_dir, directory)
+        ensure_dir(dest_dir)
+        dest_file = os.path.join(dest_dir, filename)
+        download_file(full_url, dest_file)
+        if is_gzipped:
+            unzip_file(dest_file, dest_dir)
+            filename = dest_file.replace('.zip', '.txt')
+        reader = csv.reader(open(filename), delimiter='\t', quotechar=None)
+        lines = (map(safe_decode, line) for line in reader)
+        import_geonames_table(conn, table, lines)
+    logging.info('Creating countries tables')
+    for statement in country_table_create_statements:
+        conn.execute(statement)
+        conn.commit()
+    logging.info('Creating admin tables')
+    for admin_level in xrange(1, 5):
+        create_table(conn, 'admin{}_codes'.format(admin_level))
+        populate_admin_table(conn, admin_level)
+    conn.close()
+
+
+if __name__ == '__main__':
+    # Handle argument parsing here
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-t', '--temp-dir',
+                        default=tempfile.gettempdir(),
+                        help='Temporary work directory')
+    parser.add_argument('-o', '--out',
+                        default=DEFAULT_GEONAMES_DB_PATH,
+                        help='SQLite3 db filename')
+    args = parser.parse_args()
+    create_geonames_sqlite_db(args.temp_dir, args.out)
--- a/scripts/geodata/geonames/paths.py
+++ b/scripts/geodata/geonames/paths.py
@@ -0,0 +1,9 @@
+import os
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+
+GEONAMES_DB_NAME = 'geonames.db'
+
+DEFAULT_GEONAMES_DB_PATH = os.path.join(this_dir, os.path.pardir,
+                                        os.path.pardir, os.path.pardir,
+                                        'data', 'geonames', GEONAMES_DB_NAME)