import argparse
import csv
import logging
import operator
import os
import re
import sqlite3
import subprocess
import sys
import unicodedata
import urllib
import urlparse

from collections import defaultdict

import pycountry
import requests
from lxml import etree

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.file_utils import *
from geodata.encoding import safe_encode, safe_decode
from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.log import log_to_file

multispace_regex = re.compile(r'[\s]+')

csv.register_dialect('tsv_no_quote', delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')


def encode_field(value):
    # Collapse whitespace runs to a single space so embedded tabs/newlines
    # can't break the TSV format
    return multispace_regex.sub(' ', safe_encode(value if value is not None else ''))


log_to_file(sys.stderr)

DEFAULT_DATA_DIR = os.path.join(this_dir, os.path.pardir, os.path.pardir,
                                os.path.pardir, 'data', 'geonames')

COUNTRY_FEATURE_CODES = ('PCL', 'PCLI', 'PCLIX', 'PCLD', 'PCLF', 'PCLS')
CONTINENT_FEATURE_CODES = ('CONT',)

ADMIN_1_FEATURE_CODES = ('ADM1',)
ADMIN_2_FEATURE_CODES = ('ADM2',)
ADMIN_3_FEATURE_CODES = ('ADM3',)
ADMIN_4_FEATURE_CODES = ('ADM4',)
OTHER_ADMIN_FEATURE_CODES = ('ADM5',)
ADMIN_OTHER_FEATURE_CODES = ('ADMD',)

POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
                                 'PPLC', 'PPLCH', 'PPLF', 'PPLG', 'PPLL',
                                 'PPLR', 'PPLS', 'STLMT')
NEIGHBORHOOD_FEATURE_CODES = ('PPLX',)


CLDR_ENGLISH_PATH = os.path.join(CLDR_DIR, 'common', 'main', 'en.xml')


class boundary_types:
    COUNTRY = 0
    ADMIN1 = 1
    ADMIN2 = 2
    ADMIN3 = 3
    ADMIN4 = 4
    ADMIN_OTHER = 5
    LOCALITY = 6
    NEIGHBORHOOD = 7


geonames_admin_dictionaries = {
    boundary_types.COUNTRY: COUNTRY_FEATURE_CODES,
    boundary_types.ADMIN1: ADMIN_1_FEATURE_CODES,
    boundary_types.ADMIN2: ADMIN_2_FEATURE_CODES,
    boundary_types.ADMIN3: ADMIN_3_FEATURE_CODES,
    boundary_types.ADMIN4: ADMIN_4_FEATURE_CODES,
    boundary_types.ADMIN_OTHER: ADMIN_OTHER_FEATURE_CODES,
    boundary_types.LOCALITY: POPULATED_PLACE_FEATURE_CODES,
    boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
}

# Placeholder columns, overwritten in post-processing after the query runs
DUMMY_BOUNDARY_TYPE = '-1 as type'
DUMMY_HAS_WIKIPEDIA_ENTRY = '0 as has_wikipedia_entry'


class GeonamesField(object):
    def __init__(self, name, c_constant, default=None, is_dummy=False):
        self.name = name
        self.c_constant = c_constant
        self.default = default
        self.is_dummy = is_dummy


geonames_fields = [
    # Field if alternate_names present, default field name if not, C header constant
    GeonamesField('alternate_name', 'GEONAMES_NAME', default='gn.name'),
    GeonamesField('gn.geonames_id as geonames_id', 'GEONAMES_ID'),
    GeonamesField('gn.name as canonical', 'GEONAMES_CANONICAL'),
    GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True),
    GeonamesField(DUMMY_HAS_WIKIPEDIA_ENTRY, 'GEONAMES_HAS_WIKIPEDIA_ENTRY', is_dummy=True),
    GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"),
    GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'),
    GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'),
    GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'),
    GeonamesField('is_historic', 'GEONAMES_IS_HISTORICAL', default='0'),
    GeonamesField('population', 'GEONAMES_POPULATION'),
    GeonamesField('latitude', 'GEONAMES_LATITUDE'),
    GeonamesField('longitude', 'GEONAMES_LONGITUDE'),
    GeonamesField('feature_code', 'GEONAMES_FEATURE_CODE'),
    GeonamesField('gn.country_code as country_code', 'GEONAMES_COUNTRY_CODE'),
    GeonamesField('gn.admin1_code as admin1_code', 'GEONAMES_ADMIN1_CODE'),
    GeonamesField('a1.geonames_id as a1_gn_id', 'GEONAMES_ADMIN1_ID'),
    GeonamesField('gn.admin2_code as admin2_code', 'GEONAMES_ADMIN2_CODE'),
    GeonamesField('a2.geonames_id as a2_gn_id', 'GEONAMES_ADMIN2_ID'),
    GeonamesField('gn.admin3_code as admin3_code', 'GEONAMES_ADMIN3_CODE'),
    GeonamesField('a3.geonames_id as a3_gn_id', 'GEONAMES_ADMIN3_ID'),
    GeonamesField('gn.admin4_code as admin4_code', 'GEONAMES_ADMIN4_CODE'),
    GeonamesField('a4.geonames_id as a4_gn_id', 'GEONAMES_ADMIN4_ID'),
]


def _field_index(c_constant):
    # Position of a field in geonames_fields (and thus in each output row)
    return next(i for i, f in enumerate(geonames_fields)
                if f.c_constant == c_constant)


DUMMY_BOUNDARY_TYPE_INDEX = _field_index('GEONAMES_BOUNDARY_TYPE')
DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX = _field_index('GEONAMES_HAS_WIKIPEDIA_ENTRY')
GEONAMES_ID_INDEX = _field_index('GEONAMES_ID')
LANGUAGE_INDEX = _field_index('GEONAMES_ISO_LANGUAGE')
CANONICAL_NAME_INDEX = _field_index('GEONAMES_CANONICAL')
NAME_INDEX = _field_index('GEONAMES_NAME')
COUNTRY_CODE_INDEX = _field_index('GEONAMES_COUNTRY_CODE')
POPULATION_INDEX = _field_index('GEONAMES_POPULATION')
PREFERRED_INDEX = _field_index('GEONAMES_IS_PREFERRED_NAME')
HISTORICAL_INDEX = _field_index('GEONAMES_IS_HISTORICAL')


geonames_admin_joins = '''
left join admin1_codes a1
    on a1.code = gn.admin1_code
    and a1.country_code = gn.country_code
left join admin2_codes a2
    on a2.code = gn.admin2_code
    and a2.admin1_code = gn.admin1_code
    and a2.country_code = gn.country_code
left join admin3_codes a3
    on a3.code = gn.admin3_code
    and a3.admin1_code = gn.admin1_code
    and a3.admin2_code = gn.admin2_code
    and a3.country_code = gn.country_code
left join admin4_codes a4
    on a4.code = gn.admin4_code
    and a4.admin1_code = gn.admin1_code
    and a4.admin2_code = gn.admin2_code
    and a4.admin3_code = gn.admin3_code
    and a4.country_code = gn.country_code
'''

# Canonical names are stored in the geonames table, with alternates
# stored in a separate table. A UNION ALL query captures them all.

base_geonames_query = '''
select {geonames_fields}
from geonames gn
{admin_joins}
{{predicate}}
union all
select {alt_name_fields}
from geonames gn
join alternate_names an
    on an.geonames_id = gn.geonames_id
    and iso_language not in ('doi', 'faac', 'iata',
                             'icao', 'link', 'post', 'tcid')
    and an.alternate_name != gn.name
{admin_joins}
{{predicate}}
'''.format(
    geonames_fields=', '.join((f.name if f.default is None else
                               '{} as {}'.format(f.default, f.name)
                               for f in geonames_fields)),
    alt_name_fields=', '.join((f.name for f in geonames_fields)),
    admin_joins=geonames_admin_joins
)
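
# For reference, the canonical-name branch of the formatted query comes out
# roughly like this (a sketch, not the verbatim generated SQL):
#
#   select gn.name as alternate_name, gn.geonames_id as geonames_id,
#          gn.name as canonical, -1 as type, 0 as has_wikipedia_entry,
#          '' as iso_language, 0 as is_preferred_name, ...
#   from geonames gn
#   left join admin1_codes a1 ...
#   {predicate}
#
# Wherever the alternate_names table would normally supply a column, the
# field's default is substituted and aliased to the same name, so both
# halves of the UNION ALL produce identical column lists.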

IGNORE_COUNTRY_POSTAL_CODES = set([
    'AR',  # GeoNames has pre-1999 postal codes
])

postal_code_fields = [
    GeonamesField('postal_code', 'GN_POSTAL_CODE'),
    GeonamesField('p.country_code as country_code', 'GN_POSTAL_COUNTRY_CODE'),
    GeonamesField('n.geonames_id as containing_geoname_id', 'GN_POSTAL_CONTAINING_GEONAME_ID'),
    GeonamesField('group_concat(distinct a1.geonames_id) admin1_ids', 'GN_POSTAL_ADMIN1_IDS'),
    GeonamesField('group_concat(distinct a2.geonames_id) admin2_ids', 'GN_POSTAL_ADMIN2_IDS'),
    GeonamesField('group_concat(distinct a3.geonames_id) admin3_ids', 'GN_POSTAL_ADMIN3_IDS'),
]

postal_codes_query = '''
select
{fields}
from postal_codes p
left join (
    select
    gn.geonames_id,
    alternate_name,
    country_code,
    gn.name
    from alternate_names an
    join geonames gn
        on an.geonames_id = gn.geonames_id
    where iso_language = 'post'
) as n
    on p.postal_code = n.alternate_name
    and p.country_code = n.country_code
left join admin1_codes a1
    on a1.code = p.admin1_code
    and p.country_code = a1.country_code
left join admin2_codes a2
    on a2.code = p.admin2_code
    and a2.admin1_code = p.admin1_code
    and a2.country_code = p.country_code
left join admin3_codes a3
    on a3.code = p.admin3_code
    and a3.admin1_code = p.admin1_code
    and a3.admin2_code = p.admin2_code
    and a3.country_code = p.country_code
where p.country_code not in ({exclude_country_codes})
group by postal_code, p.country_code
'''.format(
    fields=','.join([f.name for f in postal_code_fields]),
    exclude_country_codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
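
# Each result row is one (postal_code, country_code) pair; the group_concat
# aggregates collapse multiple matching admin areas into comma-separated id
# lists, in the column order defined by postal_code_fields above. A purely
# hypothetical row, for illustration:
#
#   ('10001', 'US', 5128581, '5128638', None, None)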

# English Wikipedia links only. This query is executed verbatim (never
# %-formatted), so plain SQL LIKE wildcards are used.
wikipedia_query = '''
select alternate_name, geonames_id, is_preferred_name
from alternate_names
where iso_language = 'link'
and alternate_name like '%en.wikipedia%'
order by alternate_name, is_preferred_name
'''

BATCH_SIZE = 2000


IGNORE_COUNTRIES = set(['ZZ'])

COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS'])
COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL'])


def cldr_country_names(filename=CLDR_ENGLISH_PATH):
    xml = etree.parse(filename)

    country_names = {}

    for territory in xml.xpath('*//territories/*'):
        country_code = territory.attrib['type']
        # Skip the unknown-region code and CLDR's numeric region codes
        # like 001 (World) or 419 (Latin America)
        if country_code in IGNORE_COUNTRIES or country_code.isdigit():
            continue
        elif country_code in COUNTRY_USE_SHORT_NAME and territory.attrib.get('alt') != 'short':
            continue
        elif country_code in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt') != 'variant':
            continue
        elif (country_code not in COUNTRY_USE_SHORT_NAME and
              country_code not in COUNTRY_USE_VARIANT_NAME and
              territory.attrib.get('alt')):
            continue

        country_names[country_code] = safe_encode(territory.text)

    return country_names
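
# Minimal usage sketch (the exact strings depend on the CLDR release):
#
#   >>> names = cldr_country_names()
#   >>> names['US']
#   'United States'
#   >>> 'ZZ' in names   # unknown-region code is skipped
#   False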


wiki_paren_regex = re.compile(r'(.*)[\s]*\(.*?\)[\s]*')


def normalize_wikipedia_title(title):
    return safe_decode(title).replace(u'_', u' ')


def normalize_wikipedia_url(url):
    url = urllib.unquote_plus(url)

    parsed = urlparse.urlsplit(url)
    if parsed.query:
        # e.g. /w/index.php?title=Some_Title
        params = urlparse.parse_qs(parsed.query)
        if 'title' in params:
            return normalize_wikipedia_title(params['title'][0])

    # e.g. /wiki/Some_Title
    title = parsed.path.rsplit('/', 1)[-1]
    if title not in ('index.php', 'index.html'):
        return normalize_wikipedia_title(title)

    return None
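
# Illustrative examples (hypothetical inputs; behavior follows from the
# code above):
#
#   normalize_wikipedia_url('http://en.wikipedia.org/wiki/New_York_City')
#       => u'New York City'
#   normalize_wikipedia_url('http://en.wikipedia.org/w/index.php?title=Tokyo')
#       => u'Tokyo'
#   normalize_wikipedia_url('http://en.wikipedia.org/w/index.php')
#       => None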


def normalize_name(name):
    name = name.replace('&', 'and')
    name = name.replace('-', ' ')
    name = name.replace(', ', ' ')
    name = name.replace(',', ' ')
    return name


# (pattern, replacement) pairs tried in order when looking up titles
saint_replacements = [
    ('st.', 'saint'),
    ('st.', 'st'),
    ('st', 'saint'),
]


abbreviated_saint_regex = re.compile(r'\bSt(\.|\b)')


def normalize_display_name(name):
    return abbreviated_saint_regex.sub('Saint', name).replace('&', 'and')


def utf8_normalize(s, form='NFD'):
    return unicodedata.normalize(form, s)
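
# NFD decomposes precomposed characters: u'\xe9' ('é') becomes u'e' plus
# U+0301 (combining acute accent), so names that differ only in Unicode
# composition reduce to identical byte sequences before they're written.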


def get_wikipedia_titles(db):
    d = defaultdict(list)

    cursor = db.execute(wikipedia_query)

    while True:
        batch = cursor.fetchmany(BATCH_SIZE)
        if not batch:
            break

        for (url, geonames_id, is_preferred) in batch:
            title = normalize_wikipedia_url(safe_encode(url))
            if title is not None and title.strip():
                title = utf8_normalize(normalize_name(title))
                d[title.lower()].append((geonames_id, int(is_preferred or 0)))

    # Preferred links sort first for each title
    return {title: sorted(values, key=operator.itemgetter(1), reverse=True)
            for title, values in d.iteritems()}
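
# The returned dict maps a normalized, lowercased title to candidate
# (geonames_id, is_preferred) pairs, e.g. (hypothetical data):
#
#   {u'new york city': [(5128581, 1)],
#    u'springfield': [(4409896, 1), (4951788, 0)]}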


def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
    filename = os.path.join(out_dir, 'geonames.tsv')
    temp_filename = filename + '.tmp'

    f = open(temp_filename, 'w')

    writer = csv.writer(f, 'tsv_no_quote')

    country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries}
    country_alpha2 = set([c.alpha2 for c in pycountry.countries])

    country_names = cldr_country_names()

    wiki_titles = get_wikipedia_titles(db)
    logging.info('Fetched Wikipedia titles')

    for boundary_type, codes in geonames_admin_dictionaries.iteritems():
        if boundary_type != boundary_types.COUNTRY:
            # String literals in SQL use single quotes
            predicate = 'where gn.feature_code in ({codes})'.format(
                codes=','.join(["'{}'".format(c) for c in codes])
            )
        else:
            # The query for countries in GeoNames is somewhat non-trivial
            predicate = 'where gn.geonames_id in (select geonames_id from countries)'

        query = base_geonames_query.format(
            predicate=predicate
        )

        cursor = db.execute(query)
        i = 1
        while True:
            batch = cursor.fetchmany(BATCH_SIZE)
            if not batch:
                break
            rows = []
            for row in batch:
                row = list(row)
                row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type

                alpha2_code = None
                is_orig_name = False

                if boundary_type == boundary_types.COUNTRY:
                    alpha2_code = row[COUNTRY_CODE_INDEX]

                    is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
                    row[CANONICAL_NAME_INDEX] = country_names[row[COUNTRY_CODE_INDEX]]

                geonames_id = row[GEONAMES_ID_INDEX]

                name = utf8_normalize(safe_decode(row[NAME_INDEX]))
                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)

                have_wikipedia = False

                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), []))

                row[NAME_INDEX] = name

                if boundary_type == boundary_types.COUNTRY:
                    norm_name = normalize_name(name.lower())
                    for s, repl in saint_replacements:
                        if not wikipedia_entries:
                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])

                wiki_row = []

                for gid, is_preferred in wikipedia_entries:
                    if gid == geonames_id:
                        wiki_row = row[:]
                        wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
                        rows.append(map(encode_field, wiki_row))
                        have_wikipedia = True
                        break

                have_normalized = False

                if is_orig_name:
                    canonical_row = wiki_row[:] if have_wikipedia else row[:]

                    canonical_row_name = normalize_display_name(name)
                    if canonical_row_name != name:
                        canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
                        have_normalized = True
                        rows.append(map(encode_field, canonical_row))

                if not have_wikipedia:
                    rows.append(map(encode_field, row))

                if boundary_type == boundary_types.COUNTRY:
                    wikipedia_entries = wiki_titles.get(canonical.lower(), [])

                    canonical_row_name = normalize_display_name(canonical)

                    canonical_row = row[:]

                    if is_orig_name:
                        canonical = safe_decode(canonical)
                        canonical_row[NAME_INDEX] = safe_encode(canonical)

                    norm_name = normalize_name(canonical.lower())
                    for s, repl in saint_replacements:
                        if not wikipedia_entries:
                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])

                    if not wikipedia_entries:
                        norm_name = normalize_name(canonical_row_name.lower())
                        for s, repl in saint_replacements:
                            if not wikipedia_entries:
                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])

                    for gid, is_preferred in wikipedia_entries:
                        if gid == geonames_id:
                            have_wikipedia = True
                            canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
                            break

                    if name != canonical:
                        rows.append(map(encode_field, canonical_row))

                    if canonical_row_name != canonical and canonical_row_name != name:
                        canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
                        rows.append(map(encode_field, canonical_row))

                # Also index countries under their ISO alpha-2/alpha-3 codes
                # (alpha2_code is only set for country rows)
                if alpha2_code and is_orig_name:
                    alpha2_row = row[:]
                    alpha2_row[NAME_INDEX] = alpha2_code
                    rows.append(map(encode_field, alpha2_row))

                if alpha2_code in country_code_alpha3_map and is_orig_name:
                    alpha3_row = row[:]
                    alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code]
                    rows.append(map(encode_field, alpha3_row))

            writer.writerows(rows)
            logging.info('Did {} batches'.format(i))
            i += 1

        cursor.close()
        f.flush()

    f.close()

    logging.info('Sorting...')
    subprocess.check_call(['sort', '-t\t', '-u', '--ignore-case',
                           '-k{0},{0}'.format(NAME_INDEX + 1),
                           # If there's a Wikipedia link to this name for the given id, sort first
                           '-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
                           # Historical entries should be sorted last
                           '-k{0},{0}n'.format(HISTORICAL_INDEX + 1),
                           # Sort descending by population (a basic proxy for relevance)
                           '-k{0},{0}nr'.format(POPULATION_INDEX + 1),
                           # Group rows for the same geonames ID together
                           '-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
                           # Preferred names come first within that grouping
                           '-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
                           # Since uniquing is done on the sort key, add language
                           '-k{0},{0}'.format(LANGUAGE_INDEX + 1),
                           '-o', filename, temp_filename])
    os.unlink(temp_filename)


def create_postal_codes_tsv(db, out_dir=DEFAULT_DATA_DIR):
    filename = os.path.join(out_dir, 'postal_codes.tsv')
    temp_filename = filename + '.tmp'
    f = open(temp_filename, 'w')

    writer = csv.writer(f, 'tsv_no_quote')

    cursor = db.execute(postal_codes_query)

    i = 1
    while True:
        batch = cursor.fetchmany(BATCH_SIZE)
        if not batch:
            break
        rows = [
            map(encode_field, row)
            for row in batch
        ]
        writer.writerows(rows)
        logging.info('Did {} batches'.format(i))
        i += 1

    cursor.close()
    f.close()

    logging.info('Sorting...')
    # Sort by postal code, then country code
    subprocess.check_call(['sort', '-t\t', '-k1,1', '-k2,2', '-o', filename, temp_filename])
    os.unlink(temp_filename)


# Generates a C header telling us the order of the fields as written
GEONAMES_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'src', 'geonames_fields.h')

GEONAMES_FIELDS_HEADER_FILE = '''enum geonames_fields {{
    {fields},
    NUM_GEONAMES_FIELDS
}};
'''.format(fields=''',
    '''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(geonames_fields)]))
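
# The generated header looks like (abridged):
#
#   enum geonames_fields {
#       GEONAMES_NAME=0,
#       GEONAMES_ID=1,
#       ...
#       NUM_GEONAMES_FIELDS
#   };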


def write_geonames_fields_header(filename=GEONAMES_FIELDS_HEADER):
    with open(filename, 'w') as f:
        f.write(GEONAMES_FIELDS_HEADER_FILE)


POSTAL_FIELDS_HEADER = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                    'src', 'postal_fields.h')

POSTAL_FIELDS_HEADER_FILE = '''enum gn_postal_fields {{
    {fields},
    NUM_POSTAL_FIELDS
}};
'''.format(fields=''',
    '''.join(['{}={}'.format(f.c_constant, i) for i, f in enumerate(postal_code_fields)]))


def write_postal_fields_header(filename=POSTAL_FIELDS_HEADER):
    with open(filename, 'w') as f:
        f.write(POSTAL_FIELDS_HEADER_FILE)


if __name__ == '__main__':
    # Handle argument parsing here
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--db',
                        default=DEFAULT_GEONAMES_DB_PATH,
                        help='SQLite db file')
    parser.add_argument('-o', '--out',
                        default=DEFAULT_DATA_DIR, help='output directory')
    args = parser.parse_args()
    db = sqlite3.connect(args.db)

    create_geonames_tsv(db, args.out)
    create_postal_codes_tsv(db, args.out)
    write_geonames_fields_header()
    write_postal_fields_header()
    db.close()
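
# Typical invocation (script name is an assumption; the flags are the ones
# defined above):
#
#   python geonames_tsv.py --db /path/to/geonames.db --out data/geonames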