[geonames] Using official country languages in GeoNames

Al
2015-10-01 00:45:34 -04:00
parent 01856dd36d
commit db3364be30


@@ -1,3 +1,24 @@
+'''
+create_geonames_tsv.py
+----------------------
+
+This script formats the open GeoNames database (as well as
+its accompanying postal codes data set) into a schema'd
+tab-separated value file.
+
+It generates a C header which uses an enum for the field names.
+This way if new fields are added or there's a typo, etc. the
+error will show up at compile-time.
+
+The relevant C modules which operate on this data are:
+geodb_builder.c
+geonames.c
+
+As well as the generated headers:
+geonames_fields.h
+postal_fields.h
+'''
+
 import argparse
 import csv
 import logging
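
For context on the docstring above: a minimal sketch of the enum-header approach it describes. The field names and output path here are hypothetical; the real generator and schema live in this script and emit geonames_fields.h / postal_fields.h.

    # Hypothetical illustration only: field names and output path are made up
    GEONAMES_FIELDS = ['geonames_id', 'name', 'iso_language', 'country_code']

    def write_enum_header(fields, name='geonames_fields', out_path='example_fields.h'):
        guard = name.upper() + '_H'
        with open(out_path, 'w') as f:
            f.write('#ifndef %s\n#define %s\n\n' % (guard, guard))
            f.write('typedef enum {\n')
            # One enum member per TSV column; a typo or a missing field
            # becomes a compile-time error in the C modules
            for i, field in enumerate(fields):
                f.write('    %s_%s = %d,\n' % (name.upper(), field.upper(), i))
            f.write('    %s_NUM_FIELDS = %d\n' % (name.upper(), len(fields)))
            f.write('} %s_t;\n\n' % name)
            f.write('#endif\n')

    write_enum_header(GEONAMES_FIELDS)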
@@ -15,7 +36,7 @@ import unicodedata
 import urllib
 import urlparse
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from lxml import etree
 
 this_dir = os.path.realpath(os.path.dirname(__file__))
@@ -23,6 +44,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 from geodata.csv_utils import *
 from geodata.file_utils import *
+from geodata.countries.country_names import *
 from geodata.encoding import safe_encode, safe_decode
 from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
 from geodata.i18n.languages import *
@@ -56,9 +78,6 @@ POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
 NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
 
-CLDR_ENGLISH_PATH = os.path.join(CLDR_DIR, 'common', 'main', 'en.xml')
-
 class boundary_types:
     COUNTRY = 0
     ADMIN1 = 1
@@ -69,16 +88,16 @@ class boundary_types:
     LOCALITY = 6
     NEIGHBORHOOD = 7
 
-geonames_admin_dictionaries = {
-    boundary_types.COUNTRY: COUNTRY_FEATURE_CODES,
-    boundary_types.ADMIN1: ADMIN_1_FEATURE_CODES,
-    boundary_types.ADMIN2: ADMIN_2_FEATURE_CODES,
-    boundary_types.ADMIN3: ADMIN_3_FEATURE_CODES,
-    boundary_types.ADMIN4: ADMIN_4_FEATURE_CODES,
-    boundary_types.ADMIN_OTHER: ADMIN_OTHER_FEATURE_CODES,
-    boundary_types.LOCALITY: POPULATED_PLACE_FEATURE_CODES,
-    boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
-}
+geonames_admin_dictionaries = OrderedDict([
+    (boundary_types.COUNTRY, COUNTRY_FEATURE_CODES),
+    (boundary_types.ADMIN1, ADMIN_1_FEATURE_CODES),
+    (boundary_types.ADMIN2, ADMIN_2_FEATURE_CODES),
+    (boundary_types.ADMIN3, ADMIN_3_FEATURE_CODES),
+    (boundary_types.ADMIN4, ADMIN_4_FEATURE_CODES),
+    (boundary_types.ADMIN_OTHER, ADMIN_OTHER_FEATURE_CODES),
+    (boundary_types.LOCALITY, POPULATED_PLACE_FEATURE_CODES),
+    (boundary_types.NEIGHBORHOOD, NEIGHBORHOOD_FEATURE_CODES),
+])
 
 # Inserted post-query
 DUMMY_BOUNDARY_TYPE = '-1 as type'
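
The switch to OrderedDict matters because plain dicts in Python 2 make no iteration-order guarantee, while the writer below relies on visiting boundary types from country down to neighborhood. A quick self-contained illustration:

    from collections import OrderedDict

    # Insertion order is preserved, so iteration proceeds
    # largest boundary type first, exactly as inserted
    d = OrderedDict([('country', 0), ('admin1', 1), ('locality', 6)])
    assert list(d) == ['country', 'admin1', 'locality']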
@@ -277,33 +296,6 @@ order by alternate_name, is_preferred_name
 BATCH_SIZE = 2000
 
-IGNORE_COUNTRIES = set(['ZZ'])
-COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS'])
-COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL'])
-
-def cldr_country_names(filename=CLDR_ENGLISH_PATH):
-    xml = etree.parse(open(filename))
-
-    country_names = {}
-
-    for territory in xml.xpath('*//territories/*'):
-        country_code = territory.attrib['type']
-
-        if country_code in IGNORE_COUNTRIES and not country_code.isdigit():
-            continue
-        elif country_code in COUNTRY_USE_SHORT_NAME and territory.attrib.get('alt') != 'short':
-            continue
-        elif country_code in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt') != 'variant':
-            continue
-        elif country_code not in COUNTRY_USE_SHORT_NAME and country_code not in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt'):
-            continue
-
-        country_names[country_code] = safe_encode(territory.text)
-
-    return country_names
-
 wiki_paren_regex = re.compile('(.*)[\s]*\(.*?\)[\s]*')
@@ -354,7 +346,7 @@ def utf8_normalize(s, form='NFD'):
 def get_wikipedia_titles(db):
-    d = defaultdict(list)
+    d = defaultdict(dict)
     cursor = db.execute(wikipedia_query)
@@ -367,13 +359,15 @@ def get_wikipedia_titles(db):
         title = normalize_wikipedia_url(safe_encode(url))
         if title is not None and title.strip():
             title = utf8_normalize(normalize_name(title))
-            d[title.lower()].append((geonames_id, int(is_preferred or 0)))
+            d[title.lower()][geonames_id] = int(is_preferred or 0)
 
-    return {title: sorted(values, key=operator.itemgetter(1), reverse=True)
-            for title, values in d.iteritems()}
+    return d
 
 def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
+    '''
+    Writes geonames.tsv using the specified db to the specified data directory
+    '''
     filename = os.path.join(out_dir, 'geonames.tsv')
     temp_filename = filename + '.tmp'
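
With defaultdict(dict), wiki_titles now maps a normalized title to {geonames_id: is_preferred}, so matching a row against Wikipedia becomes two O(1) dict operations instead of scanning a sorted list of (gid, is_preferred) tuples. Roughly, with made-up data:

    # Illustrative data: title -> {geonames_id: is_preferred}
    wiki_titles = {'paris': {2988507: 1, 4717560: 0}}

    geonames_id = 2988507
    entries = wiki_titles.get('paris', {})
    have_wikipedia = geonames_id in entries        # membership test, no loop
    wiki_preferred = entries.get(geonames_id, 0)   # 1 if this title is preferred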
@@ -383,14 +377,12 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
     init_languages()
-    country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries}
-    country_alpha2 = set([c.alpha2 for c in pycountry.countries])
-
-    country_names = cldr_country_names()
+    init_country_names()
 
     wiki_titles = get_wikipedia_titles(db)
     logging.info('Fetched Wikipedia titles')
 
+    # Iterate over GeoNames boundary types from largest (country) to smallest (neighborhood)
     for boundary_type, codes in geonames_admin_dictionaries.iteritems():
         if boundary_type != boundary_types.COUNTRY:
             predicate = 'where gn.feature_code in ({codes})'.format(
@@ -407,6 +399,7 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
         cursor = db.execute(query)
         i = 1
         while True:
+            # Fetch rows in batches to save memory
             batch = cursor.fetchmany(BATCH_SIZE)
             if not batch:
                 break
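
The fetchmany loop is the standard DB-API pattern for streaming a large result set; only one batch of rows is resident at a time. A self-contained sqlite3 sketch of the same idiom:

    import sqlite3

    BATCH_SIZE = 2000

    db = sqlite3.connect(':memory:')
    db.execute('create table gn (id integer, name text)')
    db.executemany('insert into gn values (?, ?)',
                   [(i, 'place_%d' % i) for i in range(10000)])

    cursor = db.execute('select id, name from gn')
    while True:
        # At most BATCH_SIZE rows are held in memory at once
        batch = cursor.fetchmany(BATCH_SIZE)
        if not batch:
            break
        for row in batch:
            pass  # process each row here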
@@ -422,30 +415,12 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                 is_preferred = int(row[PREFERRED_INDEX] or 0)
                 is_historical = int(row[HISTORICAL_INDEX] or 0)
-                lang_official = official_languages[country_code.lower()].get(language, None)
+                lang_spoken = get_country_languages(country_code.lower(), official=False).get(language, None)
+                lang_official = get_country_languages(country_code.lower()).get(language, None) == 1
                 null_language = not language.strip()
                 is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
 
-                if is_historical:
-                    language_priority = 0
-                elif not null_language and language != 'abbr' and lang_official is None:
-                    language_priority = 1
-                elif null_language and not is_preferred and not is_canonical:
-                    language_priority = 2
-                elif language == 'abbr' and not is_preferred:
-                    language_priority = 3
-                elif language == 'abbr' and is_preferred:
-                    language_priority = 4
-                elif lang_official == 0:
-                    language_priority = 5
-                elif lang_official == 1 or (null_language and not is_preferred and is_canonical):
-                    language_priority = 6
-                elif is_preferred:
-                    language_priority = 7
-
-                row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
 
                 alpha2_code = None
                 is_orig_name = False
@@ -453,7 +428,8 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                     alpha2_code = row[COUNTRY_CODE_INDEX]
                     is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
-                    row[CANONICAL_NAME_INDEX] = country_names[row[COUNTRY_CODE_INDEX]]
+                    # Set the canonical for countries to the local name, see country_official_name in country_names.py
+                    row[CANONICAL_NAME_INDEX] = country_localized_display_name(alpha2_code.lower())
 
                 geonames_id = row[GEONAMES_ID_INDEX]
@@ -463,12 +439,7 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                 if name.isdigit():
                     continue
 
-                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
-
-                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
-
-                have_wikipedia = False
-                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), []))
+                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), {}))
 
                 row[NAME_INDEX] = name
@@ -476,17 +447,59 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                 norm_name = normalize_name(name.lower())
                 for s, repl in saint_replacements:
                     if not wikipedia_entries:
-                        wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                        wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
-                wiki_row = []
-                for gid, is_preferred in wikipedia_entries:
-                    if gid == geonames_id:
-                        wiki_row = row[:]
-                        wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
-                        rows.append(map(encode_field, wiki_row))
-                        have_wikipedia = True
-                        break
+                have_wikipedia = geonames_id in wikipedia_entries
+                wiki_preferred = wikipedia_entries.get(geonames_id, 0)
+
+                '''
+                The following set of heuristics assigns a numerical value to a given name
+                alternative, such that in the case of ambiguous names, this value can be
+                used as part of the ranking function (as indeed it will be during sort).
+                The higher the value, the more likely the given entity resolution.
+                '''
+                if is_historical:
+                    # Historical names, unlikely to be used
+                    language_priority = 0
+                elif not null_language and language != 'abbr' and lang_spoken is None:
+                    # Name of a place in a language not widely spoken there e.g. Japanese name for a US toponym
+                    language_priority = 1
+                elif null_language and not is_preferred and not is_canonical:
+                    # Null-language alternate names not marked as preferred, dubious
+                    language_priority = 2
+                elif language == 'abbr' and not is_preferred:
+                    # Abbreviation, not preferred
+                    language_priority = 3
+                elif language == 'abbr' and is_preferred:
+                    # Abbreviation, preferred e.g. NYC, UAE
+                    language_priority = 4
+                elif lang_spoken == 0 and not is_preferred:
+                    # Non-preferred name but in a spoken (non-official) language
+                    language_priority = 5
+                elif lang_official == 1 and not is_preferred:
+                    # Name in an official language, not preferred
+                    language_priority = 6
+                elif null_language and not is_preferred and is_canonical:
+                    # Canonical name, may be overly official e.g. Islamic Republic of Pakistan
+                    language_priority = 7
+                elif is_preferred and not lang_official:
+                    # Preferred name, not in an official language
+                    language_priority = 8
+                elif is_preferred and lang_official:
+                    # Preferred name in an official language
+                    language_priority = 9
+
+                row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
+
+                if have_wikipedia:
+                    wiki_row = row[:]
+                    wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
+                    rows.append(map(encode_field, wiki_row))
+
+                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
+                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
 
                 have_normalized = False
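
To make the heuristic concrete: language_priority is stored on the row so that downstream ranking (ultimately in the C modules named in the docstring, geodb_builder.c and geonames.c) can prefer likelier resolutions for an ambiguous name. A rough sketch of the intent only, with hypothetical candidates and values:

    # Hypothetical candidates for the ambiguous name 'Georgia':
    # (description, language_priority, population)
    candidates = [
        ('Georgia, the country', 9, 3720400),
        ('Georgia, the US state', 6, 9687653),
        ('Georgia, a historical name', 0, 0),
    ]

    # Higher priority wins; population can break ties during sort
    best = max(candidates, key=lambda c: (c[1], c[2]))
    assert best[0] == 'Georgia, the country'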
@@ -502,8 +515,9 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                 if not have_wikipedia:
                     rows.append(map(encode_field, row))
 
+                # Country names have more specialized logic
                 if boundary_type == boundary_types.COUNTRY:
-                    wikipedia_entries = wiki_titles.get(canonical.lower(), [])
+                    wikipedia_entries = wiki_titles.get(canonical.lower(), {})
 
                     canonical_row_name = normalize_display_name(canonical)
@@ -516,19 +530,19 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                     norm_name = normalize_name(canonical.lower())
                     for s, repl in saint_replacements:
                         if not wikipedia_entries:
-                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
                     if not wikipedia_entries:
                         norm_name = normalize_name(canonical_row_name.lower())
                         for s, repl in saint_replacements:
                             if not wikipedia_entries:
-                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
-                    for gid, is_preferred in wikipedia_entries:
-                        if gid == geonames_id:
-                            have_wikipedia = True
-                            canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
-                            break
+                    have_wikipedia = geonames_id in wikipedia_entries
+                    wiki_preferred = wikipedia_entries.get(geonames_id, 0)
+
+                    if have_wikipedia:
+                        canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
 
                     if (name != canonical):
                         rows.append(map(encode_field, canonical_row))
@@ -543,9 +557,9 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                         alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
                         rows.append(map(encode_field, alpha2_row))
 
-                    if alpha2_code in country_code_alpha3_map and is_orig_name:
+                    if alpha2_code.lower() in country_alpha3_map and is_orig_name:
                         alpha3_row = row[:]
-                        alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code]
+                        alpha3_row[NAME_INDEX] = country_alpha3_map[alpha2_code.lower()]
                         alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
                         rows.append(map(encode_field, alpha3_row))
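
country_alpha3_map isn't defined in this diff; presumably it comes from country_names.py via the wildcard import, keyed by lowercase alpha-2 codes. An equivalent shape can be sketched with pycountry, mirroring the comprehension this commit removes (attribute names per the pycountry version this script already uses):

    import pycountry

    # Same data as the removed country_code_alpha3_map,
    # but keyed by lowercase alpha-2 as the new lookup expects
    country_alpha3_map = {c.alpha2.lower(): c.alpha3 for c in pycountry.countries}

    assert country_alpha3_map['us'] == 'USA'
    assert country_alpha3_map['fr'] == 'FRA'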