[geonames] Using official country languages in GeoNames

Al
2015-10-01 00:45:34 -04:00
parent 01856dd36d
commit db3364be30


@@ -1,3 +1,24 @@
+'''
+create_geonames_tsv.py
+----------------------
+
+This script formats the open GeoNames database (as well as
+its accompanying postal codes data set) into a schema'd
+tab-separated value file.
+
+It generates a C header which uses an enum for the field names.
+This way if new fields are added or there's a typo, etc. the
+error will show up at compile-time.
+
+The relevant C modules which operate on this data are:
+geodb_builder.c
+geonames.c
+
+As well as the generated headers:
+geonames_fields.h
+postal_fields.h
+'''
+
 import argparse
 import csv
 import logging
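
For context on the docstring above: a minimal sketch of the enum-header approach it describes. The field names and output path here are hypothetical; the real generator and schema live in this script and emit geonames_fields.h / postal_fields.h.

    # Hypothetical illustration only: field names and output path are made up
    GEONAMES_FIELDS = ['geonames_id', 'name', 'iso_language', 'country_code']

    def write_enum_header(fields, name='geonames_fields', out_path='example_fields.h'):
        guard = name.upper() + '_H'
        with open(out_path, 'w') as f:
            f.write('#ifndef %s\n#define %s\n\n' % (guard, guard))
            f.write('typedef enum {\n')
            # One enum member per TSV column; a typo or a missing field
            # becomes a compile-time error in the C modules
            for i, field in enumerate(fields):
                f.write('    %s_%s = %d,\n' % (name.upper(), field.upper(), i))
            f.write('    %s_NUM_FIELDS = %d\n' % (name.upper(), len(fields)))
            f.write('} %s_t;\n\n' % name)
            f.write('#endif\n')

    write_enum_header(GEONAMES_FIELDS)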
@@ -15,7 +36,7 @@ import unicodedata
 import urllib
 import urlparse
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from lxml import etree
 
 this_dir = os.path.realpath(os.path.dirname(__file__))
@@ -23,6 +44,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 from geodata.csv_utils import *
 from geodata.file_utils import *
+from geodata.countries.country_names import *
 from geodata.encoding import safe_encode, safe_decode
 from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
 from geodata.i18n.languages import *
@@ -56,9 +78,6 @@ POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
 NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
 
-CLDR_ENGLISH_PATH = os.path.join(CLDR_DIR, 'common', 'main', 'en.xml')
-
 class boundary_types:
     COUNTRY = 0
     ADMIN1 = 1
@@ -69,16 +88,16 @@ class boundary_types:
     LOCALITY = 6
     NEIGHBORHOOD = 7
 
-geonames_admin_dictionaries = {
-    boundary_types.COUNTRY: COUNTRY_FEATURE_CODES,
-    boundary_types.ADMIN1: ADMIN_1_FEATURE_CODES,
-    boundary_types.ADMIN2: ADMIN_2_FEATURE_CODES,
-    boundary_types.ADMIN3: ADMIN_3_FEATURE_CODES,
-    boundary_types.ADMIN4: ADMIN_4_FEATURE_CODES,
-    boundary_types.ADMIN_OTHER: ADMIN_OTHER_FEATURE_CODES,
-    boundary_types.LOCALITY: POPULATED_PLACE_FEATURE_CODES,
-    boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
-}
+geonames_admin_dictionaries = OrderedDict([
+    (boundary_types.COUNTRY, COUNTRY_FEATURE_CODES),
+    (boundary_types.ADMIN1, ADMIN_1_FEATURE_CODES),
+    (boundary_types.ADMIN2, ADMIN_2_FEATURE_CODES),
+    (boundary_types.ADMIN3, ADMIN_3_FEATURE_CODES),
+    (boundary_types.ADMIN4, ADMIN_4_FEATURE_CODES),
+    (boundary_types.ADMIN_OTHER, ADMIN_OTHER_FEATURE_CODES),
+    (boundary_types.LOCALITY, POPULATED_PLACE_FEATURE_CODES),
+    (boundary_types.NEIGHBORHOOD, NEIGHBORHOOD_FEATURE_CODES),
+])
 
 # Inserted post-query
 DUMMY_BOUNDARY_TYPE = '-1 as type'
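
The switch to OrderedDict matters because plain dicts in Python 2 make no iteration-order guarantee, while the writer below relies on visiting boundary types from country down to neighborhood. A quick self-contained illustration:

    from collections import OrderedDict

    # Insertion order is preserved, so iteration proceeds
    # largest boundary type first, exactly as inserted
    d = OrderedDict([('country', 0), ('admin1', 1), ('locality', 6)])
    assert list(d) == ['country', 'admin1', 'locality']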
@@ -277,33 +296,6 @@ order by alternate_name, is_preferred_name
 BATCH_SIZE = 2000
 
-IGNORE_COUNTRIES = set(['ZZ'])
-COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS'])
-COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL'])
-
-def cldr_country_names(filename=CLDR_ENGLISH_PATH):
-    xml = etree.parse(open(filename))
-
-    country_names = {}
-
-    for territory in xml.xpath('*//territories/*'):
-        country_code = territory.attrib['type']
-
-        if country_code in IGNORE_COUNTRIES and not country_code.isdigit():
-            continue
-        elif country_code in COUNTRY_USE_SHORT_NAME and territory.attrib.get('alt') != 'short':
-            continue
-        elif country_code in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt') != 'variant':
-            continue
-        elif country_code not in COUNTRY_USE_SHORT_NAME and country_code not in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt'):
-            continue
-
-        country_names[country_code] = safe_encode(territory.text)
-
-    return country_names
-
 wiki_paren_regex = re.compile('(.*)[\s]*\(.*?\)[\s]*')
@@ -354,7 +346,7 @@ def utf8_normalize(s, form='NFD'):
 def get_wikipedia_titles(db):
-    d = defaultdict(list)
+    d = defaultdict(dict)
     cursor = db.execute(wikipedia_query)
@@ -367,13 +359,15 @@ def get_wikipedia_titles(db):
         title = normalize_wikipedia_url(safe_encode(url))
         if title is not None and title.strip():
             title = utf8_normalize(normalize_name(title))
-            d[title.lower()].append((geonames_id, int(is_preferred or 0)))
+            d[title.lower()][geonames_id] = int(is_preferred or 0)
 
-    return {title: sorted(values, key=operator.itemgetter(1), reverse=True)
-            for title, values in d.iteritems()}
+    return d
 
 def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
+    '''
+    Writes geonames.tsv using the specified db to the specified data directory
+    '''
     filename = os.path.join(out_dir, 'geonames.tsv')
     temp_filename = filename + '.tmp'
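
With defaultdict(dict), wiki_titles now maps a normalized title to {geonames_id: is_preferred}, so matching a row against Wikipedia becomes two O(1) dict operations instead of scanning a sorted list of (gid, is_preferred) tuples. Roughly, with made-up data:

    # Illustrative data: title -> {geonames_id: is_preferred}
    wiki_titles = {'paris': {2988507: 1, 4717560: 0}}

    geonames_id = 2988507
    entries = wiki_titles.get('paris', {})
    have_wikipedia = geonames_id in entries        # membership test, no loop
    wiki_preferred = entries.get(geonames_id, 0)   # 1 if this title is preferred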
@@ -383,14 +377,12 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
     init_languages()
-    country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries}
-    country_alpha2 = set([c.alpha2 for c in pycountry.countries])
-
-    country_names = cldr_country_names()
+    init_country_names()
 
     wiki_titles = get_wikipedia_titles(db)
     logging.info('Fetched Wikipedia titles')
 
+    # Iterate over GeoNames boundary types from largest (country) to smallest (neighborhood)
     for boundary_type, codes in geonames_admin_dictionaries.iteritems():
         if boundary_type != boundary_types.COUNTRY:
             predicate = 'where gn.feature_code in ({codes})'.format(
@@ -407,6 +399,7 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
         cursor = db.execute(query)
         i = 1
         while True:
+            # Fetch rows in batches to save memory
             batch = cursor.fetchmany(BATCH_SIZE)
             if not batch:
                 break
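
The fetchmany loop is the standard DB-API pattern for streaming a large result set; only one batch of rows is resident at a time. A self-contained sqlite3 sketch of the same idiom:

    import sqlite3

    BATCH_SIZE = 2000

    db = sqlite3.connect(':memory:')
    db.execute('create table gn (id integer, name text)')
    db.executemany('insert into gn values (?, ?)',
                   [(i, 'place_%d' % i) for i in range(10000)])

    cursor = db.execute('select id, name from gn')
    while True:
        # At most BATCH_SIZE rows are held in memory at once
        batch = cursor.fetchmany(BATCH_SIZE)
        if not batch:
            break
        for row in batch:
            pass  # process each row here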
@@ -422,30 +415,12 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                 is_preferred = int(row[PREFERRED_INDEX] or 0)
                 is_historical = int(row[HISTORICAL_INDEX] or 0)
-                lang_official = official_languages[country_code.lower()].get(language, None)
+                lang_spoken = get_country_languages(country_code.lower(), official=False).get(language, None)
+                lang_official = get_country_languages(country_code.lower()).get(language, None) == 1
                 null_language = not language.strip()
                 is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
 
-                if is_historical:
-                    language_priority = 0
-                elif not null_language and language != 'abbr' and lang_official is None:
-                    language_priority = 1
-                elif null_language and not is_preferred and not is_canonical:
-                    language_priority = 2
-                elif language == 'abbr' and not is_preferred:
-                    language_priority = 3
-                elif language == 'abbr' and is_preferred:
-                    language_priority = 4
-                elif lang_official == 0:
-                    language_priority = 5
-                elif lang_official == 1 or (null_language and not is_preferred and is_canonical):
-                    language_priority = 6
-                elif is_preferred:
-                    language_priority = 7
-
-                row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
 
                 alpha2_code = None
                 is_orig_name = False
@@ -453,7 +428,8 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                     alpha2_code = row[COUNTRY_CODE_INDEX]
                     is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
-                    row[CANONICAL_NAME_INDEX] = country_names[row[COUNTRY_CODE_INDEX]]
+                    # Set the canonical for countries to the local name, see country_official_name in country_names.py
+                    row[CANONICAL_NAME_INDEX] = country_localized_display_name(alpha2_code.lower())
 
                 geonames_id = row[GEONAMES_ID_INDEX]
@@ -463,12 +439,7 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                 if name.isdigit():
                     continue
 
-                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
-
-                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
-
-                have_wikipedia = False
-                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), []))
+                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), {}))
 
                 row[NAME_INDEX] = name
@@ -476,17 +447,59 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                 norm_name = normalize_name(name.lower())
                 for s, repl in saint_replacements:
                     if not wikipedia_entries:
-                        wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                        wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
-                wiki_row = []
-                for gid, is_preferred in wikipedia_entries:
-                    if gid == geonames_id:
-                        wiki_row = row[:]
-                        wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
-                        rows.append(map(encode_field, wiki_row))
-                        have_wikipedia = True
-                        break
+                have_wikipedia = geonames_id in wikipedia_entries
+                wiki_preferred = wikipedia_entries.get(geonames_id, 0)
+
+                '''
+                The following set of heuristics assigns a numerical value to a given name
+                alternative, such that in the case of ambiguous names, this value can be
+                used as part of the ranking function (as indeed it will be during sort).
+                The higher the value, the more likely the given entity resolution.
+                '''
+                if is_historical:
+                    # Historical names, unlikely to be used
+                    language_priority = 0
+                elif not null_language and language != 'abbr' and lang_spoken is None:
+                    # Name of a place in a language not widely spoken there e.g. Japanese name for a US toponym
+                    language_priority = 1
+                elif null_language and not is_preferred and not is_canonical:
+                    # Null-language alternate names not marked as preferred, dubious
+                    language_priority = 2
+                elif language == 'abbr' and not is_preferred:
+                    # Abbreviation, not preferred
+                    language_priority = 3
+                elif language == 'abbr' and is_preferred:
+                    # Abbreviation, preferred e.g. NYC, UAE
+                    language_priority = 4
+                elif lang_spoken == 0 and not is_preferred:
+                    # Non-preferred name but in a spoken (non-official) language
+                    language_priority = 5
+                elif lang_official == 1 and not is_preferred:
+                    # Name in an official language, not preferred
+                    language_priority = 6
+                elif null_language and not is_preferred and is_canonical:
+                    # Canonical name, may be overly official e.g. Islamic Republic of Pakistan
+                    language_priority = 7
+                elif is_preferred and not lang_official:
+                    # Preferred name, not in an official language
+                    language_priority = 8
+                elif is_preferred and lang_official:
+                    # Preferred name in an official language
+                    language_priority = 9
+
+                row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
+
+                if have_wikipedia:
+                    wiki_row = row[:]
+                    wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
+                    rows.append(map(encode_field, wiki_row))
+
+                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
+                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
 
                 have_normalized = False
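
To make the heuristic concrete: language_priority is stored on the row so that downstream ranking (ultimately in the C modules named in the docstring, geodb_builder.c and geonames.c) can prefer likelier resolutions for an ambiguous name. A rough sketch of the intent only, with hypothetical candidates and values:

    # Hypothetical candidates for the ambiguous name 'Georgia':
    # (description, language_priority, population)
    candidates = [
        ('Georgia, the country', 9, 3720400),
        ('Georgia, the US state', 6, 9687653),
        ('Georgia, a historical name', 0, 0),
    ]

    # Higher priority wins; population can break ties during sort
    best = max(candidates, key=lambda c: (c[1], c[2]))
    assert best[0] == 'Georgia, the country'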
@@ -502,8 +515,9 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                 if not have_wikipedia:
                     rows.append(map(encode_field, row))
 
+                # Country names have more specialized logic
                 if boundary_type == boundary_types.COUNTRY:
-                    wikipedia_entries = wiki_titles.get(canonical.lower(), [])
+                    wikipedia_entries = wiki_titles.get(canonical.lower(), {})
 
                     canonical_row_name = normalize_display_name(canonical)
@@ -516,19 +530,19 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                     norm_name = normalize_name(canonical.lower())
                     for s, repl in saint_replacements:
                         if not wikipedia_entries:
-                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
                     if not wikipedia_entries:
                         norm_name = normalize_name(canonical_row_name.lower())
                         for s, repl in saint_replacements:
                             if not wikipedia_entries:
-                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
-                    for gid, is_preferred in wikipedia_entries:
-                        if gid == geonames_id:
-                            have_wikipedia = True
-                            canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
-                            break
+                    have_wikipedia = geonames_id in wikipedia_entries
+                    wiki_preferred = wikipedia_entries.get(geonames_id, 0)
+
+                    if have_wikipedia:
+                        canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
 
                     if (name != canonical):
                         rows.append(map(encode_field, canonical_row))
@@ -543,9 +557,9 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
                         alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
                         rows.append(map(encode_field, alpha2_row))
 
-                    if alpha2_code in country_code_alpha3_map and is_orig_name:
+                    if alpha2_code.lower() in country_alpha3_map and is_orig_name:
                         alpha3_row = row[:]
-                        alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code]
+                        alpha3_row[NAME_INDEX] = country_alpha3_map[alpha2_code.lower()]
                         alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
                         rows.append(map(encode_field, alpha3_row))
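
country_alpha3_map isn't defined in this diff; presumably it comes from country_names.py via the wildcard import, keyed by lowercase alpha-2 codes. An equivalent shape can be sketched with pycountry, mirroring the comprehension this commit removes (attribute names per the pycountry version this script already uses):

    import pycountry

    # Same data as the removed country_code_alpha3_map,
    # but keyed by lowercase alpha-2 as the new lookup expects
    country_alpha3_map = {c.alpha2.lower(): c.alpha3 for c in pycountry.countries}

    assert country_alpha3_map['us'] == 'USA'
    assert country_alpha3_map['fr'] == 'FRA'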