From db3364be3051a283987d2db1139c7400ad7137e4 Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 1 Oct 2015 00:45:34 -0400
Subject: [PATCH] [geonames] Using official country languages in GeoNames

---
 .../geodata/geonames/create_geonames_tsv.py | 202 ++++++++++--------
 1 file changed, 108 insertions(+), 94 deletions(-)

diff --git a/scripts/geodata/geonames/create_geonames_tsv.py b/scripts/geodata/geonames/create_geonames_tsv.py
index c2e78dda..dea4a1ae 100644
--- a/scripts/geodata/geonames/create_geonames_tsv.py
+++ b/scripts/geodata/geonames/create_geonames_tsv.py
@@ -1,3 +1,24 @@
+'''
+create_geonames_tsv.py
+----------------------
+
+This script formats the open GeoNames database (as well as
+its accompanying postal codes data set) into a schema'd
+tab-separated value file.
+
+It also generates C headers which define the field names as
+enums, so that if new fields are added or a field name is
+mistyped, the error shows up at compile time.
+
+The relevant C modules which operate on this data are:
+    geodb_builder.c
+    geonames.c
+
+As well as the generated headers:
+    geonames_fields.h
+    postal_fields.h
+'''
+
 import argparse
 import csv
 import logging
@@ -15,7 +36,7 @@ import unicodedata
 import urllib
 import urlparse
 
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from lxml import etree
 
 this_dir = os.path.realpath(os.path.dirname(__file__))
@@ -23,6 +44,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 from geodata.csv_utils import *
 from geodata.file_utils import *
+from geodata.countries.country_names import *
 from geodata.encoding import safe_encode, safe_decode
 from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
 from geodata.i18n.languages import *
@@ -56,9 +78,6 @@ POPULATED_PLACE_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
 
 NEIGHBORHOOD_FEATURE_CODES = ('PPLX', )
 
-CLDR_ENGLISH_PATH = os.path.join(CLDR_DIR, 'common', 'main', 'en.xml')
-
-
 class boundary_types:
     COUNTRY = 0
     ADMIN1 = 1
@@ -69,16 +88,16 @@ class boundary_types:
     LOCALITY = 6
     NEIGHBORHOOD = 7
 
-geonames_admin_dictionaries = {
-    boundary_types.COUNTRY: COUNTRY_FEATURE_CODES,
-    boundary_types.ADMIN1: ADMIN_1_FEATURE_CODES,
-    boundary_types.ADMIN2: ADMIN_2_FEATURE_CODES,
-    boundary_types.ADMIN3: ADMIN_3_FEATURE_CODES,
-    boundary_types.ADMIN4: ADMIN_4_FEATURE_CODES,
-    boundary_types.ADMIN_OTHER: ADMIN_OTHER_FEATURE_CODES,
-    boundary_types.LOCALITY: POPULATED_PLACE_FEATURE_CODES,
-    boundary_types.NEIGHBORHOOD: NEIGHBORHOOD_FEATURE_CODES,
-}
+geonames_admin_dictionaries = OrderedDict([
+    (boundary_types.COUNTRY, COUNTRY_FEATURE_CODES),
+    (boundary_types.ADMIN1, ADMIN_1_FEATURE_CODES),
+    (boundary_types.ADMIN2, ADMIN_2_FEATURE_CODES),
+    (boundary_types.ADMIN3, ADMIN_3_FEATURE_CODES),
+    (boundary_types.ADMIN4, ADMIN_4_FEATURE_CODES),
+    (boundary_types.ADMIN_OTHER, ADMIN_OTHER_FEATURE_CODES),
+    (boundary_types.LOCALITY, POPULATED_PLACE_FEATURE_CODES),
+    (boundary_types.NEIGHBORHOOD, NEIGHBORHOOD_FEATURE_CODES),
+])
 
 # Inserted post-query
 DUMMY_BOUNDARY_TYPE = '-1 as type'
@@ -277,33 +296,6 @@ order by alternate_name, is_preferred_name
 
 BATCH_SIZE = 2000
 
-IGNORE_COUNTRIES = set(['ZZ'])
-
-COUNTRY_USE_SHORT_NAME = set(['HK', 'MM', 'MO', 'PS'])
-COUNTRY_USE_VARIANT_NAME = set(['CD', 'CG', 'CI', 'TL'])
-
-
-def cldr_country_names(filename=CLDR_ENGLISH_PATH):
-    xml = etree.parse(open(filename))
-
-    country_names = {}
-
-    for territory in xml.xpath('*//territories/*'):
-        country_code = territory.attrib['type']
-        if country_code in IGNORE_COUNTRIES and not country_code.isdigit():
-            continue
-        elif country_code in COUNTRY_USE_SHORT_NAME and territory.attrib.get('alt') != 'short':
-            continue
-        elif country_code in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt') != 'variant':
-            continue
-        elif country_code not in COUNTRY_USE_SHORT_NAME and country_code not in COUNTRY_USE_VARIANT_NAME and territory.attrib.get('alt'):
-            continue
-
-        country_names[country_code] = safe_encode(territory.text)
-
-    return country_names
-
-
 
 wiki_paren_regex = re.compile('(.*)[\s]*\(.*?\)[\s]*')
 
@@ -354,7 +346,7 @@ def utf8_normalize(s, form='NFD'):
 
 
 def get_wikipedia_titles(db):
-    d = defaultdict(list)
+    d = defaultdict(dict)
 
     cursor = db.execute(wikipedia_query)
 
@@ -367,13 +359,15 @@
         title = normalize_wikipedia_url(safe_encode(url))
         if title is not None and title.strip():
            title = utf8_normalize(normalize_name(title))
-            d[title.lower()].append((geonames_id, int(is_preferred or 0)))
+            d[title.lower()][geonames_id] = int(is_preferred or 0)
 
-    return {title: sorted(values, key=operator.itemgetter(1), reverse=True)
-            for title, values in d.iteritems()}
+    return d
 
 
 def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
+    '''
+    Writes geonames.tsv using the specified db to the specified data directory
+    '''
     filename = os.path.join(out_dir, 'geonames.tsv')
     temp_filename = filename + '.tmp'
 
@@ -383,14 +377,12 @@
 
     init_languages()
 
-    country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries}
-    country_alpha2 = set([c.alpha2 for c in pycountry.countries])
-
-    country_names = cldr_country_names()
+    init_country_names()
 
     wiki_titles = get_wikipedia_titles(db)
     logging.info('Fetched Wikipedia titles')
 
+    # Iterate over GeoNames boundary types from largest (country) to smallest (neighborhood)
     for boundary_type, codes in geonames_admin_dictionaries.iteritems():
         if boundary_type != boundary_types.COUNTRY:
             predicate = 'where gn.feature_code in ({codes})'.format(
@@ -407,6 +399,7 @@
         cursor = db.execute(query)
         i = 1
         while True:
+            # Fetch rows in batches to save memory
            batch = cursor.fetchmany(BATCH_SIZE)
            if not batch:
                break
@@ -422,30 +415,12 @@
                is_preferred = int(row[PREFERRED_INDEX] or 0)
                is_historical = int(row[HISTORICAL_INDEX] or 0)
 
-                lang_official = official_languages[country_code.lower()].get(language, None)
+                lang_spoken = get_country_languages(country_code.lower(), official=False).get(language, None)
+                lang_official = get_country_languages(country_code.lower()).get(language, None) == 1
 
                null_language = not language.strip()
                is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
 
-                if is_historical:
-                    language_priority = 0
-                elif not null_language and language != 'abbr' and lang_official is None:
-                    language_priority = 1
-                elif null_language and not is_preferred and not is_canonical:
-                    language_priority = 2
-                elif language == 'abbr' and not is_preferred:
-                    language_priority = 3
-                elif language == 'abbr' and is_preferred:
-                    language_priority = 4
-                elif lang_official == 0:
-                    language_priority = 5
-                elif lang_official == 1 or (null_language and not is_preferred and is_canonical):
-                    language_priority = 6
-                elif is_preferred:
-                    language_priority = 7
-
-                row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
-
                alpha2_code = None
                is_orig_name = False
 
                if boundary_type == boundary_types.COUNTRY:
                    alpha2_code = row[COUNTRY_CODE_INDEX]
                    is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
-                    row[CANONICAL_NAME_INDEX] = country_names[row[COUNTRY_CODE_INDEX]]
+                    # Set the canonical name for countries to the localized display name, see country_localized_display_name in country_names.py
+                    row[CANONICAL_NAME_INDEX] = country_localized_display_name(alpha2_code.lower())
 
                geonames_id = row[GEONAMES_ID_INDEX]
 
@@ -463,12 +439,7 @@
                if name.isdigit():
                    continue
 
-                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
-                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
-
-                have_wikipedia = False
-
-                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), []))
+                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), {}))
 
                row[NAME_INDEX] = name
 
@@ -476,17 +447,59 @@
                norm_name = normalize_name(name.lower())
                for s, repl in saint_replacements:
                    if not wikipedia_entries:
-                        wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                        wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
                wiki_row = []
 
-                for gid, is_preferred in wikipedia_entries:
-                    if gid == geonames_id:
-                        wiki_row = row[:]
-                        wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
-                        rows.append(map(encode_field, wiki_row))
-                        have_wikipedia = True
-                        break
+                have_wikipedia = geonames_id in wikipedia_entries
+                wiki_preferred = wikipedia_entries.get(geonames_id, 0)
+
+                '''
+                The following heuristics assign a numerical priority to each name
+                alternative, so that when a name is ambiguous, the priority can be
+                used as part of the ranking function (as it is when the rows are sorted).
+                The higher the value, the more likely the name identifies the entity.
+                '''
+                if is_historical:
+                    # Historical name, unlikely to be used
+                    language_priority = 0
+                elif not null_language and language != 'abbr' and lang_spoken is None:
+                    # Name in a language not spoken in the country, e.g. a Japanese name for a US toponym
+                    language_priority = 1
+                elif null_language and not is_preferred and not is_canonical:
+                    # Null-language alternate name not marked as preferred, dubious
+                    language_priority = 2
+                elif language == 'abbr' and not is_preferred:
+                    # Abbreviation, not preferred
+                    language_priority = 3
+                elif language == 'abbr' and is_preferred:
+                    # Abbreviation, preferred, e.g. NYC, UAE
+                    language_priority = 4
+                elif lang_spoken == 0 and not is_preferred:
+                    # Non-preferred name, but in a spoken (non-official) language
+                    language_priority = 5
+                elif lang_official == 1 and not is_preferred:
+                    # Name in an official language, not preferred
+                    language_priority = 6
+                elif null_language and not is_preferred and is_canonical:
+                    # Canonical name, may be overly official, e.g. Islamic Republic of Pakistan
+                    language_priority = 7
+                elif is_preferred and not lang_official:
+                    # Preferred name, not in an official language
+                    language_priority = 8
+                elif is_preferred and lang_official:
+                    # Preferred name in an official language
+                    language_priority = 9
+
+                row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority
+
+                if have_wikipedia:
+                    wiki_row = row[:]
+                    wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
+                    rows.append(map(encode_field, wiki_row))
+
+                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
+                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
 
                have_normalized = False
 
@@ -502,8 +515,9 @@
                if not have_wikipedia:
                    rows.append(map(encode_field, row))
 
+                # Country names have more specialized logic
                if boundary_type == boundary_types.COUNTRY:
-                    wikipedia_entries = wiki_titles.get(canonical.lower(), [])
+                    wikipedia_entries = wiki_titles.get(canonical.lower(), {})
 
                    canonical_row_name = normalize_display_name(canonical)
 
@@ -516,19 +530,19 @@
                    norm_name = normalize_name(canonical.lower())
                    for s, repl in saint_replacements:
                        if not wikipedia_entries:
-                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
                    if not wikipedia_entries:
                        norm_name = normalize_name(canonical_row_name.lower())
                        for s, repl in saint_replacements:
                            if not wikipedia_entries:
-                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
+                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})
 
-                    for gid, is_preferred in wikipedia_entries:
-                        if gid == geonames_id:
-                            have_wikipedia = True
-                            canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
-                            break
+                    have_wikipedia = geonames_id in wikipedia_entries
+                    wiki_preferred = wikipedia_entries.get(geonames_id, 0)
+
+                    if have_wikipedia:
+                        canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
 
                    if (name != canonical):
                        rows.append(map(encode_field, canonical_row))
@@ -543,9 +557,9 @@
                        alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
                        rows.append(map(encode_field, alpha2_row))
 
-                    if alpha2_code in country_code_alpha3_map and is_orig_name:
+                    if alpha2_code.lower() in country_alpha3_map and is_orig_name:
                        alpha3_row = row[:]
-                        alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code]
+                        alpha3_row[NAME_INDEX] = country_alpha3_map[alpha2_code.lower()]
                        alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
                        rows.append(map(encode_field, alpha3_row))
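
Note: the heart of this patch is the reworked language-priority chain. Below is a
self-contained sketch of that heuristic as a review aid only; it is not part of the
patch. The flag semantics are assumptions inferred from the branch comments:
lang_spoken is the value of get_country_languages(cc, official=False).get(lang)
(None when the language is not spoken in the country at all, otherwise 0 or 1),
and lang_official is a boolean, as in the patched code.

    def language_priority(language, lang_spoken, lang_official,
                          is_preferred, is_historical, is_canonical):
        '''Rank a GeoNames name alternative (higher == more likely to be used).'''
        null_language = not language.strip()
        if is_historical:
            return 0  # historical name, unlikely to be used
        elif not null_language and language != 'abbr' and lang_spoken is None:
            return 1  # language not spoken in the country
        elif null_language and not is_preferred and not is_canonical:
            return 2  # null-language alternate name, not preferred, dubious
        elif language == 'abbr' and not is_preferred:
            return 3  # abbreviation, not preferred
        elif language == 'abbr' and is_preferred:
            return 4  # abbreviation, preferred, e.g. NYC, UAE
        elif lang_spoken == 0 and not is_preferred:
            return 5  # spoken (non-official) language, not preferred
        elif lang_official and not is_preferred:
            return 6  # official language, not preferred
        elif null_language and not is_preferred and is_canonical:
            return 7  # canonical name, may be overly official
        elif is_preferred and not lang_official:
            return 8  # preferred name, not in an official language
        elif is_preferred and lang_official:
            return 9  # preferred name in an official language
        # Defensive default: the chain in the patch has no else, so a
        # non-preferred name with lang_spoken == 1 but lang_official False
        # (only possible if the official and non-official views of
        # get_country_languages() disagree) would leave language_priority
        # unassigned there.
        return 0

    if __name__ == '__main__':
        # A preferred name in an official language outranks an abbreviation
        assert language_priority('en', 1, True, True, False, False) == 9
        assert language_priority('abbr', None, False, False, False, False) == 3
        # Historical names always rank last
        assert language_priority('en', 1, True, True, True, False) == 0

If every language with lang_spoken == 1 is also official in get_country_languages(),
the defensive default is unreachable and the sketch is exactly equivalent to the
chain added by the patch.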