From acd5d07d179003796bc8de02d947bf1983d7f3bf Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sun, 5 Jul 2015 15:56:46 -0400
Subject: [PATCH] [geonames] Storing NFD-normalized names and sorting
 case-insensitively in order to group everything with the same normalized name
 together

---
 scripts/geodata/geonames/create_geonames_tsv.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/scripts/geodata/geonames/create_geonames_tsv.py b/scripts/geodata/geonames/create_geonames_tsv.py
index 05c525b2..cb6e4681 100644
--- a/scripts/geodata/geonames/create_geonames_tsv.py
+++ b/scripts/geodata/geonames/create_geonames_tsv.py
@@ -11,6 +11,8 @@ import sys
 import requests
 import pycountry
 
+import unicodedata
+
 import urllib
 import urlparse
 
@@ -328,12 +330,15 @@ def normalize_display_name(name):
     return abbreviated_saint_regex.sub('Saint', name).replace('&', 'and')
 
 
+def utf8_normalize(s, form='NFD'):
+    return unicodedata.normalize(form, s)
+
+
 def get_wikipedia_titles(db):
     d = defaultdict(list)
 
     cursor = db.execute(wikipedia_query)
 
-    i = 1
     while True:
         batch = cursor.fetchmany(BATCH_SIZE)
         if not batch:
@@ -342,7 +347,7 @@ def get_wikipedia_titles(db):
         for (url, geonames_id, is_preferred) in batch:
             title = normalize_wikipedia_url(safe_encode(url))
             if title is not None and title.strip():
-                title = normalize_name(title)
+                title = utf8_normalize(normalize_name(title))
                 d[title.lower()].append((geonames_id, int(is_preferred or 0)))
 
     return {title: sorted(values, key=operator.itemgetter(1), reverse=True)
@@ -400,14 +405,16 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
 
                 geonames_id = row[GEONAMES_ID_INDEX]
 
-                name = safe_decode(row[NAME_INDEX])
-                canonical = safe_decode(row[CANONICAL_NAME_INDEX])
+                name = utf8_normalize(safe_decode(row[NAME_INDEX]))
+                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
                 row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
 
                 have_wikipedia = False
 
                 wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), []))
 
+                row[NAME_INDEX] = name
+
                 if boundary_type == boundary_types.COUNTRY:
                     norm_name = normalize_name(name.lower())
                     for s, repl in saint_replacements:
@@ -493,7 +500,7 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
     f.close()
 
     logging.info('Sorting...')
-    subprocess.check_call(['sort', '-t\t', '-u',
+    subprocess.check_call(['sort', '-t\t', '-u', '--ignore-case',
                            '-k{0},{0}'.format(NAME_INDEX + 1),
                            # If there's a Wikipedia link to this name for the given id, sort first
                            '-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),