[geonames] Storing NFD-normalized names and sorting case-insensitively so that everything with the same normalized name is grouped together
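To illustrate the idea behind the change (a minimal sketch: the place names and the grouping loop below are made up for illustration, only utf8_normalize mirrors the helper added in this commit): NFD decomposes precomposed characters into a base character plus combining marks, so precomposed and decomposed spellings of the same name become identical strings, and lowercasing the normalized form lets a case-insensitive sort put every variant of a name next to each other.

    import unicodedata
    from collections import defaultdict

    def utf8_normalize(s, form='NFD'):
        # Decompose characters (e.g. precomposed 'ã' becomes 'a' + combining tilde)
        # so equivalent spellings compare equal.
        return unicodedata.normalize(form, s)

    # Hypothetical input: the same city name precomposed, decomposed and upper-cased.
    names = [u'S\xe3o Paulo', u'Sa\u0303o Paulo', u'S\xc3O PAULO']

    groups = defaultdict(list)
    for name in names:
        groups[utf8_normalize(name).lower()].append(name)

    # All three spellings share one key, which is what storing NFD-normalized
    # names and sorting case-insensitively achieves in the generated TSV.
    assert len(groups) == 1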
@@ -11,6 +11,8 @@ import sys
 import requests
 import pycountry

+import unicodedata
+
 import urllib
 import urlparse

@@ -328,12 +330,15 @@ def normalize_display_name(name):
     return abbreviated_saint_regex.sub('Saint', name).replace('&', 'and')


+def utf8_normalize(s, form='NFD'):
+    return unicodedata.normalize(form, s)
+
+
 def get_wikipedia_titles(db):
     d = defaultdict(list)

     cursor = db.execute(wikipedia_query)

-    i = 1
     while True:
         batch = cursor.fetchmany(BATCH_SIZE)
         if not batch:
@@ -342,7 +347,7 @@ def get_wikipedia_titles(db):
         for (url, geonames_id, is_preferred) in batch:
             title = normalize_wikipedia_url(safe_encode(url))
             if title is not None and title.strip():
-                title = normalize_name(title)
+                title = utf8_normalize(normalize_name(title))
                 d[title.lower()].append((geonames_id, int(is_preferred or 0)))

     return {title: sorted(values, key=operator.itemgetter(1), reverse=True)
@@ -400,14 +405,16 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):

             geonames_id = row[GEONAMES_ID_INDEX]

-            name = safe_decode(row[NAME_INDEX])
-            canonical = safe_decode(row[CANONICAL_NAME_INDEX])
+            name = utf8_normalize(safe_decode(row[NAME_INDEX]))
+            canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
             row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)

             have_wikipedia = False

             wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), []))

+            row[NAME_INDEX] = name
+
             if boundary_type == boundary_types.COUNTRY:
                 norm_name = normalize_name(name.lower())
                 for s, repl in saint_replacements:
@@ -493,7 +500,7 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
     f.close()

     logging.info('Sorting...')
-    subprocess.check_call(['sort', '-t\t', '-u',
+    subprocess.check_call(['sort', '-t\t', '-u', '--ignore-case',
                            '-k{0},{0}'.format(NAME_INDEX + 1),
                            # If there's a Wikipedia link to this name for the given id, sort first
                            '-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
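For reference, roughly what the updated sort invocation does (a sketch: the column constants and file paths below are hypothetical stand-ins, not the repository's values): '-t' followed by a tab makes tab the field separator, '-k N,N' restricts a sort key to a single column, '--ignore-case' folds case during key comparison so case variants of a normalized name land in the same run, the 'nr' modifier sorts the has-Wikipedia flag numerically in reverse so linked rows come first within a name group, and '-u' emits only the first row of each run whose keys compare equal.

    import subprocess

    NAME_INDEX = 1                       # hypothetical 0-based column of the normalized name
    DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX = 5  # hypothetical 0-based column of the Wikipedia flag

    # sort's -k positions are 1-based, hence the + 1 on the 0-based indices.
    subprocess.check_call(['sort', '-t\t', '-u', '--ignore-case',
                           '-k{0},{0}'.format(NAME_INDEX + 1),
                           '-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
                           '-o', 'geonames_sorted.tsv',   # hypothetical output path
                           'geonames_unsorted.tsv'])      # hypothetical input path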