[geodisambig] Adding the presence of a Wikipedia link to the GeoNames output (an unqualified entry for the name in Wikipedia usually indicates a primary meaning). Ranking the ambiguous entries for each term so that the top entry can be selected when no further information is available

Al
2015-06-30 18:00:07 -04:00
parent 8d64c9301e
commit 071d6bb392


@@ -1,6 +1,7 @@
import argparse
import csv
import logging
import operator
import os
import re
import sqlite3
@@ -10,13 +11,17 @@ import sys
import requests
import pycountry
import urllib
import urlparse
from collections import defaultdict
from lxml import etree
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.file_utils import *
from geodata.encoding import safe_encode
from geodata.encoding import safe_encode, safe_decode
from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH
from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.log import log_to_file
@@ -27,7 +32,7 @@ csv.register_dialect('tsv_no_quote', delimiter='\t', quoting=csv.QUOTE_NONE, quo
def encode_field(value):
return multispace_regex.sub(' ', safe_encode((value or '')))
return multispace_regex.sub(' ', safe_encode((value if value is not None else '')))
log_to_file(sys.stderr)
@@ -76,6 +81,7 @@ geonames_admin_dictionaries = {
# Inserted post-query
DUMMY_BOUNDARY_TYPE = '-1 as type'
DUMMY_HAS_WIKIPEDIA_ENTRY = '0 as has_wikipedia_entry'
class GeonamesField(object):
@@ -91,8 +97,12 @@ geonames_fields = [
GeonamesField('gn.geonames_id as geonames_id', 'GEONAMES_ID'),
GeonamesField('gn.name as canonical', 'GEONAMES_CANONICAL'),
GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True),
GeonamesField(DUMMY_HAS_WIKIPEDIA_ENTRY, 'GEONAMES_HAS_WIKIPEDIA_ENTRY', is_dummy=True),
GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"),
GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'),
GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'),
GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'),
GeonamesField('is_historical', 'GEONAMES_IS_HISTORICAL', default='0'),
GeonamesField('population', 'GEONAMES_POPULATION'),
GeonamesField('latitude', 'GEONAMES_LATITUDE'),
GeonamesField('longitude', 'GEONAMES_LONGITUDE'),
@@ -109,11 +119,17 @@ geonames_fields = [
]
DUMMY_BOUNDARY_TYPE_INDEX = [i for i, f in enumerate(geonames_fields)
if f.is_dummy][0]
if f.c_constant == 'GEONAMES_BOUNDARY_TYPE'][0]
DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_HAS_WIKIPEDIA_ENTRY'][0]
GEONAMES_ID_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_ID'][0]
LANGUAGE_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_ISO_LANGUAGE'][0]
CANONICAL_NAME_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_CANONICAL'][0]
@@ -126,6 +142,12 @@ COUNTRY_CODE_INDEX = [i for i, f in enumerate(geonames_fields)
POPULATION_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_POPULATION'][0]
PREFERRED_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_IS_PREFERRED_NAME'][0]
HISTORICAL_INDEX = [i for i, f in enumerate(geonames_fields)
if f.c_constant == 'GEONAMES_IS_HISTORICAL'][0]
geonames_admin_joins = '''
left join admin1_codes a1
@@ -222,6 +244,15 @@ group by postal_code, p.country_code
fields=','.join([f.name for f in postal_code_fields]),
exclude_country_codes=','.join("'{}'".format(code) for code in IGNORE_COUNTRY_POSTAL_CODES))
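# GeoNames stores external URLs in alternate_names under iso_language = 'link';
# restrict to English Wikipedia links and surface is_preferred_name for ranking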
wikipedia_query = '''
select alternate_name, geonames_id, is_preferred_name
from alternate_names
where iso_language = 'link'
and alternate_name like '%%en.wikipedia%%'
order by alternate_name, is_preferred_name
'''
BATCH_SIZE = 2000
@@ -252,6 +283,72 @@ def cldr_country_names(filename=CLDR_ENGLISH_PATH):
return country_names
wiki_paren_regex = re.compile('(.*)[\s]*\(.*?\)[\s]*')
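# Wikipedia titles use underscores in place of spaces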
def normalize_wikipedia_title(title):
return safe_decode(title).replace(u'_', u' ')
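# Extract the article title from a Wikipedia URL, preferring a ?title= query
# parameter (index.php-style links) and falling back to the last path component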
def normalize_wikipedia_url(url):
url = urllib.unquote_plus(url)
parsed = urlparse.urlsplit(url)
if parsed.query:
params = urlparse.parse_qs(parsed.query)
if 'title' in params:
return normalize_wikipedia_title(params['title'][0])
title = parsed.path.rsplit('/', 1)[-1]
if title not in ('index.php', 'index.html'):
return normalize_wikipedia_title(title)
return None
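# Loose normalization used when matching GeoNames names against Wikipedia titles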
def normalize_name(name):
name = name.replace('&', 'and')
name = name.replace('-', ' ')
name = name.replace(', ', ' ')
name = name.replace(',', ' ')
return name
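# Spelling variants of "Saint" tried when a country name has no direct match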
saint_replacements = [
('st.', 'saint'),
('st.', 'st'),
('st', 'saint')
]
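# Display form: expand the "St." abbreviation to "Saint" and "&" to "and"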
abbreviated_saint_regex = re.compile(r'\bSt(\.|\b)')
def normalize_display_name(name):
return abbreviated_saint_regex.sub('Saint', name).replace('&', 'and')
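# Build a mapping of normalized, lowercased Wikipedia titles to the list of
# (geonames_id, is_preferred) pairs that link to them, preferred links first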
def get_wikipedia_titles(db):
d = defaultdict(list)
cursor = db.execute(wikipedia_query)
i = 1
while True:
batch = cursor.fetchmany(BATCH_SIZE)
if not batch:
break
for (url, geonames_id, is_preferred) in batch:
title = normalize_wikipedia_url(safe_encode(url))
if title is not None and title.strip():
title = normalize_name(title)
d[title.lower()].append((geonames_id, int(is_preferred or 0)))
return {title: sorted(values, key=operator.itemgetter(1), reverse=True)
for title, values in d.iteritems()}
def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
filename = os.path.join(out_dir, 'geonames.tsv')
temp_filename = filename + '.tmp'
@@ -265,6 +362,9 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
country_names = cldr_country_names()
wiki_titles = get_wikipedia_titles(db)
logging.info('Fetched Wikipedia titles')
for boundary_type, codes in geonames_admin_dictionaries.iteritems():
if boundary_type != boundary_types.COUNTRY:
predicate = 'where gn.feature_code in ({codes})'.format(
@@ -286,26 +386,102 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
break
rows = []
for row in batch:
row = map(encode_field, row)
row = list(row)
row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type
alpha2_code = None
is_orig_name = False
if boundary_type == boundary_types.COUNTRY:
alpha2_code = row[COUNTRY_CODE_INDEX]
is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]
is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
row[CANONICAL_NAME_INDEX] = country_names[row[COUNTRY_CODE_INDEX]]
geonames_id = row[GEONAMES_ID_INDEX]
name = safe_decode(row[NAME_INDEX])
canonical = safe_decode(row[CANONICAL_NAME_INDEX])
row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)
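# Look up this name among the Wikipedia link titles; if one of the linked
# GeoNames ids matches this row, emit it with has_wikipedia_entry set
# (1 = linked, 2 = preferred link), which sorts it ahead of unlinked rows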
have_wikipedia = False
wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), []))
if boundary_type == boundary_types.COUNTRY:
norm_name = normalize_name(name.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
wiki_row = []
for gid, is_preferred in wikipedia_entries:
if gid == geonames_id:
wiki_row = row[:]
wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
rows.append(map(encode_field, wiki_row))
have_wikipedia = True
break
have_normalized = False
if is_orig_name:
canonical_row = wiki_row[:] if have_wikipedia else row[:]
canonical_row_name = normalize_display_name(name)
if canonical_row_name != name:
canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
have_normalized = True
rows.append(map(encode_field, canonical_row))
if not have_wikipedia:
rows.append(map(encode_field, row))
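# For countries, also try the CLDR canonical English name (and its display
# form) against the Wikipedia titles and emit extra rows under those names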
if boundary_type == boundary_types.COUNTRY:
wikipedia_entries = wiki_titles.get(canonical.lower(), [])
canonical_row_name = normalize_display_name(canonical)
canonical_row = row[:]
if is_orig_name:
canonical = safe_decode(canonical)
canonical_row[NAME_INDEX] = safe_encode(canonical)
norm_name = normalize_name(canonical.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
if not wikipedia_entries:
norm_name = normalize_name(canonical_row_name.lower())
for s, repl in saint_replacements:
if not wikipedia_entries:
wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), [])
for gid, is_preferred in wikipedia_entries:
if gid == geonames_id:
have_wikipedia = True
canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = is_preferred + 1
break
if (name != canonical):
rows.append(map(encode_field, canonical_row))
if canonical_row_name != canonical and canonical_row_name != name:
canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
rows.append(map(encode_field, canonical_row))
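# Also emit rows under the ISO alpha-2 and alpha-3 country codes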
if alpha2_code and is_orig_name:
alpha2_row = row[:]
alpha2_row[NAME_INDEX] = alpha2_code
rows.append(alpha2_row)
rows.append(map(encode_field, alpha2_row))
if alpha2_code in country_code_alpha3_map and is_orig_name:
alpha3_row = row[:]
alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code]
rows.append(alpha3_row)
rows.append(row)
rows.append(map(encode_field, alpha3_row))
writer.writerows(rows)
logging.info('Did {} batches'.format(i))
@@ -319,8 +495,18 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
logging.info('Sorting...')
subprocess.check_call(['sort', '-t\t', '-u',
'-k{0},{0}'.format(NAME_INDEX + 1),
# If there's a Wikipedia link to this name for the given id, sort first
'-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
# Historical entries should be sorted last
'-k{0},{0}n'.format(HISTORICAL_INDEX + 1),
# Sort descending by population (basic proxy for relevance)
'-k{0},{0}nr'.format(POPULATION_INDEX + 1),
# group rows for the same geonames ID together
'-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
# preferred names come first within that grouping
'-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
# since uniquing is done on the sort key, add language
'-k{0},{0}'.format(LANGUAGE_INDEX + 1),
'-o', filename, temp_filename])
os.unlink(temp_filename)
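For illustration only (not part of this commit), a minimal sketch of how a consumer could use the ranking: after the sort above, rows sharing a name are adjacent and ordered by the Wikipedia-link, historical, population and preferred-name keys, so the first row per name is the default candidate. NAME_COLUMN is a hypothetical placeholder for whichever column of geonames.tsv holds the alternate name (NAME_INDEX in the script).

import csv
from itertools import groupby

NAME_COLUMN = 0  # hypothetical: substitute the real NAME_INDEX column

def top_candidates(path):
    """Return the highest-ranked row for each distinct name in geonames.tsv."""
    candidates = {}
    with open(path, 'rb') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        # Rows for the same name are adjacent after the sort; the first row in
        # each group is the one ranked highest by the Wikipedia-link,
        # historical, population and preferred-name keys.
        for name, group in groupby(reader, key=lambda r: r[NAME_COLUMN]):
            candidates[name] = next(group)
    return candidates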