diff --git a/scripts/geodata/geonames/create_geonames_tsv.py b/scripts/geodata/geonames/create_geonames_tsv.py index d19dde72..52e305a6 100644 --- a/scripts/geodata/geonames/create_geonames_tsv.py +++ b/scripts/geodata/geonames/create_geonames_tsv.py @@ -25,6 +25,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) from geodata.file_utils import * from geodata.encoding import safe_encode, safe_decode from geodata.geonames.paths import DEFAULT_GEONAMES_DB_PATH +from geodata.i18n.languages import * from geodata.i18n.unicode_paths import CLDR_DIR from geodata.log import log_to_file @@ -84,6 +85,7 @@ geonames_admin_dictionaries = { # Inserted post-query DUMMY_BOUNDARY_TYPE = '-1 as type' DUMMY_HAS_WIKIPEDIA_ENTRY = '0 as has_wikipedia_entry' +DUMMY_LANGUAGE_PRIORITY = '0 as language_priority' class GeonamesField(object): @@ -101,6 +103,7 @@ geonames_fields = [ GeonamesField(DUMMY_BOUNDARY_TYPE, 'GEONAMES_BOUNDARY_TYPE', is_dummy=True), GeonamesField(DUMMY_HAS_WIKIPEDIA_ENTRY, 'GEONAMES_HAS_WIKIPEDIA_ENTRY', is_dummy=True), GeonamesField('iso_language', 'GEONAMES_ISO_LANGUAGE', default="''"), + GeonamesField(DUMMY_LANGUAGE_PRIORITY, 'GEONAMES_LANGUAGE_PRIORITY', is_dummy=True), GeonamesField('is_preferred_name', 'GEONAMES_IS_PREFERRED_NAME', default='0'), GeonamesField('is_short_name', 'GEONAMES_IS_SHORT_NAME', default='0'), GeonamesField('is_colloquial', 'GEONAMES_IS_COLLOQUIAL', default='0'), @@ -133,6 +136,9 @@ GEONAMES_ID_INDEX = [i for i, f in enumerate(geonames_fields) LANGUAGE_INDEX = [i for i, f in enumerate(geonames_fields) if f.c_constant == 'GEONAMES_ISO_LANGUAGE'][0] +DUMMY_LANGUAGE_PRIORITY_INDEX = [i for i, f in enumerate(geonames_fields) + if f.c_constant == 'GEONAMES_LANGUAGE_PRIORITY'][0] + CANONICAL_NAME_INDEX = [i for i, f in enumerate(geonames_fields) if f.c_constant == 'GEONAMES_CANONICAL'][0] @@ -377,6 +383,8 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR): writer = csv.writer(f, 'tsv_no_quote') + init_languages() + country_code_alpha3_map = {c.alpha2: c.alpha3 for c in pycountry.countries} country_alpha2 = set([c.alpha2 for c in pycountry.countries]) @@ -409,6 +417,39 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR): row = list(row) row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type + language = row[LANGUAGE_INDEX] + + country_code = row[COUNTRY_CODE_INDEX] + + is_preferred = int(row[PREFERRED_INDEX] or 0) + is_historical = int(row[HISTORICAL_INDEX] or 0) + + lang_official = official_languages[country_code.lower()].get(language, None) + null_language = not language.strip() + + is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] + + if is_historical: + language_priority = 0 + elif not null_language and language != 'abbr' and lang_official is None: + language_priority = 1 + elif null_language and not is_preferred and not is_canonical: + language_priority = 2 + elif language == 'abbr' and not is_preferred: + language_priority = 3 + elif language == 'abbr' and is_preferred: + language_priority = 4 + elif lang_official == 0: + language_priority = 5 + elif lang_official == 1: + language_priority = 6 + elif null_language and not is_preferred and is_canonical: + language_priority = 7 + elif is_preferred: + language_priority = 8 + + row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority + alpha2_code = None is_orig_name = False @@ -498,11 +539,13 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR): if alpha2_code and is_orig_name: alpha2_row = row[:] alpha2_row[NAME_INDEX] = alpha2_code + alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10 rows.append(map(encode_field, alpha2_row)) if alpha2_code in country_code_alpha3_map and is_orig_name: alpha3_row = row[:] alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code] + alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10 rows.append(map(encode_field, alpha3_row)) writer.writerows(rows) @@ -523,8 +566,8 @@ def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR): '-k{0},{0}'.format(NAME_INDEX + 1), # If there's a Wikipedia link to this name for the given id, sort first '-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1), - # Historical entries should be sorted last - '-k{0},{0}n'.format(HISTORICAL_INDEX + 1), + # Language priority rules as above + '-k{0},{0}nr'.format(DUMMY_LANGUAGE_PRIORITY_INDEX + 1), # Sort descending by population (basic proxy for relevance) '-k{0},{0}nr'.format(POPULATION_INDEX + 1), # group rows for the same geonames ID together