From df20e2cbc0353edcd2b7535ed5502c72cdd1a991 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 4 Sep 2015 14:13:26 -0400
Subject: [PATCH] [osm] Including toponyms in the training data for countries
 where the unqualified place names can be assumed to be examples of a given
 language

---
 scripts/geodata/language_id/disambiguation.py    | 16 +++++++++++++++-
 scripts/geodata/osm/osm_address_training_data.py | 15 +++++++--------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py
index b8dbb081..532e5132 100644
--- a/scripts/geodata/language_id/disambiguation.py
+++ b/scripts/geodata/language_id/disambiguation.py
@@ -17,7 +17,21 @@ from geodata.i18n.unicode_properties import get_chars_by_script, get_script_lang
 from address_normalizer.text.normalize import PhraseFilter
 from address_normalizer.text.tokenize import *
 
-WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es'])
+WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])
+
+# For toponyms, we want to limit the countries we consider to those where
+# the place names can themselves be considered training examples of the language
+WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
+    'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
+    'fr': set(['fr']),
+    'it': set(['it']),
+    'de': set(['de', 'at']),
+    'nl': set(['nl']),
+    'es': set(['es', 'ar', 'mx', 'cl', 'co', 'pe', 'ec', 'pr', 'uy',
+               've', 'cu', 'do', 'bo', 'gt', 'cr', 'py', 'sv', 'pa',
+               'ni', 'hn']),
+    'pt': set(['pt', 'br']),
+}
 
 DICTIONARIES_DIR = os.path.join(DATA_DIR, 'dictionaries')
 
diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py
index 191bbeff..b1c46fe7 100644
--- a/scripts/geodata/osm/osm_address_training_data.py
+++ b/scripts/geodata/osm/osm_address_training_data.py
@@ -23,7 +23,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python')))
 
 from address_normalizer.text.tokenize import *
-from geodata.language_id.disambiguation import street_types_gazetteer, disambiguate_language, WELL_REPRESENTED_LANGUAGES, UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE
+from geodata.language_id.disambiguation import *
 from geodata.language_id.polygon_lookup import country_and_languages
 from geodata.i18n.languages import *
 from geodata.polygons.language_polys import *
@@ -644,9 +644,6 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
 
         num_langs = len(candidate_languages)
         default_langs = set([l for l, default in official.iteritems() if default])
-        num_defaults = len(default_langs)
-
-        defaults_well_represented = all((d in WELL_REPRESENTED_LANGUAGES for d in default_langs))
 
         regional_langs = list(chain(*(p['languages'] for p in language_props if p.get('admin_level', 0) > 0)))
 
@@ -654,12 +651,14 @@
         if len(official) > 0:
             top_lang = official.iterkeys().next()
 
+            # E.g. Hindi in India, Urdu in Pakistan
             if top_lang is not None and top_lang not in WELL_REPRESENTED_LANGUAGES and len(default_langs) > 1:
                 default_langs -= WELL_REPRESENTED_LANGUAGES
-            elif len(default_langs & WELL_REPRESENTED_LANGUAGES) > 1:
-                continue
 
-        valid_languages = (set([l['lang'] for l in candidate_languages]) - WELL_REPRESENTED_LANGUAGES) | default_langs
+        valid_languages = set([l['lang'] for l in candidate_languages])
+        valid_languages -= set([lang for lang in valid_languages if lang in WELL_REPRESENTED_LANGUAGES and country not in WELL_REPRESENTED_LANGUAGE_COUNTRIES[lang]])
+
+        valid_languages |= default_langs
 
         if not valid_languages:
             continue
@@ -684,7 +683,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
                 have_qualified_names = True
                 name_language[lang].append(v)
 
-        if not have_qualified_names and len(regional_langs) <= 1 and 'name' in value and (len(all_langs) == 1 or (num_langs == 1 and not defaults_well_represented)):
+        if not have_qualified_names and len(regional_langs) <= 1 and 'name' in value and num_langs == 1:
             name_language[candidate_languages[0]['lang']].append(value['name'])
 
         for k, v in name_language.iteritems():
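
For reference, the filtering rule introduced in the second file can be read in
isolation as the sketch below. It mirrors the new valid_languages logic; the
helper name valid_toponym_languages, the trimmed country map, and the sample
inputs are illustrative, not part of the patch.

# Standalone sketch of the filtering rule above. The maps are trimmed
# copies of the ones the patch adds to disambiguation.py; the function
# name and sample inputs are hypothetical, for illustration only.

WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt'])

# Trimmed: the patch defines entries for all seven languages.
WELL_REPRESENTED_LANGUAGE_COUNTRIES = {
    'en': set(['gb', 'us', 'ca', 'au', 'nz', 'ie']),
    'de': set(['de', 'at']),
}


def valid_toponym_languages(candidate_languages, default_langs, country):
    # Start from every candidate language for the polygon containing the place
    valid_languages = set([l['lang'] for l in candidate_languages])
    # Drop a well-represented language unless the country is one where bare
    # place names can themselves be treated as examples of that language
    valid_languages -= set([lang for lang in valid_languages
                            if lang in WELL_REPRESENTED_LANGUAGES
                            and country not in WELL_REPRESENTED_LANGUAGE_COUNTRIES[lang]])
    # Default (official) languages always survive the filter
    return valid_languages | default_langs


# English names in Canada are kept; English names in India are dropped,
# while Hindi, the default language there, is retained.
assert valid_toponym_languages([{'lang': 'en'}], set(['en']), 'ca') == set(['en'])
assert valid_toponym_languages([{'lang': 'en'}, {'lang': 'hi'}], set(['hi']), 'in') == set(['hi'])

The net effect is that, e.g., an unqualified name tag in Spain can now count as
Spanish training data, while an unqualified name in India is no longer assumed
to be English.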