From 390271525896a658a951dd233592518d978d4dd8 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 22 Aug 2015 14:11:44 -0400 Subject: [PATCH] [osm] Some countries like Lebanon in OSM will list the same address under two languages (French/English), which creates an unreasonable task for a linear classifier, so running disambiguation in those cases --- scripts/geodata/language_id/disambiguation.py | 2 +- .../geodata/osm/osm_address_training_data.py | 38 +++++++++++++------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index ebc01858..8e2d8f74 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -113,7 +113,7 @@ AMBIGUOUS_LANGUAGE = 'xxx' def disambiguate_language(text, languages): - valid_languages = OrderedDict([(l['lang'], l['default']) for l in languages]) + valid_languages = OrderedDict(languages) tokens = tokenize(safe_decode(text).replace(u'-', u' ').lower()) current_lang = None diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index c3af0233..5dd87c4e 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -331,9 +331,6 @@ def latlon_to_floats(latitude, longitude): return float(latitude), float(longitude) -newline_regex = re.compile('\r\n|\r|\n') - - def get_language_names(language_rtree, key, value, tag_prefix='name'): if not ('lat' in value and 'lon' in value): return None, None @@ -355,8 +352,22 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): default_langs = set([l['lang'] for l in candidate_languages if l.get('default')]) num_defaults = len(default_langs) name_language = defaultdict(list) - has_alternate_names = any((k.startswith(tag_prefix + ':') and normalize_osm_name_tag(k, script=True) - in languages for k, v in value.iteritems())) + + 
alternate_langs = [] + has_alternate_names = len(alternate_langs) > 0 + + alternate_langs = [] + + equivalent_alternatives = defaultdict(list) + for k, v in value.iteritems(): + if k.startswith(tag_prefix + ':') and normalize_osm_name_tag(k, script=True) in languages: + lang = k.rsplit(':', 1)[-1] + alternate_langs.append((lang, v)) + equivalent_alternatives[v].append(lang) + + has_alternate_names = len(alternate_langs) + # Some countries like Lebanon list the same name under two language tags (e.g. French/English), so those values are disambiguated below + ambiguous_alternatives = set([k for k, v in equivalent_alternatives.iteritems() if len(v) > 1]) regional_defaults = 0 country_defaults = 0 @@ -370,17 +381,26 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): country_defaults += sum((1 for lang in p['languages'] if lang.get('default'))) country_langs |= set([l['lang'] for l in p['languages']]) + ambiguous_already_seen = set() + for k, v in value.iteritems(): - if k.startswith(tag_prefix + ':'): + if k.startswith(tag_prefix + ':') and v not in ambiguous_alternatives: norm = normalize_osm_name_tag(k) norm_sans_script = normalize_osm_name_tag(k, script=True) if norm in languages or norm_sans_script in languages: name_language[norm].append(v) + elif v in ambiguous_alternatives and v not in ambiguous_already_seen: + lang = disambiguate_language(v, [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]) + + if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE: + name_language[lang].append(v) + + ambiguous_already_seen.add(v) elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component: if num_langs == 1: name_language[candidate_languages[0]['lang']].append(v) else: - lang = disambiguate_language(v, candidate_languages) + lang = disambiguate_language(v, [(l['lang'], l['default']) for l in candidate_languages]) default_lang = candidate_languages[0]['lang'] if lang == AMBIGUOUS_LANGUAGE: @@ -397,10 +417,6 @@ def 
get_language_names(language_rtree, key, value, tag_prefix='name'): return country, name_language -def tsv_string(s): - return safe_encode(newline_regex.sub(u', ', safe_decode(s).strip()).replace(u'\t', u' ')) - - def build_ways_training_data(language_rtree, infile, out_dir): i = 0 f = open(os.path.join(out_dir, WAYS_LANGUAGE_DATA_FILENAME), 'w')