[osm] Only adding country default language toponyms to training data

This commit is contained in:
Al
2015-09-03 13:44:41 -04:00
parent 11c01f64d2
commit 23633e95dd

View File

@@ -10,6 +10,7 @@ import sys
import tempfile
import ujson as json
import yaml
import HTMLParser
from collections import defaultdict, OrderedDict
from lxml import etree
@@ -388,19 +389,20 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
ambiguous_already_seen = set()
for k, v in value.iteritems():
if k.startswith(tag_prefix + ':') and v not in ambiguous_alternatives:
norm = normalize_osm_name_tag(k)
norm_sans_script = normalize_osm_name_tag(k, script=True)
if norm in languages or norm_sans_script in languages:
name_language[norm].append(v)
elif v in ambiguous_alternatives and v not in ambiguous_already_seen:
langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
lang = disambiguate_language(v, langs)
if k.startswith(tag_prefix + ':'):
if v not in ambiguous_alternatives:
norm = normalize_osm_name_tag(k)
norm_sans_script = normalize_osm_name_tag(k, script=True)
if norm in languages or norm_sans_script in languages:
name_language[norm].append(v)
elif v not in ambiguous_already_seen:
langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
lang = disambiguate_language(v, langs)
if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
name_language[lang].append(v)
if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
name_language[lang].append(v)
ambiguous_already_seen.add(v)
ambiguous_already_seen.add(v)
elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component:
if num_langs == 1:
name_language[candidate_languages[0]['lang']].append(v)
@@ -549,23 +551,93 @@ def build_address_format_training_data_limited(language_rtree, infile, out_dir):
print 'did', i, 'formatted addresses'
def build_toponym_data(language_rtree, infile, out_dir):
# Matches a title that ends in a parenthesized qualifier, e.g.
# "Springfield (Illinois)". Group 1 is the text before the parentheses
# (ending on a non-whitespace character), group 2 the text inside them.
# Written as a raw string so the backslash escapes reach the regex engine
# untouched; the compiled pattern is the same as before.
apposition_regex = re.compile(r'(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)

# Shared parser instance, used only for its unescape() method.
html_parser = HTMLParser.HTMLParser()


def normalize_wikipedia_title(title):
    """Normalize a Wikipedia article title into a plain name.

    Steps, in order:
      1. pass the title through safe_decode (presumably yields unicode;
         helper is defined elsewhere -- confirm),
      2. resolve HTML entities via html_parser.unescape,
      3. percent-decode via urllib.unquote_plus,
      4. replace underscores with spaces and strip surrounding whitespace,
      5. drop a trailing parenthesized qualifier, if present.

    The qualifier is removed *after* decoding so that encoded forms such as
    "Foo_%28bar%29" or "Foo_(bar)_" are handled the same way as "Foo (bar)".
    Previously the regex ran on the raw title and missed those cases.

    :param title: raw title text (e.g. the part after "lang:" in a tag value)
    :return: the normalized title
    """
    title = safe_decode(title)
    title = html_parser.unescape(title)
    # NOTE(review): unquote_plus also turns a literal '+' into a space, and
    # on Python 2 percent-decoding a unicode string may not reassemble
    # multi-byte UTF-8 sequences -- confirm this is acceptable for the
    # titles seen in practice.
    title = urllib.unquote_plus(title)
    title = title.replace(u'_', u' ').strip()

    match = apposition_regex.match(title)
    if match:
        # Keep only the part before "(...)".
        title = match.group(1).strip()
    return title
def build_toponym_training_data(language_rtree, infile, out_dir):
i = 0
f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for key, value in parse_osm(infile):
country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name')
if not name_language:
try:
latitude, longitude = latlon_to_floats(value['lat'], value['lon'])
except Exception:
continue
country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
if not (country and candidate_languages):
continue
name_language = defaultdict(list)
num_langs = len(candidate_languages)
default_langs = set([l['lang'] for l in candidate_languages if l.get('default')])
num_defaults = len(default_langs)
valid_languages = set([l['lang'] for l in candidate_languages])
have_alternate_names = False
for k, v in value.iteritems():
if k.startswith('wikipedia'):
lang = k.rsplit(':', 1)[-1].lower()
splits = v.split(':', 1)
value_lang = splits[0].lower()
if len(splits) > 1 and value_lang in languages:
lang = value_lang
title = splits[1]
if lang not in languages:
lang = None
continue
have_alternate_names = True
title = normalize_wikipedia_title(title)
name_language[lang].append(title)
continue
if not k.startswith('name:'):
continue
norm = normalize_osm_name_tag(k)
norm_sans_script = normalize_osm_name_tag(k, script=True)
if norm in languages:
lang = norm
elif norm_sans_script in languages:
lang = norm_sans_script
else:
continue
if lang in valid_languages:
have_alternate_names = True
name_language[lang].append(v)
if not have_alternate_names and num_langs == 1 and normalize_osm_name_tag(k, script=True) == 'name':
name_language[candidate_languages[0]['lang']].append(v)
for k, v in name_language.iteritems():
for s in v:
s = s.strip()
if not s:
continue
if k in languages:
writer.writerow((k, country, tsv_string(s)))
writer.writerow((k, country, tsv_string(s)))
if i % 1000 == 0 and i > 0:
print 'did', i, 'toponyms'
i += 1
@@ -684,7 +756,7 @@ if __name__ == '__main__':
if args.streets_file:
build_ways_training_data(language_rtree, args.streets_file, args.out_dir)
if args.borders_file:
build_toponym_data(language_rtree, args.borders_file, args.out_dir)
build_toponym_training_data(language_rtree, args.borders_file, args.out_dir)
if args.address_file and not args.format_only and not args.limited_addresses:
build_address_training_data(language_rtree, args.address_file, args.out_dir)
if args.address_file and args.format_only: