[osm] Only adding country default language toponyms to training data

This commit is contained in:
Al
2015-09-03 13:44:41 -04:00
parent 11c01f64d2
commit 23633e95dd

View File

@@ -10,6 +10,7 @@ import sys
import tempfile import tempfile
import ujson as json import ujson as json
import yaml import yaml
import HTMLParser
from collections import defaultdict, OrderedDict from collections import defaultdict, OrderedDict
from lxml import etree from lxml import etree
@@ -388,19 +389,20 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
ambiguous_already_seen = set() ambiguous_already_seen = set()
for k, v in value.iteritems(): for k, v in value.iteritems():
if k.startswith(tag_prefix + ':') and v not in ambiguous_alternatives: if k.startswith(tag_prefix + ':'):
norm = normalize_osm_name_tag(k) if v not in ambiguous_alternatives:
norm_sans_script = normalize_osm_name_tag(k, script=True) norm = normalize_osm_name_tag(k)
if norm in languages or norm_sans_script in languages: norm_sans_script = normalize_osm_name_tag(k, script=True)
name_language[norm].append(v) if norm in languages or norm_sans_script in languages:
elif v in ambiguous_alternatives and v not in ambiguous_already_seen: name_language[norm].append(v)
langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]] elif v not in ambiguous_already_seen:
lang = disambiguate_language(v, langs) langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
lang = disambiguate_language(v, langs)
if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE: if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
name_language[lang].append(v) name_language[lang].append(v)
ambiguous_already_seen.add(v) ambiguous_already_seen.add(v)
elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component: elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component:
if num_langs == 1: if num_langs == 1:
name_language[candidate_languages[0]['lang']].append(v) name_language[candidate_languages[0]['lang']].append(v)
@@ -549,23 +551,93 @@ def build_address_format_training_data_limited(language_rtree, infile, out_dir):
print 'did', i, 'formatted addresses' print 'did', i, 'formatted addresses'
def build_toponym_data(language_rtree, infile, out_dir): apposition_regex = re.compile('(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)
html_parser = HTMLParser.HTMLParser()
def normalize_wikipedia_title(title):
match = apposition_regex.match(title)
if match:
title = match.group(1)
title = safe_decode(title)
title = html_parser.unescape(title)
title = urllib.unquote_plus(title)
return title.replace(u'_', u' ').strip()
def build_toponym_training_data(language_rtree, infile, out_dir):
i = 0 i = 0
f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w') f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote') writer = csv.writer(f, 'tsv_no_quote')
for key, value in parse_osm(infile): for key, value in parse_osm(infile):
country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name') try:
if not name_language: latitude, longitude = latlon_to_floats(value['lat'], value['lon'])
except Exception:
continue continue
country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
if not (country and candidate_languages):
continue
name_language = defaultdict(list)
num_langs = len(candidate_languages)
default_langs = set([l['lang'] for l in candidate_languages if l.get('default')])
num_defaults = len(default_langs)
valid_languages = set([l['lang'] for l in candidate_languages])
have_alternate_names = False
for k, v in value.iteritems():
if k.startswith('wikipedia'):
lang = k.rsplit(':', 1)[-1].lower()
splits = v.split(':', 1)
value_lang = splits[0].lower()
if len(splits) > 1 and value_lang in languages:
lang = value_lang
title = splits[1]
if lang not in languages:
lang = None
continue
have_alternate_names = True
title = normalize_wikipedia_title(title)
name_language[lang].append(title)
continue
if not k.startswith('name:'):
continue
norm = normalize_osm_name_tag(k)
norm_sans_script = normalize_osm_name_tag(k, script=True)
if norm in languages:
lang = norm
elif norm_sans_script in languages:
lang = norm_sans_script
else:
continue
if lang in valid_languages:
have_alternate_names = True
name_language[lang].append(v)
if not have_alternate_names and num_langs == 1 and normalize_osm_name_tag(k, script=True) == 'name':
name_language[candidate_languages[0]['lang']].append(v)
for k, v in name_language.iteritems(): for k, v in name_language.iteritems():
for s in v: for s in v:
s = s.strip() s = s.strip()
if not s: if not s:
continue continue
if k in languages: writer.writerow((k, country, tsv_string(s)))
writer.writerow((k, country, tsv_string(s)))
if i % 1000 == 0 and i > 0: if i % 1000 == 0 and i > 0:
print 'did', i, 'toponyms' print 'did', i, 'toponyms'
i += 1 i += 1
@@ -684,7 +756,7 @@ if __name__ == '__main__':
if args.streets_file: if args.streets_file:
build_ways_training_data(language_rtree, args.streets_file, args.out_dir) build_ways_training_data(language_rtree, args.streets_file, args.out_dir)
if args.borders_file: if args.borders_file:
build_toponym_data(language_rtree, args.borders_file, args.out_dir) build_toponym_training_data(language_rtree, args.borders_file, args.out_dir)
if args.address_file and not args.format_only and not args.limited_addresses: if args.address_file and not args.format_only and not args.limited_addresses:
build_address_training_data(language_rtree, args.address_file, args.out_dir) build_address_training_data(language_rtree, args.address_file, args.out_dir)
if args.address_file and args.format_only: if args.address_file and args.format_only: