[osm] Only adding country default language toponyms to training data
This commit is contained in:
@@ -10,6 +10,7 @@ import sys
|
|||||||
import tempfile
|
import tempfile
|
||||||
import ujson as json
|
import ujson as json
|
||||||
import yaml
|
import yaml
|
||||||
|
import HTMLParser
|
||||||
|
|
||||||
from collections import defaultdict, OrderedDict
|
from collections import defaultdict, OrderedDict
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
@@ -388,19 +389,20 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
|
|||||||
ambiguous_already_seen = set()
|
ambiguous_already_seen = set()
|
||||||
|
|
||||||
for k, v in value.iteritems():
|
for k, v in value.iteritems():
|
||||||
if k.startswith(tag_prefix + ':') and v not in ambiguous_alternatives:
|
if k.startswith(tag_prefix + ':'):
|
||||||
norm = normalize_osm_name_tag(k)
|
if v not in ambiguous_alternatives:
|
||||||
norm_sans_script = normalize_osm_name_tag(k, script=True)
|
norm = normalize_osm_name_tag(k)
|
||||||
if norm in languages or norm_sans_script in languages:
|
norm_sans_script = normalize_osm_name_tag(k, script=True)
|
||||||
name_language[norm].append(v)
|
if norm in languages or norm_sans_script in languages:
|
||||||
elif v in ambiguous_alternatives and v not in ambiguous_already_seen:
|
name_language[norm].append(v)
|
||||||
langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
|
elif v not in ambiguous_already_seen:
|
||||||
lang = disambiguate_language(v, langs)
|
langs = [(lang, lang in default_langs) for lang in equivalent_alternatives[v]]
|
||||||
|
lang = disambiguate_language(v, langs)
|
||||||
|
|
||||||
if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
|
if lang != AMBIGUOUS_LANGUAGE and lang != UNKNOWN_LANGUAGE:
|
||||||
name_language[lang].append(v)
|
name_language[lang].append(v)
|
||||||
|
|
||||||
ambiguous_already_seen.add(v)
|
ambiguous_already_seen.add(v)
|
||||||
elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component:
|
elif not has_alternate_names and k.startswith(tag_first_component) and (has_colon or ':' not in k) and normalize_osm_name_tag(k, script=True) == tag_last_component:
|
||||||
if num_langs == 1:
|
if num_langs == 1:
|
||||||
name_language[candidate_languages[0]['lang']].append(v)
|
name_language[candidate_languages[0]['lang']].append(v)
|
||||||
@@ -549,23 +551,93 @@ def build_address_format_training_data_limited(language_rtree, infile, out_dir):
|
|||||||
print 'did', i, 'formatted addresses'
|
print 'did', i, 'formatted addresses'
|
||||||
|
|
||||||
|
|
||||||
def build_toponym_data(language_rtree, infile, out_dir):
|
apposition_regex = re.compile('(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)
|
||||||
|
|
||||||
|
html_parser = HTMLParser.HTMLParser()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_wikipedia_title(title):
|
||||||
|
match = apposition_regex.match(title)
|
||||||
|
if match:
|
||||||
|
title = match.group(1)
|
||||||
|
|
||||||
|
title = safe_decode(title)
|
||||||
|
title = html_parser.unescape(title)
|
||||||
|
title = urllib.unquote_plus(title)
|
||||||
|
|
||||||
|
return title.replace(u'_', u' ').strip()
|
||||||
|
|
||||||
|
|
||||||
|
def build_toponym_training_data(language_rtree, infile, out_dir):
|
||||||
i = 0
|
i = 0
|
||||||
f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
|
f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
|
||||||
writer = csv.writer(f, 'tsv_no_quote')
|
writer = csv.writer(f, 'tsv_no_quote')
|
||||||
|
|
||||||
for key, value in parse_osm(infile):
|
for key, value in parse_osm(infile):
|
||||||
country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name')
|
try:
|
||||||
if not name_language:
|
latitude, longitude = latlon_to_floats(value['lat'], value['lon'])
|
||||||
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
|
||||||
|
if not (country and candidate_languages):
|
||||||
|
continue
|
||||||
|
|
||||||
|
name_language = defaultdict(list)
|
||||||
|
|
||||||
|
num_langs = len(candidate_languages)
|
||||||
|
default_langs = set([l['lang'] for l in candidate_languages if l.get('default')])
|
||||||
|
num_defaults = len(default_langs)
|
||||||
|
|
||||||
|
valid_languages = set([l['lang'] for l in candidate_languages])
|
||||||
|
|
||||||
|
have_alternate_names = False
|
||||||
|
|
||||||
|
for k, v in value.iteritems():
|
||||||
|
if k.startswith('wikipedia'):
|
||||||
|
lang = k.rsplit(':', 1)[-1].lower()
|
||||||
|
|
||||||
|
splits = v.split(':', 1)
|
||||||
|
value_lang = splits[0].lower()
|
||||||
|
if len(splits) > 1 and value_lang in languages:
|
||||||
|
lang = value_lang
|
||||||
|
title = splits[1]
|
||||||
|
|
||||||
|
if lang not in languages:
|
||||||
|
lang = None
|
||||||
|
continue
|
||||||
|
|
||||||
|
have_alternate_names = True
|
||||||
|
title = normalize_wikipedia_title(title)
|
||||||
|
name_language[lang].append(title)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not k.startswith('name:'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
norm = normalize_osm_name_tag(k)
|
||||||
|
norm_sans_script = normalize_osm_name_tag(k, script=True)
|
||||||
|
|
||||||
|
if norm in languages:
|
||||||
|
lang = norm
|
||||||
|
elif norm_sans_script in languages:
|
||||||
|
lang = norm_sans_script
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if lang in valid_languages:
|
||||||
|
have_alternate_names = True
|
||||||
|
name_language[lang].append(v)
|
||||||
|
|
||||||
|
if not have_alternate_names and num_langs == 1 and normalize_osm_name_tag(k, script=True) == 'name':
|
||||||
|
name_language[candidate_languages[0]['lang']].append(v)
|
||||||
|
|
||||||
for k, v in name_language.iteritems():
|
for k, v in name_language.iteritems():
|
||||||
for s in v:
|
for s in v:
|
||||||
s = s.strip()
|
s = s.strip()
|
||||||
if not s:
|
if not s:
|
||||||
continue
|
continue
|
||||||
if k in languages:
|
writer.writerow((k, country, tsv_string(s)))
|
||||||
writer.writerow((k, country, tsv_string(s)))
|
|
||||||
if i % 1000 == 0 and i > 0:
|
if i % 1000 == 0 and i > 0:
|
||||||
print 'did', i, 'toponyms'
|
print 'did', i, 'toponyms'
|
||||||
i += 1
|
i += 1
|
||||||
@@ -684,7 +756,7 @@ if __name__ == '__main__':
|
|||||||
if args.streets_file:
|
if args.streets_file:
|
||||||
build_ways_training_data(language_rtree, args.streets_file, args.out_dir)
|
build_ways_training_data(language_rtree, args.streets_file, args.out_dir)
|
||||||
if args.borders_file:
|
if args.borders_file:
|
||||||
build_toponym_data(language_rtree, args.borders_file, args.out_dir)
|
build_toponym_training_data(language_rtree, args.borders_file, args.out_dir)
|
||||||
if args.address_file and not args.format_only and not args.limited_addresses:
|
if args.address_file and not args.format_only and not args.limited_addresses:
|
||||||
build_address_training_data(language_rtree, args.address_file, args.out_dir)
|
build_address_training_data(language_rtree, args.address_file, args.out_dir)
|
||||||
if args.address_file and args.format_only:
|
if args.address_file and args.format_only:
|
||||||
|
|||||||
Reference in New Issue
Block a user