[osm] Adding a toponym language training set using planet-borders.osm (all admin borders)

This commit is contained in:
Al
2015-09-03 10:19:11 -04:00
parent db57855c95
commit 27eb4e4aed

View File

@@ -37,15 +37,23 @@ FORMATTER_GIT_REPO = 'https://github.com/openvenues/address-formatting'
# Large numeric offsets for ways and relations; the relation offset is
# twice the way offset (presumably added to OSM ids so the three element
# kinds cannot collide -- confirm against the code that uses them)
WAY_OFFSET = 10 ** 15
RELATION_OFFSET = 2 * WAY_OFFSET

# Default names of the planet extracts this script consumes
PLANET_ADDRESSES_INPUT_FILE = 'planet-addresses.osm'
PLANET_WAYS_INPUT_FILE = 'planet-ways.osm'
PLANET_VENUES_INPUT_FILE = 'planet-venues.osm'
PLANET_BORDERS_INPUT_FILE = 'planet-borders.osm'

# OSM element type names: ways and relations only, then all three types
WAYS_RELATIONS = set(('way', 'relation'))
ALL_OSM_TAGS = WAYS_RELATIONS | set(('node',))

# Names of the TSV files this script writes
WAYS_LANGUAGE_DATA_FILENAME = 'streets_by_language.tsv'
ADDRESS_LANGUAGE_DATA_FILENAME = 'address_streets_by_language.tsv'
ADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'formatted_addresses_tagged.tsv'
ADDRESS_FORMAT_DATA_FILENAME = 'formatted_addresses.tsv'
ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv'
TOPONYM_LANGUAGE_DATA_FILENAME = 'toponyms_by_language.tsv'
class OSMField(object):
def __init__(self, name, c_constant, alternates=None):
@@ -272,8 +280,6 @@ def normalize_osm_name_tag(tag, script=False):
return norm.split('_', 1)[0]
WAYS_LANGUAGE_DATA_FILENAME = 'streets_by_language.tsv'
# Leading run of characters that are neither ASCII digits nor hyphens.
# Raw string so that "\-" reaches the regex engine verbatim instead of
# being an invalid string-literal escape.
beginning_re = re.compile(r'^[^0-9\-]+', re.UNICODE)
# Trailing run of characters that are not ASCII digits.
end_re = re.compile(r'[^0-9]+$', re.UNICODE)
@@ -439,11 +445,6 @@ def build_ways_training_data(language_rtree, infile, out_dir):
f.close()
ADDRESS_LANGUAGE_DATA_FILENAME = 'address_streets_by_language.tsv'
ADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'formatted_addresses_tagged.tsv'
ADDRESS_FORMAT_DATA_FILENAME = 'formatted_addresses.tsv'
def build_address_format_training_data(language_rtree, infile, out_dir):
i = 0
@@ -481,7 +482,6 @@ def build_address_format_training_data(language_rtree, infile, out_dir):
print 'did', i, 'formatted addresses'
ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv'
NAME_KEYS = (
'name',
'addr:housename',
@@ -552,6 +552,30 @@ def build_address_format_training_data_limited(language_rtree, infile, out_dir):
print 'did', i, 'formatted addresses'
def build_toponym_data(language_rtree, infile, out_dir):
i = 0
f = open(os.path.join(out_dir, TOPONYM_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for key, value in parse_osm(infile):
country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name')
if not name_language:
continue
for k, v in name_language.iteritems():
for s in v:
s = s.strip()
if not s:
continue
if k in languages:
writer.writerow((k, country, tsv_string(s)))
if i % 1000 == 0 and i > 0:
print 'did', i, 'toponyms'
i += 1
f.close()
def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
i = 0
f = open(os.path.join(out_dir, ADDRESS_LANGUAGE_DATA_FILENAME), 'w')
@@ -626,6 +650,9 @@ if __name__ == '__main__':
parser.add_argument('-v', '--venues-file',
help='Path to planet-venues.osm')
# Optional borders extract; when given, the toponym training set is built.
# The keyword must be `help`: argparse rejects unknown keywords such as
# `helpf` with a TypeError as soon as the parser is constructed.
parser.add_argument('-b', '--borders-file',
                    help='Path to planet-borders.osm')
parser.add_argument('-f', '--format-only',
action='store_true',
default=False,
@@ -659,6 +686,8 @@ if __name__ == '__main__':
# Can parallelize
if args.streets_file:
build_ways_training_data(language_rtree, args.streets_file, args.out_dir)
if args.borders_file:
build_toponym_data(language_rtree, args.borders_file, args.out_dir)
if args.address_file and not args.format_only and not args.limited_addresses:
build_address_training_data(language_rtree, args.address_file, args.out_dir)
if args.address_file and args.format_only: