[language_id] Adding formatted addresses and toponyms to language training data

This commit is contained in:
Al
2015-09-04 01:46:49 -04:00
parent 4ebdca0ea7
commit 6a20ce5e85

View File

@@ -7,7 +7,7 @@ import sys
# Absolute directory that contains this script, with symlinks resolved.
this_dir = os.path.realpath(os.path.dirname(__file__))
# Add the directory two levels above this script to the import path
# (presumably the root that holds the ``geodata`` package imported below).
# The path is anchored at this_dir rather than at the current working
# directory, so the import works no matter where the script is launched from.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.osm.osm_address_training_data import WAYS_LANGUAGE_DATA_FILENAME, ADDRESS_LANGUAGE_DATA_FILENAME
from geodata.osm.osm_address_training_data import WAYS_LANGUAGE_DATA_FILENAME, ADDRESS_LANGUAGE_DATA_FILENAME, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME, TOPONYM_LANGUAGE_DATA_FILENAME
# File name (joined onto osm_dir by the caller) of the combined
# language training data, built by concatenating the per-source files.
LANGUAGES_ALL_FILE = 'languages.all'
# File name of the shuffled copy of the combined data produced with `shuf`.
# NOTE(review): "LANGAUGES" is a misspelling of "LANGUAGES"; the name is kept
# as-is because it is referenced elsewhere in this module.
LANGAUGES_RANDOM_FILE = 'languages.random'
@@ -21,14 +21,24 @@ def create_language_training_data(osm_dir, split_data=True, train_split=0.8, cv_
ways_path = os.path.join(osm_dir, WAYS_LANGUAGE_DATA_FILENAME)
addresses_path = os.path.join(osm_dir, ADDRESS_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', ways_path, '>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(ways_path))
addresses_path = os.path.join(osm_dir, ADDRESS_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', addresses_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(addresses_path))
formatted_path = os.path.join(osm_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME)
if os.system(' '.join(['cat', formatted_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(formatted_path))
toponyms_path = os.path.join(osm_dir, TOPONYM_LANGUAGE_DATA_FILENAME)
if os.system(' '.join(['cat', toponyms_path, '>>', language_all_path])) != 0:
raise SystemError('Could not find {}'.format(toponyms_path))
languages_random_path = os.path.join(osm_dir, LANGAUGES_RANDOM_FILE)
if os.system(u' '.join(['shuf', '--random-source=/dev/urandom', language_all_path, '>', languages_random_path])) != 0: