[osm] Separating tagged from untagged output
This commit is contained in:
@@ -490,16 +490,39 @@ def strip_keys(value, ignore_keys):
|
|||||||
value.pop(key, None)
|
value.pop(key, None)
|
||||||
|
|
||||||
|
|
||||||
def build_address_format_training_data(language_rtree, infile, out_dir):
|
def build_address_format_training_data(language_rtree, infile, out_dir, tag_components=True):
|
||||||
|
'''
|
||||||
|
Creates formatted address training data for supervised sequence labeling (or potentially
|
||||||
|
for unsupervised learning e.g. for word vectors) using addr:* tags in OSM. The tagged
|
||||||
|
version produces a TSV file that looks like:
|
||||||
|
|
||||||
|
cs cz Gorkého/road ev.2459/house_number | 40004/postcode Trmice/city | CZ/country
|
||||||
|
|
||||||
|
The field structure is similar to other training data created by this script i.e.
|
||||||
|
{language, country, data}. The data field here is a sequence of labeled tokens similar
|
||||||
|
to what we might see in part-of-speech tagging.
|
||||||
|
|
||||||
|
This format uses a special character "|" to denote possible breaks in the input (comma, newline).
|
||||||
|
This information can potentially be used downstream by the sequence model as these
|
||||||
|
breaks may be present at prediction time.
|
||||||
|
|
||||||
|
For the untagged version, lines simply look like:
|
||||||
|
|
||||||
|
The Dignity | 363 Regents Park Road | London N3 1DH
|
||||||
|
|
||||||
|
This may be useful in learning word representations, statistical phrases, morphology
|
||||||
|
or other models requiring only the sequence of words.
|
||||||
|
'''
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
formatter = AddressFormatter()
|
formatter = AddressFormatter()
|
||||||
|
|
||||||
formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w')
|
if tag_components:
|
||||||
formatted_writer = csv.writer(formatted_file, 'tsv_no_quote')
|
formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w')
|
||||||
|
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
|
||||||
formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w')
|
else:
|
||||||
formatted_tagged_writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
|
formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w')
|
||||||
|
writer = csv.writer(formatted_file, 'tsv_no_quote')
|
||||||
|
|
||||||
remove_keys = OSM_IGNORE_KEYS
|
remove_keys = OSM_IGNORE_KEYS
|
||||||
|
|
||||||
@@ -509,24 +532,29 @@ def build_address_format_training_data(language_rtree, infile, out_dir):
|
|||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
country, default_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
|
country, candidate_languages, language_props = country_and_languages(language_rtree, latitude, longitude)
|
||||||
if not (country and default_languages):
|
if not (country and candidate_languages):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for key in remove_keys:
|
for key in remove_keys:
|
||||||
_ = value.pop(key, None)
|
_ = value.pop(key, None)
|
||||||
|
|
||||||
formatted_address_tagged = formatter.format_address(country, value)
|
if len(candidate_languages) == 1:
|
||||||
formatted_address_untagged = formatter.format_address(country, value, tag_components=False)
|
language = candidate_languages[0]['lang']
|
||||||
if formatted_address_tagged is not None:
|
else:
|
||||||
formatted_address_tagged = tsv_string(formatted_address_tagged)
|
language = disambiguate_language(v, [(l['lang'], l['default']) for l in candidate_languages])
|
||||||
formatted_tagged_writer.writerow((default_languages[0]['lang'], country, formatted_address_tagged))
|
|
||||||
|
|
||||||
if formatted_address_untagged is not None:
|
formatted_address = formatter.format_address(country, value, tag_components=tag_components)
|
||||||
formatted_address_untagged = tsv_string(formatted_address_untagged)
|
if formatted_address is not None:
|
||||||
formatted_writer.writerow((default_languages[0]['lang'], country, formatted_address_untagged))
|
formatted_address = tsv_string(formatted_address)
|
||||||
|
if tag_components:
|
||||||
|
row = (language, country, formatted_address)
|
||||||
|
else:
|
||||||
|
row = (formatted_address,)
|
||||||
|
|
||||||
if formatted_address_tagged is not None or formatted_address_untagged is not None:
|
writer.writerow(row)
|
||||||
|
|
||||||
|
if formatted_address is not None:
|
||||||
i += 1
|
i += 1
|
||||||
if i % 1000 == 0 and i > 0:
|
if i % 1000 == 0 and i > 0:
|
||||||
print 'did', i, 'formatted addresses'
|
print 'did', i, 'formatted addresses'
|
||||||
|
|||||||
Reference in New Issue
Block a user