diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index 6c34bbf5..0749ba57 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -40,6 +40,8 @@ ADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'formatted_addresses_tagged.tsv' ADDRESS_FORMAT_DATA_FILENAME = 'formatted_addresses.tsv' ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv' +ALL_LANGUAGES = 'all' + class OSMAddressFormatter(object): aliases = Aliases( @@ -326,6 +328,8 @@ class OSMAddressFormatter(object): name = canonical if random.random() < sample_probability: names = address_config.sample_phrases.get((language, dictionary), {}).get(canonical, []) + if not names: + names = address_config.sample_phrases.get((ALL_LANGUAGES, dictionary), {}).get(canonical, []) if names: name = random.choice(names) phrase = Chain.phrase(name, language, country) diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 7458ffad..2469eae0 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -172,9 +172,6 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): return country, name_language -ALL_LANGUAGES = 'all' - - def build_ways_training_data(language_rtree, infile, out_dir, abbreviate_streets=True): ''' Creates a training set for language classification using most OSM ways