Add country-aware affix replacement through NameAffixes.replace_affixes.

Summary of what the hunks below do:
  * components.py: replace_name_affixes() gains an optional country
    argument, draws the random replacement check before doing any work,
    and calls name_affixes.replace_affixes() instead of chaining
    replace_suffixes()/replace_prefixes() itself.
  * geoplanet_training_data.py: the variant-alias query also selects
    p.country_code, which is lower-cased and passed as country= to
    replace_affixes(); the usage message names the right script.
  * normalization.py: adds NameAffixes.replace_affixes(), which applies
    replace_suffixes() and then replace_prefixes() with the same
    country/sim_only arguments.

Reviewer notes:
  * place_name_sans_affixes is computed from place_name. An earlier copy
    of this patch passed alias there, which overwrote the place name with
    the stripped alias. The index line for that file was not regenerated.
  * Line breaks and indentation were reconstructed from a whitespace-
    collapsed copy. Hunk line counts match the @@ headers, but leading
    whitespace should be confirmed against the target files (or apply
    with whitespace-insensitive matching).

diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index be8a062c..98ed0452 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -1099,7 +1099,7 @@ class AddressComponents(object):
         self.add_sub_building_phrase(AddressFormatter.LEVEL, floor_phrase_type, address_components, generated[AddressFormatter.LEVEL], language, country=country, num_floors=num_floors)
         self.add_sub_building_phrase(AddressFormatter.UNIT, unit_phrase_type, address_components, generated[AddressFormatter.UNIT], language, country=country, zone=zone)
 
-    def replace_name_affixes(self, address_components, language):
+    def replace_name_affixes(self, address_components, language, country=None):
         '''
         Name normalization
         ------------------
@@ -1115,10 +1115,11 @@ class AddressComponents(object):
             name = address_components[component]
             if not name:
                 continue
-            replacement = name_affixes.replace_suffixes(name, language)
-            replacement = name_affixes.replace_prefixes(replacement, language)
-            if replacement != name and random.random() < replacement_prob and not replacement.isdigit():
-                address_components[component] = replacement
+
+            if random.random() < replacement_prob:
+                replacement = name_affixes.replace_affixes(name, language, country=country)
+                if replacement != name and not replacement.isdigit():
+                    address_components[component] = replacement
 
     def replace_names(self, address_components):
         '''
@@ -1486,7 +1487,7 @@ class AddressComponents(object):
         self.cleanup_boundary_names(address_components)
         self.country_specific_cleanup(address_components, country)
 
-        self.replace_name_affixes(address_components, non_local_language or language)
+        self.replace_name_affixes(address_components, non_local_language or language, country=country)
 
         self.replace_names(address_components)
 
diff --git a/scripts/geodata/geoplanet/geoplanet_training_data.py b/scripts/geodata/geoplanet/geoplanet_training_data.py
index 17e1493b..350b15fe 100644
--- a/scripts/geodata/geoplanet/geoplanet_training_data.py
+++ b/scripts/geodata/geoplanet/geoplanet_training_data.py
@@ -114,22 +114,23 @@ class GeoPlanetFormatter(object):
         print('Doing variant aliases')
         variant_aliases = 0
-        for i, row in enumerate(self.db.execute('''select a.*, p.name from aliases a
+        for i, row in enumerate(self.db.execute('''select a.*, p.name, p.country_code from aliases a
                                                    join places p using(id)
                                                    where a.name_type = "V"
                                                    and a.language = p.language''')):
-            place_name = row[-1]
+            place_name, country_code = row[-2:]
+            country = country_code.lower()
 
-            row = row[:-1]
+            row = row[:-2]
             place_id, alias, name_type, language = row
 
             language = self.language_codes[language]
 
             if language != 'unk':
-                alias_sans_affixes = name_affixes.replace_prefixes(name_affixes.replace_suffixes(alias, language), language)
+                alias_sans_affixes = name_affixes.replace_affixes(alias, language, country=country)
                 if alias_sans_affixes:
                     alias = alias_sans_affixes
 
-                place_name_sans_affixes = name_affixes.replace_prefixes(name_affixes.replace_suffixes(place_name, language), language)
+                place_name_sans_affixes = name_affixes.replace_affixes(place_name, language, country=country)
                 if place_name_sans_affixes:
                     place_name = place_name_sans_affixes
             else:
@@ -280,7 +281,7 @@ class GeoPlanetFormatter(object):
 
 if __name__ == '__main__':
     if len(sys.argv) < 3:
-        sys.exit('Usage: python download_geoplanet.py geoplanet_db_path out_dir')
+        sys.exit('Usage: python geoplanet_training_data.py geoplanet_db_path out_dir')
 
     geoplanet_db_path = sys.argv[1]
     out_dir = sys.argv[2]
diff --git a/scripts/geodata/names/normalization.py b/scripts/geodata/names/normalization.py
index 0a68679f..65db5b96 100644
--- a/scripts/geodata/names/normalization.py
+++ b/scripts/geodata/names/normalization.py
@@ -109,4 +109,7 @@ class NameAffixes(object):
 
         return re.sub(six.u(''), name)
 
+    def replace_affixes(self, name, lang, country=None, sim_only=False):
+        return self.replace_prefixes(self.replace_suffixes(name, lang, country=country, sim_only=sim_only), lang, country=country, sim_only=sim_only)
+
 name_affixes = NameAffixes()