[names] adding a new name_affixes.replace_affixes call that replaces both prefixes and suffixes in one call; using it in GeoPlanet training and in the generic AddressComponents normalizations
This commit is contained in:
@@ -1099,7 +1099,7 @@ class AddressComponents(object):
|
|||||||
self.add_sub_building_phrase(AddressFormatter.LEVEL, floor_phrase_type, address_components, generated[AddressFormatter.LEVEL], language, country=country, num_floors=num_floors)
|
self.add_sub_building_phrase(AddressFormatter.LEVEL, floor_phrase_type, address_components, generated[AddressFormatter.LEVEL], language, country=country, num_floors=num_floors)
|
||||||
self.add_sub_building_phrase(AddressFormatter.UNIT, unit_phrase_type, address_components, generated[AddressFormatter.UNIT], language, country=country, zone=zone)
|
self.add_sub_building_phrase(AddressFormatter.UNIT, unit_phrase_type, address_components, generated[AddressFormatter.UNIT], language, country=country, zone=zone)
|
||||||
|
|
||||||
def replace_name_affixes(self, address_components, language):
|
def replace_name_affixes(self, address_components, language, country=None):
|
||||||
'''
|
'''
|
||||||
Name normalization
|
Name normalization
|
||||||
------------------
|
------------------
|
||||||
@@ -1115,10 +1115,11 @@ class AddressComponents(object):
|
|||||||
name = address_components[component]
|
name = address_components[component]
|
||||||
if not name:
|
if not name:
|
||||||
continue
|
continue
|
||||||
replacement = name_affixes.replace_suffixes(name, language)
|
|
||||||
replacement = name_affixes.replace_prefixes(replacement, language)
|
if random.random() < replacement_prob:
|
||||||
if replacement != name and random.random() < replacement_prob and not replacement.isdigit():
|
replacement = name_affixes.replace_affixes(name, language, country=country)
|
||||||
address_components[component] = replacement
|
if replacement != name and not replacement.isdigit():
|
||||||
|
address_components[component] = replacement
|
||||||
|
|
||||||
def replace_names(self, address_components):
|
def replace_names(self, address_components):
|
||||||
'''
|
'''
|
||||||
@@ -1486,7 +1487,7 @@ class AddressComponents(object):
|
|||||||
self.cleanup_boundary_names(address_components)
|
self.cleanup_boundary_names(address_components)
|
||||||
self.country_specific_cleanup(address_components, country)
|
self.country_specific_cleanup(address_components, country)
|
||||||
|
|
||||||
self.replace_name_affixes(address_components, non_local_language or language)
|
self.replace_name_affixes(address_components, non_local_language or language, country=country)
|
||||||
|
|
||||||
self.replace_names(address_components)
|
self.replace_names(address_components)
|
||||||
|
|
||||||
|
|||||||
@@ -114,22 +114,23 @@ class GeoPlanetFormatter(object):
|
|||||||
|
|
||||||
print('Doing variant aliases')
|
print('Doing variant aliases')
|
||||||
variant_aliases = 0
|
variant_aliases = 0
|
||||||
for i, row in enumerate(self.db.execute('''select a.*, p.name from aliases a
|
for i, row in enumerate(self.db.execute('''select a.*, p.name, p.country_code from aliases a
|
||||||
join places p using(id)
|
join places p using(id)
|
||||||
where a.name_type = "V"
|
where a.name_type = "V"
|
||||||
and a.language = p.language''')):
|
and a.language = p.language''')):
|
||||||
place_name = row[-1]
|
place_name, country_code = row[-2:]
|
||||||
|
country = country_code.lower()
|
||||||
|
|
||||||
row = row[:-1]
|
row = row[:-2]
|
||||||
place_id, alias, name_type, language = row
|
place_id, alias, name_type, language = row
|
||||||
|
|
||||||
language = self.language_codes[language]
|
language = self.language_codes[language]
|
||||||
if language != 'unk':
|
if language != 'unk':
|
||||||
alias_sans_affixes = name_affixes.replace_prefixes(name_affixes.replace_suffixes(alias, language), language)
|
alias_sans_affixes = name_affixes.replace_affixes(alias, language, country=country)
|
||||||
if alias_sans_affixes:
|
if alias_sans_affixes:
|
||||||
alias = alias_sans_affixes
|
alias = alias_sans_affixes
|
||||||
|
|
||||||
place_name_sans_affixes = name_affixes.replace_prefixes(name_affixes.replace_suffixes(place_name, language), language)
|
place_name_sans_affixes = name_affixes.replace_affixes(place_name, language, country=country)  # normalize the place name itself, not the alias
|
||||||
if place_name_sans_affixes:
|
if place_name_sans_affixes:
|
||||||
place_name = place_name_sans_affixes
|
place_name = place_name_sans_affixes
|
||||||
else:
|
else:
|
||||||
@@ -280,7 +281,7 @@ class GeoPlanetFormatter(object):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if len(sys.argv) < 3:
|
if len(sys.argv) < 3:
|
||||||
sys.exit('Usage: python download_geoplanet.py geoplanet_db_path out_dir')
|
sys.exit('Usage: python geoplanet_training_data.py geoplanet_db_path out_dir')
|
||||||
|
|
||||||
geoplanet_db_path = sys.argv[1]
|
geoplanet_db_path = sys.argv[1]
|
||||||
out_dir = sys.argv[2]
|
out_dir = sys.argv[2]
|
||||||
|
|||||||
@@ -109,4 +109,7 @@ class NameAffixes(object):
|
|||||||
|
|
||||||
return re.sub(six.u(''), name)
|
return re.sub(six.u(''), name)
|
||||||
|
|
||||||
|
def replace_affixes(self, name, lang, country=None, sim_only=False):
|
||||||
|
return self.replace_prefixes(self.replace_suffixes(name, lang, country=country, sim_only=sim_only), lang, country=country, sim_only=sim_only)
|
||||||
|
|
||||||
name_affixes = NameAffixes()
|
name_affixes = NameAffixes()
|
||||||
|
|||||||
Reference in New Issue
Block a user