[openaddresses] CLDR country names for OpenAddresses training set
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import six
|
import six
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
@@ -7,6 +8,8 @@ from geodata.address_expansions.abbreviations import abbreviate
|
|||||||
from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_types_gazetteer
|
from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_types_gazetteer
|
||||||
from geodata.address_formatting.formatter import AddressFormatter
|
from geodata.address_formatting.formatter import AddressFormatter
|
||||||
from geodata.addresses.components import AddressComponents
|
from geodata.addresses.components import AddressComponents
|
||||||
|
from geodata.countries.names import country_names
|
||||||
|
from geodata.math.sampling import cdf, weighted_choice
|
||||||
|
|
||||||
from geodata.csv_utils import tsv_string, unicode_csv_reader
|
from geodata.csv_utils import tsv_string, unicode_csv_reader
|
||||||
|
|
||||||
@@ -44,11 +47,34 @@ class OpenAddressesFormatter(object):
|
|||||||
return value
|
return value
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def cldr_country_name(self, country_code, language, configs):
|
||||||
|
cldr_country_prob = float(self.get_property('cldr_country_probability', *configs))
|
||||||
|
|
||||||
|
country_name = None
|
||||||
|
|
||||||
|
if random.random() < cldr_country_prob:
|
||||||
|
localized, alpha2, alpha3 = values = range(3)
|
||||||
|
localized_prob = float(self.get_property('localized_name_probability', *configs))
|
||||||
|
alpha2_prob = float(self.get_property('iso_alpha_2_code_probability', *configs))
|
||||||
|
alpha3_prob = float(self.get_property('iso_alpha_3_code_probability', *configs))
|
||||||
|
|
||||||
|
probs = cdf([localized_prob, alpha2_prob, alpha3_prob])
|
||||||
|
|
||||||
|
country_type = weighted_choice(values, probs)
|
||||||
|
|
||||||
|
country_name = country_code.upper()
|
||||||
|
if country_type == localized:
|
||||||
|
country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
|
||||||
|
elif country_type == alpha3:
|
||||||
|
country_name = country_names.alpha3_code(country_code) or country_name
|
||||||
|
|
||||||
|
return country_name
|
||||||
|
|
||||||
def formatted_addresses(self, path, configs, tag_components=True):
|
def formatted_addresses(self, path, configs, tag_components=True):
|
||||||
abbreviate_street_prob = self.get_property('abbreviate_street_probability', *configs)
|
abbreviate_street_prob = float(self.get_property('abbreviate_street_probability', *configs))
|
||||||
separate_street_prob = self.get_property('separate_street_probability', *configs) or 0.0
|
separate_street_prob = float(self.get_property('separate_street_probability', *configs) or 0.0)
|
||||||
abbreviate_unit_prob = self.get_property('abbreviate_unit_probability', *configs)
|
abbreviate_unit_prob = float(self.get_property('abbreviate_unit_probability', *configs))
|
||||||
separate_unit_prob = self.get_property('separate_unit_probability', *configs) or 0.0
|
separate_unit_prob = float(self.get_property('separate_unit_probability', *configs) or 0.0)
|
||||||
|
|
||||||
add_components = self.get_property('add', *configs)
|
add_components = self.get_property('add', *configs)
|
||||||
|
|
||||||
@@ -106,6 +132,10 @@ class OpenAddressesFormatter(object):
|
|||||||
separate_prob=separate_unit_prob)
|
separate_prob=separate_unit_prob)
|
||||||
components[AddressFormatter.UNIT] = unit
|
components[AddressFormatter.UNIT] = unit
|
||||||
|
|
||||||
|
country_name = self.cldr_country_name(country, language, configs)
|
||||||
|
if country_name:
|
||||||
|
components[AddressFormatter.COUNTRY] = country_name
|
||||||
|
|
||||||
if add_components:
|
if add_components:
|
||||||
for k, v in six.iteritems(add_components):
|
for k, v in six.iteritems(add_components):
|
||||||
if k not in components:
|
if k not in components:
|
||||||
|
|||||||
Reference in New Issue
Block a user