[states] Add all forms of each state abbreviation to the trie during place name normalization, to handle cases like D.C./DC
This commit is contained in:
@@ -350,9 +350,9 @@ class AddressComponents(object):
|
|||||||
if is_state:
|
if is_state:
|
||||||
for state in component_names:
|
for state in component_names:
|
||||||
for language in languages:
|
for language in languages:
|
||||||
state_code = state_abbreviations.get_abbreviation(country, language, state, default=None)
|
abbreviations = state_abbreviations.get_all_abbreviations(country, language, state, default=None)
|
||||||
if state_code:
|
if abbreviations:
|
||||||
names.add(state_code.upper())
|
names.update([a.upper() for a in abbreviations])
|
||||||
|
|
||||||
phrase_filter = PhraseFilter([(n.lower(), '') for n in names])
|
phrase_filter = PhraseFilter([(n.lower(), '') for n in names])
|
||||||
|
|
||||||
|
|||||||
@@ -26,33 +26,37 @@ class StateAbbreviations(object):
|
|||||||
country_config = yaml.load(open(os.path.join(base_dir, filename)))
|
country_config = yaml.load(open(os.path.join(base_dir, filename)))
|
||||||
|
|
||||||
country_abbreviations = defaultdict(list)
|
country_abbreviations = defaultdict(list)
|
||||||
country_full_names = defaultdict(list)
|
country_full_names = defaultdict(dict)
|
||||||
|
|
||||||
for abbreviation, vals in six.iteritems(country_config):
|
for abbreviation, vals in six.iteritems(country_config):
|
||||||
for language, full_name in six.iteritems(vals):
|
for language, full_name in six.iteritems(vals):
|
||||||
full_name = safe_decode(full_name)
|
full_name = safe_decode(full_name)
|
||||||
abbreviation = safe_decode(abbreviation)
|
abbreviation = safe_decode(abbreviation)
|
||||||
country_abbreviations[(full_name.lower(), language)].append(abbreviation)
|
country_abbreviations[(full_name.lower(), language)].append(abbreviation)
|
||||||
country_full_names[(abbreviation.lower(), language)].append(full_name)
|
country_full_names[abbreviation.lower()][language] = full_name
|
||||||
|
|
||||||
self.abbreviations[country] = dict(country_abbreviations)
|
self.abbreviations[country] = dict(country_abbreviations)
|
||||||
self.full_names[country] = dict(country_full_names)
|
self.full_names[country] = dict(country_full_names)
|
||||||
|
|
||||||
def get_abbreviation(self, country, language, state, default=None):
|
def get_all_abbreviations(self, country, language, state, default=None):
|
||||||
values = nested_get(self.abbreviations, (country.lower(), (state.lower(), language.lower())))
|
values = nested_get(self.abbreviations, (country.lower(), (state.lower(), language.lower())))
|
||||||
if values is DoesNotExist:
|
if values is DoesNotExist:
|
||||||
return default
|
return default
|
||||||
if len(values) == 1:
|
return values
|
||||||
|
|
||||||
|
def get_abbreviation(self, country, language, state, default=None):
|
||||||
|
values = self.get_all_abbreviations(country, language, state, default=default)
|
||||||
|
if values == default:
|
||||||
|
return default
|
||||||
|
elif len(values) == 1:
|
||||||
return values[0]
|
return values[0]
|
||||||
return random.choice(values)
|
return random.choice(values)
|
||||||
|
|
||||||
def get_full_name(self, country, language, state, default=None):
|
def get_full_name(self, country, language, state, default=None):
|
||||||
values = nested_get(self.full_names, (country.lower(), (state.lower(), language.lower())))
|
value = nested_get(self.full_names, (country.lower(), state.lower(), language.lower()))
|
||||||
if values is DoesNotExist:
|
if value is DoesNotExist:
|
||||||
return default
|
return default
|
||||||
if len(values) == 1:
|
return value
|
||||||
return values[0]
|
|
||||||
return random.choice(values)
|
|
||||||
|
|
||||||
|
|
||||||
state_abbreviations = StateAbbreviations()
|
state_abbreviations = StateAbbreviations()
|
||||||
|
|||||||
Reference in New Issue
Block a user