[states] adding all forms of the state abbreviation to the trie when doing place name normalization to handle the D.C./DC case

This commit is contained in:
Al
2016-12-10 13:45:22 -05:00
parent 5098599ed6
commit ffc584f679
2 changed files with 16 additions and 12 deletions

View File

@@ -350,9 +350,9 @@ class AddressComponents(object):
if is_state:
for state in component_names:
for language in languages:
state_code = state_abbreviations.get_abbreviation(country, language, state, default=None)
if state_code:
names.add(state_code.upper())
abbreviations = state_abbreviations.get_all_abbreviations(country, language, state, default=None)
if abbreviations:
names.update([a.upper() for a in abbreviations])
phrase_filter = PhraseFilter([(n.lower(), '') for n in names])

View File

@@ -26,33 +26,37 @@ class StateAbbreviations(object):
country_config = yaml.load(open(os.path.join(base_dir, filename)))
country_abbreviations = defaultdict(list)
country_full_names = defaultdict(list)
country_full_names = defaultdict(dict)
for abbreviation, vals in six.iteritems(country_config):
for language, full_name in six.iteritems(vals):
full_name = safe_decode(full_name)
abbreviation = safe_decode(abbreviation)
country_abbreviations[(full_name.lower(), language)].append(abbreviation)
country_full_names[(abbreviation.lower(), language)].append(full_name)
country_full_names[abbreviation.lower()][language] = full_name
self.abbreviations[country] = dict(country_abbreviations)
self.full_names[country] = dict(country_full_names)
def get_abbreviation(self, country, language, state, default=None):
def get_all_abbreviations(self, country, language, state, default=None):
    """Return every known abbreviation for ``state``, or ``default``.

    ``self.abbreviations[country]`` maps ``(full_name.lower(), language)``
    to the list of abbreviations collected for that name, so a state with
    several spellings of its code (e.g. "DC" and "D.C.") yields several
    entries.

    :param country: country key; lower-cased before the lookup
    :param language: language key; lower-cased before the lookup
    :param state: full state name; lower-cased before the lookup
    :param default: value returned when nothing is registered
    :return: the stored list of abbreviations, or ``default``
    """
    lookup_key = (state.lower(), language.lower())
    # NOTE(review): nested_get is assumed to walk the keys in order and to
    # return the DoesNotExist sentinel when any level is missing -- confirm
    # against its definition.
    values = nested_get(self.abbreviations, (country.lower(), lookup_key))
    if values is DoesNotExist:
        return default
    # Return the whole list regardless of its length; a leftover
    # ``len(values) == 1`` guard here would drop multi-abbreviation states.
    return values
def get_abbreviation(self, country, language, state, default=None):
    """Return a single abbreviation for ``state``, or ``default``.

    All candidates are fetched through ``get_all_abbreviations``. A lone
    candidate is returned as-is; when several exist, one is picked with
    ``random.choice``.

    :param country: country key, forwarded to ``get_all_abbreviations``
    :param language: language key, forwarded to ``get_all_abbreviations``
    :param state: full state name, forwarded to ``get_all_abbreviations``
    :param default: value returned when no abbreviation is available
    :return: one abbreviation, or ``default``
    """
    # The receiver was previously misspelled ``selfg``, which raised
    # NameError on every call.
    # Ask with ``default=None`` and test the result directly instead of
    # comparing it to the caller's default: a caller-supplied default that
    # happens to equal the found list must not be mistaken for "not found".
    values = self.get_all_abbreviations(country, language, state, default=None)
    if not values:
        # Covers both "nothing registered" and an empty list, which
        # random.choice would reject with IndexError.
        return default
    if len(values) == 1:
        return values[0]
    return random.choice(values)
def get_full_name(self, country, language, state, default=None):
    """Return the full name for the abbreviation ``state`` in ``language``.

    ``self.full_names[country]`` is a nested mapping of
    ``abbreviation.lower() -> language -> full_name``, so the lookup walks
    three separate keys and yields a single string, not a list.

    :param country: country key; lower-cased before the lookup
    :param language: language key; lower-cased before the lookup
    :param state: state abbreviation; lower-cased before the lookup
    :param default: value returned when no full name is registered
    :return: the stored full name, or ``default``
    """
    # NOTE(review): nested_get is assumed to return the DoesNotExist
    # sentinel when any level of the path is missing -- confirm against its
    # definition.
    path = (country.lower(), state.lower(), language.lower())
    value = nested_get(self.full_names, path)
    if value is DoesNotExist:
        return default
    return value
# Shared StateAbbreviations instance; the address-component code calls
# ``state_abbreviations.get_all_abbreviations(...)`` on this object.
state_abbreviations = StateAbbreviations()