[states] Add all forms of each state abbreviation to the trie during place name normalization, so that variants such as D.C. and DC are both handled
This commit is contained in:
@@ -350,9 +350,9 @@ class AddressComponents(object):
|
||||
if is_state:
|
||||
for state in component_names:
|
||||
for language in languages:
|
||||
state_code = state_abbreviations.get_abbreviation(country, language, state, default=None)
|
||||
if state_code:
|
||||
names.add(state_code.upper())
|
||||
abbreviations = state_abbreviations.get_all_abbreviations(country, language, state, default=None)
|
||||
if abbreviations:
|
||||
names.update([a.upper() for a in abbreviations])
|
||||
|
||||
phrase_filter = PhraseFilter([(n.lower(), '') for n in names])
|
||||
|
||||
|
||||
@@ -26,33 +26,37 @@ class StateAbbreviations(object):
|
||||
country_config = yaml.load(open(os.path.join(base_dir, filename)))
|
||||
|
||||
country_abbreviations = defaultdict(list)
|
||||
country_full_names = defaultdict(list)
|
||||
country_full_names = defaultdict(dict)
|
||||
|
||||
for abbreviation, vals in six.iteritems(country_config):
|
||||
for language, full_name in six.iteritems(vals):
|
||||
full_name = safe_decode(full_name)
|
||||
abbreviation = safe_decode(abbreviation)
|
||||
country_abbreviations[(full_name.lower(), language)].append(abbreviation)
|
||||
country_full_names[(abbreviation.lower(), language)].append(full_name)
|
||||
country_full_names[abbreviation.lower()][language] = full_name
|
||||
|
||||
self.abbreviations[country] = dict(country_abbreviations)
|
||||
self.full_names[country] = dict(country_full_names)
|
||||
|
||||
def get_abbreviation(self, country, language, state, default=None):
|
||||
def get_all_abbreviations(self, country, language, state, default=None):
    """Return every abbreviation stored for a state's full name.

    The per-country table in ``self.abbreviations`` is keyed by
    ``(full_name.lower(), language)`` and maps to a list of abbreviations,
    so one state may have several spellings.

    :param country: country code; lower-cased before the lookup
    :param language: language code; lower-cased before the lookup
    :param state: full state name; lower-cased before the lookup
    :param default: value returned when no entry is found
    :return: the stored list of abbreviations (not a copy), or ``default``
    """
    key = (state.lower(), language.lower())
    values = nested_get(self.abbreviations, (country.lower(), key))
    if values is DoesNotExist:
        return default
    # Return the whole list regardless of its length; callers that want a
    # single abbreviation should use get_abbreviation() instead.
    return values
|
||||
|
||||
def get_abbreviation(self, country, language, state, default=None):
    """Return a single abbreviation for a state's full name.

    Delegates to ``get_all_abbreviations``. When only one abbreviation is
    known it is returned; when several are known one is picked with
    ``random.choice``.

    :param country: country code
    :param language: language code
    :param state: full state name
    :param default: value returned when no abbreviation is found
    :return: one abbreviation, or ``default`` when there is none
    """
    # Ask with default=None so a miss is unambiguous even if the caller's
    # own default happens to compare equal to a real result.
    values = self.get_all_abbreviations(country, language, state, default=None)
    if not values:
        # Covers both "no entry" and an empty list, which would otherwise
        # make random.choice raise IndexError.
        return default
    if len(values) == 1:
        return values[0]
    return random.choice(values)
|
||||
|
||||
def get_full_name(self, country, language, state, default=None):
    """Return the full state name for an abbreviation.

    The per-country table in ``self.full_names`` is a nested mapping,
    ``abbreviation.lower() -> language -> full_name``, so the lookup walks
    three keys and yields one name.

    :param country: country code; lower-cased before the lookup
    :param language: language code; lower-cased before the lookup
    :param state: state abbreviation; lower-cased before the lookup
    :param default: value returned when no entry is found
    :return: the full name, or ``default`` when there is none
    """
    path = (country.lower(), state.lower(), language.lower())
    value = nested_get(self.full_names, path)
    if value is DoesNotExist:
        return default
    return value
|
||||
|
||||
|
||||
# Ready-made StateAbbreviations instance bound to the name `state_abbreviations`.
state_abbreviations = StateAbbreviations()
|
||||
|
||||
Reference in New Issue
Block a user