From ffc584f679a792cd31a2185de58634ecd7cdb457 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 10 Dec 2016 13:45:22 -0500 Subject: [PATCH] [states] adding all forms of the state abbreviation to the trie when doing place name normalization to handle the D.C./DC case --- scripts/geodata/addresses/components.py | 6 ++--- scripts/geodata/states/state_abbreviations.py | 22 +++++++++++-------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 786d5b41..a0cc3917 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -350,9 +350,9 @@ class AddressComponents(object): if is_state: for state in component_names: for language in languages: - state_code = state_abbreviations.get_abbreviation(country, language, state, default=None) - if state_code: - names.add(state_code.upper()) + abbreviations = state_abbreviations.get_all_abbreviations(country, language, state, default=None) + if abbreviations: + names.update([a.upper() for a in abbreviations]) phrase_filter = PhraseFilter([(n.lower(), '') for n in names]) diff --git a/scripts/geodata/states/state_abbreviations.py b/scripts/geodata/states/state_abbreviations.py index e3ad15d1..8e811cf2 100644 --- a/scripts/geodata/states/state_abbreviations.py +++ b/scripts/geodata/states/state_abbreviations.py @@ -26,33 +26,37 @@ class StateAbbreviations(object): country_config = yaml.load(open(os.path.join(base_dir, filename))) country_abbreviations = defaultdict(list) - country_full_names = defaultdict(list) + country_full_names = defaultdict(dict) for abbreviation, vals in six.iteritems(country_config): for language, full_name in six.iteritems(vals): full_name = safe_decode(full_name) abbreviation = safe_decode(abbreviation) country_abbreviations[(full_name.lower(), language)].append(abbreviation) - country_full_names[(abbreviation.lower(), language)].append(full_name) + 
country_full_names[abbreviation.lower()][language] = full_name self.abbreviations[country] = dict(country_abbreviations) self.full_names[country] = dict(country_full_names) - def get_abbreviation(self, country, language, state, default=None): + def get_all_abbreviations(self, country, language, state, default=None): values = nested_get(self.abbreviations, (country.lower(), (state.lower(), language.lower()))) if values is DoesNotExist: return default - if len(values) == 1: + return values + + def get_abbreviation(self, country, language, state, default=None): + values = self.get_all_abbreviations(country, language, state, default=default) + if values == default: + return default + elif len(values) == 1: return values[0] return random.choice(values) def get_full_name(self, country, language, state, default=None): - values = nested_get(self.full_names, (country.lower(), (state.lower(), language.lower()))) - if values is DoesNotExist: + value = nested_get(self.full_names, (country.lower(), state.lower(), language.lower())) + if value is DoesNotExist: return default - if len(values) == 1: - return values[0] - return random.choice(values) + return value state_abbreviations = StateAbbreviations()