From 551cce8cb16f6891fc6faf2340515417177985e2 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 10 Sep 2016 01:08:58 -0400 Subject: [PATCH] [fix] making a separate gazetteer for toponym abbreviations --- scripts/geodata/address_expansions/gazetteers.py | 15 +++++++++------ scripts/geodata/addresses/components.py | 2 +- scripts/geodata/openaddresses/formatter.py | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/scripts/geodata/address_expansions/gazetteers.py b/scripts/geodata/address_expansions/gazetteers.py index d96325a8..493d817e 100644 --- a/scripts/geodata/address_expansions/gazetteers.py +++ b/scripts/geodata/address_expansions/gazetteers.py @@ -157,11 +157,13 @@ HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number') POSTCODE_DICTIONARIES = ('postcode',) -TOPONYM_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('qualifiers', - 'personal_titles', - 'synonyms', - 'toponyms', - ) +TOPONYMS_DICTIONARY = 'toponyms' + +TOPONYM_ABBREVIATION_DICTIONARIES = ('qualifiers', + 'directionals', + 'personal_titles', + 'synonyms', + ) UNIT_ABBREVIATION_DICTIONARIES = ('level_types_basement', @@ -198,5 +200,6 @@ chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY) unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES) street_and_synonyms_gazetteer = create_gazetteer(*(STREET_TYPES_DICTIONARIES + (SYNONYM_DICTIONARY, ))) abbreviations_gazetteer = create_gazetteer(*ALL_ABBREVIATION_DICTIONARIES) -toponym_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES) +toponym_abbreviations_gazetteer = create_gazetteer(*TOPONYM_ABBREVIATION_DICTIONARIES) +toponym_gazetteer = create_gazetteer(TOPONYMS_DICTIONARY) given_name_gazetteer = create_gazetteer(GIVEN_NAME_DICTIONARY) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 8db7b3fb..4b264860 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -774,7 +774,7 @@ class AddressComponents(object): if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob: val = state_abbreviations.get_abbreviation(country, language, val, default=val) elif random.random() < abbreviate_toponym_prob: - val = abbreviate(toponym_gazetteer, val, language, abbreviate_prob=abbreviate_toponym_prob) + val = abbreviate(toponym_abbreviations_gazetteer, val, language, abbreviate_prob=abbreviate_toponym_prob) address_components[component] = val diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index f69aac50..03e6bf20 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -11,7 +11,7 @@ import yaml from geodata.addresses.units import Unit from geodata.address_expansions.abbreviations import abbreviate from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries -from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_types_gazetteer, toponym_gazetteer +from geodata.address_expansions.gazetteers import street_types_gazetteer, unit_types_gazetteer, toponym_abbreviations_gazetteer from geodata.address_formatting.formatter import AddressFormatter from geodata.addresses.components import AddressComponents from geodata.countries.names import country_names @@ -383,7 +383,7 @@ class OpenAddressesFormatter(object): for component_key in AddressFormatter.BOUNDARY_COMPONENTS: component = components.get(component_key, None) if component is not None: - component = abbreviate(toponym_gazetteer, component, language, + component = abbreviate(toponym_abbreviations_gazetteer, component, language, abbreviate_prob=abbreviate_toponym_prob) components[component_key] = component