From 8b57a7acf2f975d3fd5de5fdbc3b1ab79af58540 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 22 Aug 2016 20:29:29 -0400 Subject: [PATCH] [osm] abbreviate toponyms (qualifiers) with some probability so we get those versions in the model's phrase dictionaries --- resources/parser/default.yaml | 1 + scripts/geodata/address_expansions/gazetteers.py | 4 +++- scripts/geodata/addresses/components.py | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/resources/parser/default.yaml b/resources/parser/default.yaml index e6442473..18b734d1 100644 --- a/resources/parser/default.yaml +++ b/resources/parser/default.yaml @@ -108,6 +108,7 @@ category: drop_postcode_probability: 0.3 # drop postal code boundaries: + abbreviate_toponym_probability: 0.35 # OSM relations should inherit properties from their admin_center override_with_admin_center: - id: 92277 # Bangkok diff --git a/scripts/geodata/address_expansions/gazetteers.py b/scripts/geodata/address_expansions/gazetteers.py index a560f6c1..b416473a 100644 --- a/scripts/geodata/address_expansions/gazetteers.py +++ b/scripts/geodata/address_expansions/gazetteers.py @@ -142,7 +142,6 @@ NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY, SURNAME_DICTIONARY,) - NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees', 'building_types', 'company_types', @@ -152,6 +151,8 @@ NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees' 'toponyms', ) +QUALIFIERS_DICTIONARY = 'qualifiers' + HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number') POSTCODE_DICTIONARIES = ('postcode',) @@ -184,6 +185,7 @@ def create_gazetteer(*dictionaries): street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES) +qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY) names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES) chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY) unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 4a136b3f..67544db9 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -11,6 +11,7 @@ from itertools import combinations from geodata.address_formatting.formatter import AddressFormatter +from geodata.address_expansions.abbreviations import abbreviate from geodata.addresses.config import address_config from geodata.addresses.floors import Floor from geodata.addresses.entrances import Entrance @@ -734,6 +735,7 @@ class AddressComponents(object): abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability'))) join_state_district_prob = float(nested_get(self.config, ('state_district', 'join_probability'))) replace_with_non_local_prob = float(nested_get(self.config, ('languages', 'replace_non_local_probability'))) + abbreviate_toponym_prob = float(nested_get(self.config, ('boundaries', 'abbreviate_toponym_probability'))) for component, vals in poly_components.iteritems(): if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob): @@ -748,6 +750,8 @@ class AddressComponents(object): if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob: val = state_abbreviations.get_abbreviation(country, language, val, default=val) + elif random.random() < abbreviate_toponym_prob: + val = abbreviate(qualifiers_gazetteer, val, language, abbreviate_prob=abbreviate_toponym_prob) address_components[component] = val