[osm] abbreviate toponyms (qualifiers) with some probability so that the abbreviated forms also appear in the model's phrase dictionaries
This commit is contained in:
@@ -108,6 +108,7 @@ category:
|
|||||||
drop_postcode_probability: 0.3 # drop postal code
|
drop_postcode_probability: 0.3 # drop postal code
|
||||||
|
|
||||||
boundaries:
|
boundaries:
|
||||||
|
abbreviate_toponym_probability: 0.35
|
||||||
# OSM relations should inherit properties from their admin_center
|
# OSM relations should inherit properties from their admin_center
|
||||||
override_with_admin_center:
|
override_with_admin_center:
|
||||||
- id: 92277 # Bangkok
|
- id: 92277 # Bangkok
|
||||||
|
|||||||
@@ -142,7 +142,6 @@ NAME_DICTIONARIES = (GIVEN_NAME_DICTIONARY,
|
|||||||
SURNAME_DICTIONARY,)
|
SURNAME_DICTIONARY,)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',
|
NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees',
|
||||||
'building_types',
|
'building_types',
|
||||||
'company_types',
|
'company_types',
|
||||||
@@ -152,6 +151,8 @@ NAME_ABBREVIATION_DICTIONARIES = STREET_TYPES_DICTIONARIES + ('academic_degrees'
|
|||||||
'toponyms',
|
'toponyms',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
QUALIFIERS_DICTIONARY = 'qualifiers'
|
||||||
|
|
||||||
HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number')
|
HOUSE_NUMBER_DICTIONARIES = ('house_number', 'no_number')
|
||||||
|
|
||||||
POSTCODE_DICTIONARIES = ('postcode',)
|
POSTCODE_DICTIONARIES = ('postcode',)
|
||||||
@@ -184,6 +185,7 @@ def create_gazetteer(*dictionaries):
|
|||||||
|
|
||||||
|
|
||||||
street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
|
street_types_gazetteer = create_gazetteer(*STREET_TYPES_DICTIONARIES)
|
||||||
|
qualifiers_gazetteer = create_gazetteer(QUALIFIERS_DICTIONARY)
|
||||||
names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES)
|
names_gazetteer = create_gazetteer(*NAME_ABBREVIATION_DICTIONARIES)
|
||||||
chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
|
chains_gazetteer = create_gazetteer(CHAIN_DICTIONARY)
|
||||||
unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
|
unit_types_gazetteer = create_gazetteer(*UNIT_ABBREVIATION_DICTIONARIES)
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from itertools import combinations
|
|||||||
|
|
||||||
from geodata.address_formatting.formatter import AddressFormatter
|
from geodata.address_formatting.formatter import AddressFormatter
|
||||||
|
|
||||||
|
from geodata.address_expansions.abbreviations import abbreviate
|
||||||
from geodata.addresses.config import address_config
|
from geodata.addresses.config import address_config
|
||||||
from geodata.addresses.floors import Floor
|
from geodata.addresses.floors import Floor
|
||||||
from geodata.addresses.entrances import Entrance
|
from geodata.addresses.entrances import Entrance
|
||||||
@@ -734,6 +735,7 @@ class AddressComponents(object):
|
|||||||
abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability')))
|
abbreviate_state_prob = float(nested_get(self.config, ('state', 'abbreviated_probability')))
|
||||||
join_state_district_prob = float(nested_get(self.config, ('state_district', 'join_probability')))
|
join_state_district_prob = float(nested_get(self.config, ('state_district', 'join_probability')))
|
||||||
replace_with_non_local_prob = float(nested_get(self.config, ('languages', 'replace_non_local_probability')))
|
replace_with_non_local_prob = float(nested_get(self.config, ('languages', 'replace_non_local_probability')))
|
||||||
|
abbreviate_toponym_prob = float(nested_get(self.config, ('boundaries', 'abbreviate_toponym_probability')))
|
||||||
|
|
||||||
for component, vals in poly_components.iteritems():
|
for component, vals in poly_components.iteritems():
|
||||||
if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
|
if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
|
||||||
@@ -748,6 +750,8 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob:
|
if component == AddressFormatter.STATE and random.random() < abbreviate_state_prob:
|
||||||
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
val = state_abbreviations.get_abbreviation(country, language, val, default=val)
|
||||||
|
elif random.random() < abbreviate_toponym_prob:
|
||||||
|
val = abbreviate(qualifiers_gazetteer, val, language, abbreviate_prob=abbreviate_toponym_prob)
|
||||||
|
|
||||||
address_components[component] = val
|
address_components[component] = val
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user