[addresses] Using YAML inheritance instead of baking it into the config parser
This commit is contained in:
@@ -10,12 +10,13 @@
|
|||||||
# country overrides section. Each country can create its own copy of the entire top-level
|
# country overrides section. Each country can create its own copy of the entire top-level
|
||||||
# structure and it will be recursively merged with the defaults.
|
# structure and it will be recursively merged with the defaults.
|
||||||
|
|
||||||
# Number
|
default: &default
|
||||||
# ======
|
# Number
|
||||||
# Number, No., #, etc. can be used in both floor and apartment numbers,
|
# ======
|
||||||
# so we'll define it separately
|
# Number, No., #, etc. can be used in both floor and apartment numbers,
|
||||||
|
# so we'll define it separately
|
||||||
|
|
||||||
numbers:
|
numbers:
|
||||||
default: &number
|
default: &number
|
||||||
canonical: number # canonical word in libpostal dictionary
|
canonical: number # canonical word in libpostal dictionary
|
||||||
abbreviated: "no" # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted)
|
abbreviated: "no" # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted)
|
||||||
@@ -36,11 +37,11 @@ numbers:
|
|||||||
numeric_probability: 0.4 # With this probability, use the standard numeric
|
numeric_probability: 0.4 # With this probability, use the standard numeric
|
||||||
numeric_affix_probability: 0.6 # With this probability, use e.g. #3 instead of No. 3
|
numeric_affix_probability: 0.6 # With this probability, use e.g. #3 instead of No. 3
|
||||||
|
|
||||||
# And
|
# And
|
||||||
# ===
|
# ===
|
||||||
# The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
|
# The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
|
||||||
|
|
||||||
and:
|
and:
|
||||||
default: &and
|
default: &and
|
||||||
canonical: and
|
canonical: and
|
||||||
abbreviated: "&"
|
abbreviated: "&"
|
||||||
@@ -50,26 +51,26 @@ and:
|
|||||||
sample_probability: 0.05
|
sample_probability: 0.05
|
||||||
|
|
||||||
|
|
||||||
# Floor/level
|
# Floor/level
|
||||||
# ===========
|
# ===========
|
||||||
# OSM doesn't usually concern itself with the address beyond the front door
|
# OSM doesn't usually concern itself with the address beyond the front door
|
||||||
# yet many real-world addresses will have qualifying strings like "6th floor"
|
# yet many real-world addresses will have qualifying strings like "6th floor"
|
||||||
# and we'd like the parser to handle those.
|
# and we'd like the parser to handle those.
|
||||||
#
|
#
|
||||||
# When we do get floor numbers in OSM addresses, it's usually in the form of the
|
# When we do get floor numbers in OSM addresses, it's usually in the form of the
|
||||||
# addr:floor or level tag, where the value is typically an integer or a half-floor
|
# addr:floor or level tag, where the value is typically an integer or a half-floor
|
||||||
# (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
|
# (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
|
||||||
# addresses do have a building:levels tag. If we know there are 20 floors in the
|
# addresses do have a building:levels tag. If we know there are 20 floors in the
|
||||||
# building, we can randomly sample numbers <= the # of floors and come up with plausible
|
# building, we can randomly sample numbers <= the # of floors and come up with plausible
|
||||||
# sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
|
# sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
|
||||||
#
|
#
|
||||||
# We're not done yet, because the integer value by itself isn't what people use when
|
# We're not done yet, because the integer value by itself isn't what people use when
|
||||||
# writing addresses. This part of the config helps us rewrite the raw integer floor
|
# writing addresses. This part of the config helps us rewrite the raw integer floor
|
||||||
# numbers as the sort of natural language text used in addresses like "Fl #1". The config
|
# numbers as the sort of natural language text used in addresses like "Fl #1". The config
|
||||||
# is designed to be cross-lingual, so we can use the same structure with different words
|
# is designed to be cross-lingual, so we can use the same structure with different words
|
||||||
# and do this for addresses in pretty much any language.
|
# and do this for addresses in pretty much any language.
|
||||||
|
|
||||||
levels:
|
levels:
|
||||||
# Numbered floors
|
# Numbered floors
|
||||||
floor: &floor
|
floor: &floor
|
||||||
canonical: floor
|
canonical: floor
|
||||||
@@ -448,15 +449,15 @@ levels:
|
|||||||
probability: 0.25
|
probability: 0.25
|
||||||
|
|
||||||
|
|
||||||
# Intersections
|
# Intersections
|
||||||
# =============
|
# =============
|
||||||
# For constructing intersections like 5th Avenue & Broadway
|
# For constructing intersections like 5th Avenue & Broadway
|
||||||
# In OSM, a node that's part of two ways is an intersection.
|
# In OSM, a node that's part of two ways is an intersection.
|
||||||
#
|
#
|
||||||
# These simple rules make it possible to create training examples
|
# These simple rules make it possible to create training examples
|
||||||
# like: 26th/road Street/road and/intersection 6th/road Avenue/road
|
# like: 26th/road Street/road and/intersection 6th/road Avenue/road
|
||||||
|
|
||||||
cross_streets:
|
cross_streets:
|
||||||
# 26th & 6th Avenue
|
# 26th & 6th Avenue
|
||||||
and: *and
|
and: *and
|
||||||
# 26th @ Broadway
|
# 26th @ Broadway
|
||||||
@@ -487,20 +488,20 @@ cross_streets:
|
|||||||
sample: true
|
sample: true
|
||||||
parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)
|
parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)
|
||||||
|
|
||||||
# PO Box addresses
|
# PO Box addresses
|
||||||
# ================
|
# ================
|
||||||
# For PO box addresses, there's almost no data in OSM, so we'll need to
|
# For PO box addresses, there's almost no data in OSM, so we'll need to
|
||||||
# generate them somewhat randomly.
|
# generate them somewhat randomly.
|
||||||
#
|
#
|
||||||
# The strategy is: for every amenity=post_office, generate a number of PO box
|
# The strategy is: for every amenity=post_office, generate a number of PO box
|
||||||
# addresses using random numbers (and some alpha-numerics so we capture patterns
|
# addresses using random numbers (and some alpha-numerics so we capture patterns
|
||||||
# like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
|
# like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
|
||||||
# exist, as long as they cover the patterns of digits we expect in real addresses.
|
# exist, as long as they cover the patterns of digits we expect in real addresses.
|
||||||
# The parser cares more about how many digits a number has and the surrounding
|
# The parser cares more about how many digits a number has and the surrounding
|
||||||
# words/phrases than the specific number i.e. numbers in the range 1000-9999
|
# words/phrases than the specific number i.e. numbers in the range 1000-9999
|
||||||
# can simply be normalized to DDDD.
|
# can simply be normalized to DDDD.
|
||||||
|
|
||||||
po_boxes:
|
po_boxes:
|
||||||
po_box: &po_box
|
po_box: &po_box
|
||||||
canonical: post office box
|
canonical: post office box
|
||||||
abbreviated: p.o. box
|
abbreviated: p.o. box
|
||||||
@@ -592,12 +593,12 @@ po_boxes:
|
|||||||
- before: house
|
- before: house
|
||||||
probability: 0.2
|
probability: 0.2
|
||||||
|
|
||||||
# Categories
|
# Categories
|
||||||
# ==========
|
# ==========
|
||||||
# Use the operators "in" and "near" for building category queries
|
# Use the operators "in" and "near" for building category queries
|
||||||
# such as "restaurants in Hackney, London"
|
# such as "restaurants in Hackney, London"
|
||||||
|
|
||||||
categories:
|
categories:
|
||||||
near:
|
near:
|
||||||
default:
|
default:
|
||||||
canonical: near
|
canonical: near
|
||||||
@@ -627,13 +628,13 @@ categories:
|
|||||||
near_me_probability: 0.1
|
near_me_probability: 0.1
|
||||||
in_probability: 0.35
|
in_probability: 0.35
|
||||||
|
|
||||||
# Directions
|
# Directions
|
||||||
# ==========
|
# ==========
|
||||||
# Unit types, stairways, etc. may have a direction associated
|
# Unit types, stairways, etc. may have a direction associated
|
||||||
# with them whether it's right/left or a cardinal direction
|
# with them whether it's right/left or a cardinal direction
|
||||||
# like "East Entrance".
|
# like "East Entrance".
|
||||||
|
|
||||||
directions:
|
directions:
|
||||||
right: &right
|
right: &right
|
||||||
canonical: right
|
canonical: right
|
||||||
abbreviated: r
|
abbreviated: r
|
||||||
@@ -692,7 +693,7 @@ directions:
|
|||||||
- alternative: *rear
|
- alternative: *rear
|
||||||
probability: 0.05
|
probability: 0.05
|
||||||
|
|
||||||
cardinal_directions:
|
cardinal_directions:
|
||||||
east: &east
|
east: &east
|
||||||
canonical: east
|
canonical: east
|
||||||
abbreviated: e
|
abbreviated: e
|
||||||
@@ -760,11 +761,11 @@ cardinal_directions:
|
|||||||
- alternative: *west
|
- alternative: *west
|
||||||
probability: 0.25
|
probability: 0.25
|
||||||
|
|
||||||
# Entrance
|
# Entrance
|
||||||
# ========
|
# ========
|
||||||
# For deriving strings like "North Entrance"
|
# For deriving strings like "North Entrance"
|
||||||
|
|
||||||
entrances:
|
entrances:
|
||||||
entrance: &entrance
|
entrance: &entrance
|
||||||
canonical: entrance
|
canonical: entrance
|
||||||
abbreviated: ent
|
abbreviated: ent
|
||||||
@@ -793,11 +794,11 @@ entrances:
|
|||||||
- alternative:
|
- alternative:
|
||||||
canonical: freight
|
canonical: freight
|
||||||
|
|
||||||
# Staircase
|
# Staircase
|
||||||
# =========
|
# =========
|
||||||
# For deriving strings like "Staircase A" in apartment buildings
|
# For deriving strings like "Staircase A" in apartment buildings
|
||||||
|
|
||||||
staircases:
|
staircases:
|
||||||
stair: &stair
|
stair: &stair
|
||||||
canonical: stair
|
canonical: stair
|
||||||
sample: true
|
sample: true
|
||||||
@@ -842,13 +843,13 @@ staircases:
|
|||||||
- alternative: *front
|
- alternative: *front
|
||||||
|
|
||||||
|
|
||||||
# Unit types
|
# Unit types
|
||||||
# ==========
|
# ==========
|
||||||
# Unit information is common in residential addresses, offices, business parks, etc.
|
# Unit information is common in residential addresses, offices, business parks, etc.
|
||||||
# Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
|
# Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
|
||||||
# refer to the unit.
|
# refer to the unit.
|
||||||
|
|
||||||
units:
|
units:
|
||||||
# Units are not part of the global address formats (and are not always standard)
|
# Units are not part of the global address formats (and are not always standard)
|
||||||
# This is a list of places in the address where the unit line might go
|
# This is a list of places in the address where the unit line might go
|
||||||
order:
|
order:
|
||||||
@@ -1175,6 +1176,7 @@ units:
|
|||||||
countries:
|
countries:
|
||||||
# United States
|
# United States
|
||||||
us:
|
us:
|
||||||
|
<<: *default
|
||||||
levels:
|
levels:
|
||||||
storey: &story
|
storey: &story
|
||||||
canonical: story
|
canonical: story
|
||||||
@@ -1261,6 +1263,7 @@ countries:
|
|||||||
# Canada
|
# Canada
|
||||||
# Specifically Canadian English. If the address is in French it will use fr.yaml
|
# Specifically Canadian English. If the address is in French it will use fr.yaml
|
||||||
ca:
|
ca:
|
||||||
|
<<: *default
|
||||||
levels:
|
levels:
|
||||||
# Note: Canadian English uses "storey" keeping with the British convention, so no need to change that
|
# Note: Canadian English uses "storey" keeping with the British convention, so no need to change that
|
||||||
|
|
||||||
@@ -1288,6 +1291,7 @@ countries:
|
|||||||
combined_probability: 0.1
|
combined_probability: 0.1
|
||||||
# Australia
|
# Australia
|
||||||
au:
|
au:
|
||||||
|
<<: *default
|
||||||
po_boxes: &australia_po_boxes
|
po_boxes: &australia_po_boxes
|
||||||
alphanumeric:
|
alphanumeric:
|
||||||
default: *po_box
|
default: *po_box
|
||||||
@@ -1330,6 +1334,7 @@ countries:
|
|||||||
|
|
||||||
# New Zealand - same rules as Australia
|
# New Zealand - same rules as Australia
|
||||||
nz:
|
nz:
|
||||||
|
<<: *default
|
||||||
po_boxes: *australia_po_boxes
|
po_boxes: *australia_po_boxes
|
||||||
units: *australia_unit_types
|
units: *australia_unit_types
|
||||||
|
|
||||||
|
|||||||
@@ -7,9 +7,9 @@ import yaml
|
|||||||
from collections import Mapping
|
from collections import Mapping
|
||||||
|
|
||||||
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
|
||||||
|
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
|
||||||
from geodata.math.sampling import cdf, check_probability_distribution
|
from geodata.math.sampling import cdf, check_probability_distribution
|
||||||
|
|
||||||
|
|
||||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||||
|
|
||||||
ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
@@ -19,35 +19,6 @@ DICTIONARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
|||||||
'resources', 'dictionaries')
|
'resources', 'dictionaries')
|
||||||
|
|
||||||
|
|
||||||
def recursive_merge(a, b):
    """Merge mapping ``b`` into mapping ``a`` in place and return ``a``.

    For every key in ``b``:

    * if the value is a mapping, it is merged recursively into the value
      already stored under that key in ``a``;
    * otherwise the value from ``b`` replaces whatever ``a`` held.

    When ``a`` has no mergeable mapping under a key (the key is missing, or
    it holds a non-mapping such as a scalar), a fresh dict is used as the
    merge target. This avoids a ``TypeError`` from trying to merge into a
    scalar, and keeps ``a`` from sharing nested dicts with ``b``.

    :param a: mutable mapping that receives the merged values
    :param b: mapping whose values take precedence
    :return: ``a``, after merging
    """
    for k, v in b.items():
        if isinstance(v, Mapping):
            existing = a.get(k)
            if not isinstance(existing, Mapping):
                # Nothing mergeable on the left-hand side: build a new dict
                # instead of recursing into a scalar or aliasing b's subtree.
                existing = {}
            a[k] = recursive_merge(existing, v)
        else:
            a[k] = v
    return a
|
|
||||||
|
|
||||||
|
|
||||||
class DoesNotExist:
    """Sentinel used by ``nested_get`` to signal that a key path is absent.

    The class object itself is returned and compared by identity; it is
    distinct from ``None``.
    """
|
|
||||||
|
|
||||||
|
|
||||||
def nested_get(obj, keys):
    """Look up a value by following ``keys`` through nested mappings.

    :param obj: the outermost mapping to search
    :param keys: sequence of keys, outermost first
    :return: ``obj`` itself when ``keys`` is empty; the value found at the
        end of the path; or ``DoesNotExist`` when a key is missing or an
        intermediate value is not dict-like
    """
    if len(keys) == 0:
        return obj

    try:
        node = obj
        # Walk every key except the last one, descending one level each time.
        for name in keys[:-1]:
            node = node.get(name, {})
            if not hasattr(node, 'items'):
                # Hit a non-mapping before the path was exhausted.
                return DoesNotExist
        return node.get(keys[-1], DoesNotExist)
    except AttributeError:
        # An object on the path had no ``get`` method.
        return DoesNotExist
|
|
||||||
|
|
||||||
|
|
||||||
class AddressConfig(object):
|
class AddressConfig(object):
|
||||||
def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
|
def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
|
||||||
self.address_configs = {}
|
self.address_configs = {}
|
||||||
@@ -58,17 +29,14 @@ class AddressConfig(object):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))
|
config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))
|
||||||
|
default = config['default']
|
||||||
countries = config.pop('countries', {})
|
countries = config.pop('countries', {})
|
||||||
|
|
||||||
for k in countries.keys():
|
if countries:
|
||||||
country_config = countries[k]
|
default['countries'] = countries
|
||||||
config_copy = copy.deepcopy(config)
|
|
||||||
countries[k] = recursive_merge(config_copy, country_config)
|
|
||||||
|
|
||||||
config['countries'] = countries
|
|
||||||
|
|
||||||
lang = filename.strip('.yaml')
|
lang = filename.strip('.yaml')
|
||||||
self.address_configs[lang] = config
|
self.address_configs[lang] = default
|
||||||
|
|
||||||
self.sample_phrases = {}
|
self.sample_phrases = {}
|
||||||
|
|
||||||
@@ -87,6 +55,7 @@ class AddressConfig(object):
|
|||||||
if country_config:
|
if country_config:
|
||||||
config = country_config
|
config = country_config
|
||||||
|
|
||||||
|
|
||||||
value = nested_get(config, keys)
|
value = nested_get(config, keys)
|
||||||
if value is not DoesNotExist:
|
if value is not DoesNotExist:
|
||||||
return value
|
return value
|
||||||
|
|||||||
Reference in New Issue
Block a user