[addresses] Using YAML inheritance instead of baking it into the config parser

This commit is contained in:
Al
2016-04-26 18:29:05 -04:00
parent cd10951afb
commit 37747709ee
2 changed files with 1142 additions and 1168 deletions

View File

@@ -10,12 +10,13 @@
# country overrides section. Each country can create its own copy of the entire top-level # country overrides section. Each country can create its own copy of the entire top-level
# structure and it will be recursively merged with the defaults. # structure and it will be recursively merged with the defaults.
# Number default: &default
# ====== # Number
# Number, No., #, etc. can be used in both floor and apartment numbers, # ======
# so we'll define it separately # Number, No., #, etc. can be used in both floor and apartment numbers,
# so we'll define it separately
numbers: numbers:
default: &number default: &number
canonical: number # canonical word in libpostal dictionary canonical: number # canonical word in libpostal dictionary
abbreviated: "no" # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted) abbreviated: "no" # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted)
@@ -36,11 +37,11 @@ numbers:
numeric_probability: 0.4 # With this probability, use the standard numeric numeric_probability: 0.4 # With this probability, use the standard numeric
numeric_affix_probability: 0.6 # With this probability, use e.g. #3 instead of No. 3 numeric_affix_probability: 0.6 # With this probability, use e.g. #3 instead of No. 3
# And # And
# === # ===
# The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc. # The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
and: and:
default: &and default: &and
canonical: and canonical: and
abbreviated: "&" abbreviated: "&"
@@ -50,26 +51,26 @@ and:
sample_probability: 0.05 sample_probability: 0.05
# Floor/level # Floor/level
# =========== # ===========
# OSM doesn't usually concern itself with the address beyond the front door # OSM doesn't usually concern itself with the address beyond the front door
# yet many real-world addresses will have qualifying strings like "6th floor" # yet many real-world addresses will have qualifying strings like "6th floor"
# and we'd like the parser to handle those. # and we'd like the parser to handle those.
# #
# When we do get floor numbers in OSM addresses, it's usually in the form of the # When we do get floor numbers in OSM addresses, it's usually in the form of the
# addr:floor or level tag, where the value is typically an integer or a half-floor # addr:floor or level tag, where the value is typically an integer or a half-floor
# (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM # (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
# addresses do have a building:levels tag. If we know there are 20 floors in the # addresses do have a building:levels tag. If we know there are 20 floors in the
# building, we can randomly sample numbers <= the # of floors and come up with plausible # building, we can randomly sample numbers <= the # of floors and come up with plausible
# sounding addresses (e.g. a Floor 20 address is not as likely outside major cities). # sounding addresses (e.g. a Floor 20 address is not as likely outside major cities).
# #
# We're not done yet, because the integer value by itself isn't what people use when # We're not done yet, because the integer value by itself isn't what people use when
# writing addresses. This part of the config helps us rewrite the raw integer floor # writing addresses. This part of the config helps us rewrite the raw integer floor
# numbers as the sort of natural language text used in addresses like "Fl #1". The config # numbers as the sort of natural language text used in addresses like "Fl #1". The config
# is designed to be cross-lingual, so we can use the same structure with different words # is designed to be cross-lingual, so we can use the same structure with different words
# and do this for addresses in pretty much any language. # and do this for addresses in pretty much any language.
levels: levels:
# Numbered floors # Numbered floors
floor: &floor floor: &floor
canonical: floor canonical: floor
@@ -448,15 +449,15 @@ levels:
probability: 0.25 probability: 0.25
# Intersections # Intersections
# ============= # =============
# For constructing intersections like 5th Avenue & Broadway # For constructing intersections like 5th Avenue & Broadway
# In OSM, a node that's part of two ways is an intersection. # In OSM, a node that's part of two ways is an intersection.
# #
# These simple rules make it possible to create training examples # These simple rules make it possible to create training examples
# like: 26th/road Street/road and/intersection 6th/road Avenue/road # like: 26th/road Street/road and/intersection 6th/road Avenue/road
cross_streets: cross_streets:
# 26th & 6th Avenue # 26th & 6th Avenue
and: *and and: *and
# 26th @ Broadway # 26th @ Broadway
@@ -487,20 +488,20 @@ cross_streets:
sample: true sample: true
parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th) parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)
# PO Box addresses # PO Box addresses
# ================ # ================
# For PO box addresses, there's almost no data in OSM, so we'll need to # For PO box addresses, there's almost no data in OSM, so we'll need to
# generate them somewhat randomly. # generate them somewhat randomly.
# #
# The strategy is: for every amenity=post_office, generate a number of PO box # The strategy is: for every amenity=post_office, generate a number of PO box
# addresses using random numbers (and some alpha-numerics so we capture patterns # addresses using random numbers (and some alpha-numerics so we capture patterns
# like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually # like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
# exist, as long as they cover the patterns of digits we expect in real addresses. # exist, as long as they cover the patterns of digits we expect in real addresses.
# The parser cares more about how many digits a number has and the surrounding # The parser cares more about how many digits a number has and the surrounding
# words/phrases than the specific number i.e. numbers in the range 1000-9999 # words/phrases than the specific number i.e. numbers in the range 1000-9999
# can simply be normalized to DDDD. # can simply be normalized to DDDD.
po_boxes: po_boxes:
po_box: &po_box po_box: &po_box
canonical: post office box canonical: post office box
abbreviated: p.o. box abbreviated: p.o. box
@@ -592,12 +593,12 @@ po_boxes:
- before: house - before: house
probability: 0.2 probability: 0.2
# Categories # Categories
# ========== # ==========
# Use the operators "in" and "near" for building category queries # Use the operators "in" and "near" for building category queries
# such as "restaurants in Hackney, London" # such as "restaurants in Hackney, London"
categories: categories:
near: near:
default: default:
canonical: near canonical: near
@@ -627,13 +628,13 @@ categories:
near_me_probability: 0.1 near_me_probability: 0.1
in_probability: 0.35 in_probability: 0.35
# Directions # Directions
# ========== # ==========
# Unit types, stairways, etc. may have a direction associated # Unit types, stairways, etc. may have a direction associated
# with them whether it's right/left or a cardinal direction # with them whether it's right/left or a cardinal direction
# like "East Entrance". # like "East Entrance".
directions: directions:
right: &right right: &right
canonical: right canonical: right
abbreviated: r abbreviated: r
@@ -692,7 +693,7 @@ directions:
- alternative: *rear - alternative: *rear
probability: 0.05 probability: 0.05
cardinal_directions: cardinal_directions:
east: &east east: &east
canonical: east canonical: east
abbreviated: e abbreviated: e
@@ -760,11 +761,11 @@ cardinal_directions:
- alternative: *west - alternative: *west
probability: 0.25 probability: 0.25
# Entrance # Entrance
# ======== # ========
# For deriving strings like "North Entrance" # For deriving strings like "North Entrance"
entrances: entrances:
entrance: &entrance entrance: &entrance
canonical: entrance canonical: entrance
abbreviated: ent abbreviated: ent
@@ -793,11 +794,11 @@ entrances:
- alternative: - alternative:
canonical: freight canonical: freight
# Staircase # Staircase
# ========= # =========
# For deriving strings like "Staircase A" in apartment buildings # For deriving strings like "Staircase A" in apartment buildings
staircases: staircases:
stair: &stair stair: &stair
canonical: stair canonical: stair
sample: true sample: true
@@ -842,13 +843,13 @@ staircases:
- alternative: *front - alternative: *front
# Unit types # Unit types
# ========== # ==========
# Unit information is common in residential addresses, offices, business parks, etc. # Unit information is common in residential addresses, offices, business parks, etc.
# Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to # Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
# refer to the same type of unit. # refer to the same type of unit.
units: units:
# Units are not part of the global address formats (and are not always standard) # Units are not part of the global address formats (and are not always standard)
# This is a list of places in the address where the unit line might go # This is a list of places in the address where the unit line might go
order: order:
@@ -1175,6 +1176,7 @@ units:
countries: countries:
# United States # United States
us: us:
<<: *default
levels: levels:
storey: &story storey: &story
canonical: story canonical: story
@@ -1261,6 +1263,7 @@ countries:
# Canada # Canada
# Specifically Canadian English. If the address is in French it will use fr.yaml # Specifically Canadian English. If the address is in French it will use fr.yaml
ca: ca:
<<: *default
levels: levels:
# Note: Canadian English uses "storey" keeping with the British convention, so no need to change that # Note: Canadian English uses "storey" keeping with the British convention, so no need to change that
@@ -1288,6 +1291,7 @@ countries:
combined_probability: 0.1 combined_probability: 0.1
# Australia # Australia
au: au:
<<: *default
po_boxes: &australia_po_boxes po_boxes: &australia_po_boxes
alphanumeric: alphanumeric:
default: *po_box default: *po_box
@@ -1330,6 +1334,7 @@ countries:
# New Zealand - same rules as Australia # New Zealand - same rules as Australia
nz: nz:
<<: *default
po_boxes: *australia_po_boxes po_boxes: *australia_po_boxes
units: *australia_unit_types units: *australia_unit_types

View File

@@ -7,9 +7,9 @@ import yaml
from collections import Mapping from collections import Mapping
from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
from geodata.math.sampling import cdf, check_probability_distribution from geodata.math.sampling import cdf, check_probability_distribution
this_dir = os.path.realpath(os.path.dirname(__file__)) this_dir = os.path.realpath(os.path.dirname(__file__))
ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
@@ -19,35 +19,6 @@ DICTIONARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'dictionaries') 'resources', 'dictionaries')
def recursive_merge(a, b):
    """Recursively merge mapping ``b`` into mapping ``a`` in place.

    For every key in ``b``:

    * if both ``a[key]`` and ``b[key]`` are mappings, they are merged
      recursively (``a[key]`` is updated in place);
    * otherwise ``b[key]`` replaces (or creates) ``a[key]``.

    Values taken from ``b`` are assigned by reference, not copied, so
    callers that need isolation should deep-copy their inputs first.

    :param a: mutable mapping that receives the merged result
    :param b: mapping whose values take precedence
    :return: ``a``, after merging
    """
    for k, v in b.items():
        existing = a.get(k)
        if isinstance(v, Mapping) and isinstance(existing, Mapping):
            # Both sides are nested: descend instead of overwriting so
            # keys present only in ``a`` survive the merge.
            recursive_merge(existing, v)
        else:
            # Missing key, scalar override, or a type mismatch (e.g. a
            # scalar in ``a`` replaced by a mapping in ``b``): ``b`` wins.
            a[k] = v
    return a
class DoesNotExist:
    """Sentinel returned by lookups when a key path is absent.

    The class object itself is used as the marker (compare with ``is``);
    it is never instantiated by the lookup helpers in this module.
    """
def nested_get(obj, keys):
    """Look up a value by following ``keys`` through nested mappings.

    :param obj: the outermost mapping (anything exposing ``.get``)
    :param keys: sequence of keys, outermost first; an empty sequence
        returns ``obj`` unchanged
    :return: the value at the end of the path, or the ``DoesNotExist``
        sentinel class if any step of the path is missing or is not a
        mapping-like object
    """
    if not len(keys):
        return obj

    parents, last = keys[:-1], keys[-1]
    current = obj
    try:
        for name in parents:
            current = current.get(name, {})
            # An intermediate value without ``items`` is not a mapping,
            # so the remaining path cannot be resolved.
            if not hasattr(current, 'items'):
                return DoesNotExist
        return current.get(last, DoesNotExist)
    except AttributeError:
        # Raised when an object along the path has no ``.get``.
        return DoesNotExist
class AddressConfig(object): class AddressConfig(object):
def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR): def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
self.address_configs = {} self.address_configs = {}
@@ -58,17 +29,14 @@ class AddressConfig(object):
continue continue
config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename))) config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))
default = config['default']
countries = config.pop('countries', {}) countries = config.pop('countries', {})
for k in countries.keys(): if countries:
country_config = countries[k] default['countries'] = countries
config_copy = copy.deepcopy(config)
countries[k] = recursive_merge(config_copy, country_config)
config['countries'] = countries
lang = filename.strip('.yaml') lang = filename.strip('.yaml')
self.address_configs[lang] = config self.address_configs[lang] = default
self.sample_phrases = {} self.sample_phrases = {}
@@ -87,6 +55,7 @@ class AddressConfig(object):
if country_config: if country_config:
config = country_config config = country_config
value = nested_get(config, keys) value = nested_get(config, keys)
if value is not DoesNotExist: if value is not DoesNotExist:
return value return value