[addresses] Using YAML inheritance instead of baking it into the config parser

2016-04-26 18:29:05 -04:00
parent cd10951afb
commit 37747709ee
2 changed files with 1142 additions and 1168 deletions
--- a/resources/addresses/en.yaml
+++ b/resources/addresses/en.yaml
@@ -10,12 +10,13 @@
 # country overrides section. Each country can create its own copy of the entire top-level
 # structure and it will be recursively merged with the defaults.
-# Number
+default: &default
-# ======
+    # Number
-# Number, No., #, etc. can be used in both floor and apartment numbers,
+    # ======
-# so we'll define it separately
+    # Number, No., #, etc. can be used in both floor and apartment numbers,
    # so we'll define it separately
-numbers:
+    numbers:
        default: &number
            canonical: number # canonical word in libpostal dictionary
            abbreviated: "no" # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted)
@@ -36,11 +37,11 @@ numbers:
            numeric_probability: 0.4 # With this probability, use the standard numeric
            numeric_affix_probability: 0.6 # With this probability, use e.g. #3 instead of No. 3
-# And
+    # And
-# ===
+    # ===
-# The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
+    # The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
-and:
+    and:
        default: &and
            canonical: and
            abbreviated: "&"
@@ -50,26 +51,26 @@ and:
            sample_probability: 0.05
-# Floor/level
+    # Floor/level
-# ===========
+    # ===========
-# OSM doesn't usually concern itself with the address beyond the front door
+    # OSM doesn't usually concern itself with the address beyond the front door
-# yet many real-world addresses will have qualifying strings like "6th floor"
+    # yet many real-world addresses will have qualifying strings like "6th floor"
-# and we'd like the parser to handle those.
+    # and we'd like the parser to handle those.
-#
+    #
-# When we do get floor numbers in OSM addresses, it's usually in the form of the
+    # When we do get floor numbers in OSM addresses, it's usually in the form of the
-# addr:floor or level tag, where the value is typically an integer or a half-floor
+    # addr:floor or level tag, where the value is typically an integer or a half-floor
-# (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
+    # (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
-# addresses do have a building:levels tag. If we know there are 20 floors in the
+    # addresses do have a building:levels tag. If we know there are 20 floors in the
-# building, we can randomly sample numbers <= the # of floors and come up with plausible
+    # building, we can randomly sample numbers <= the # of floors and come up with plausible
-# sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
+    # sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
-#
+    #
-# We're not done yet, because the integer value by itself isn't what people use when
+    # We're not done yet, because the integer value by itself isn't what people use when
-# writing addresses. This part of the config helps us rewrite the raw integer floor
+    # writing addresses. This part of the config helps us rewrite the raw integer floor
-# numbers as the sort of natural language text used in addresses like "Fl #1". The config
+    # numbers as the sort of natural language text used in addresses like "Fl #1". The config
-# is designed to be cross-lingual, so we can use the same structure with different words
+    # is designed to be cross-lingual, so we can use the same structure with different words
-# and do this for addresses in pretty much any language.
+    # and do this for addresses in pretty much any language.
-levels:
+    levels:
        # Numbered floors
        floor: &floor
            canonical: floor
@@ -448,15 +449,15 @@ levels:
              probability: 0.25
-# Intersections
+    # Intersections
-# =============
+    # =============
-# For constructing intersections like 5th Avenue & Broadway
+    # For constructing intersections like 5th Avenue & Broadway
-# In OSM, a node that's part of two ways is an intersection.
+    # In OSM, a node that's part of two ways is an intersection.
-#
+    #
-# These simple rules make it possible to create training examples
+    # These simple rules make it possible to create training examples
-# like: 26th/road Street/road and/intersection 6th/road Avenue/road
+    # like: 26th/road Street/road and/intersection 6th/road Avenue/road
-cross_streets:
+    cross_streets:
        # 26th & 6th Avenue
        and: *and
        # 26th @ Broadway
@@ -487,20 +488,20 @@ cross_streets:
            sample: true
            parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)
-# PO Box addresses
+    # PO Box addresses
-# ================
+    # ================
-# For PO box addresses, there's almost no data in OSM, so we'll need to
+    # For PO box addresses, there's almost no data in OSM, so we'll need to
-# generate them somewhat randomly.
+    # generate them somewhat randomly.
-#
+    #
-# The strategy is: for every amenity=post_office, generate a number of PO box
+    # The strategy is: for every amenity=post_office, generate a number of PO box
-# addresses using random numbers (and some alpha-numerics so we capture patterns
+    # addresses using random numbers (and some alpha-numerics so we capture patterns
-# like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
+    # like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
-# exist, as long as they cover the patterns of digits we expect in real addresses.
+    # exist, as long as they cover the patterns of digits we expect in real addresses.
-# The parser cares more about how many digits a number has and the surrounding 
+    # The parser cares more about how many digits a number has and the surrounding 
-# words/phrases than the specific number i.e. numbers in the range 1000-9999
+    # words/phrases than the specific number i.e. numbers in the range 1000-9999
-# can simply be normalized to DDDD.
+    # can simply be normalized to DDDD.
-po_boxes:
+    po_boxes:
        po_box: &po_box
            canonical: post office box
            abbreviated: p.o. box
@@ -592,12 +593,12 @@ po_boxes:
            - before: house
              probability: 0.2
-# Categories
+    # Categories
-# ==========
+    # ==========
-# Use the operators "in" and "near" for building category queries
+    # Use the operators "in" and "near" for building category queries
-# such as "restaurants in Hackney, London"
+    # such as "restaurants in Hackney, London"
-categories:
+    categories:
        near:
            default:
                canonical: near
@@ -627,13 +628,13 @@ categories:
        near_me_probability: 0.1
        in_probability: 0.35
-# Directions
+    # Directions
-# ==========
+    # ==========
-# Unit types, stairways, etc. may have a direction associated
+    # Unit types, stairways, etc. may have a direction associated
-# with them whether it's right/left or a cardinal direction
+    # with them whether it's right/left or a cardinal direction
-# like "East Entrance".
+    # like "East Entrance".
-directions:
+    directions:
        right: &right
            canonical: right
            abbreviated: r
@@ -692,7 +693,7 @@ directions:
            - alternative: *rear
              probability: 0.05
-cardinal_directions:
+    cardinal_directions:
        east: &east
            canonical: east
            abbreviated: e
@@ -760,11 +761,11 @@ cardinal_directions:
            - alternative: *west
              probability: 0.25
-# Entrance
+    # Entrance
-# ========
+    # ========
-# For deriving strings like "North Entrance"
+    # For deriving strings like "North Entrance"
-entrances:
+    entrances:
        entrance: &entrance
            canonical: entrance
            abbreviated: ent
@@ -793,11 +794,11 @@ entrances:
                    - alternative:
                          canonical: freight
-# Staircase
+    # Staircase
-# =========
+    # =========
-# For deriving strings like "Staircase A" in apartment buildings
+    # For deriving strings like "Staircase A" in apartment buildings
-staircases:
+    staircases:
        stair: &stair
            canonical: stair
            sample: true
@@ -842,13 +843,13 @@ staircases:
                    - alternative: *front
-# Unit types
+    # Unit types
-# ==========
+    # ==========
-# Unit information is common in residential addresses, offices, business parks, etc.
+    # Unit information is common in residential addresses, offices, business parks, etc.
-# Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
+    # Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
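-# refer to the same kind of unit (e.g. Apartment, Apt, Suite, Unit).
+    # refer to the same kind of unit (e.g. Apartment, Apt, Suite, Unit).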
-units:
+    units:
        # Units are not part of the global address formats (and are not always standard)
        # This is a list of places in the address where the unit line might go
        order:
@@ -1175,6 +1176,7 @@ units:
 countries:
    # United States
    us:
        <<: *default
        levels:
            storey: &story
                canonical: story
@@ -1261,6 +1263,7 @@ countries:
    # Canada
    # Specifically Canadian English. If the address is in French it will use fr.yaml
    ca:
        <<: *default
        levels:
            # Note: Canadian English uses "storey" keeping with the British convention, so no need to change that
@@ -1288,6 +1291,7 @@ countries:
            combined_probability: 0.1
    # Australia
    au:
        <<: *default
        po_boxes: &australia_po_boxes
            alphanumeric:
                default: *po_box
@@ -1330,6 +1334,7 @@ countries:
    # New Zealand - same rules as Australia
    nz:
        <<: *default
        po_boxes: *australia_po_boxes
        units: *australia_unit_types
--- a/scripts/geodata/addresses/config.py
+++ b/scripts/geodata/addresses/config.py
@@ -7,9 +7,9 @@ import yaml
 from collections import Mapping
 from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
 from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
 from geodata.math.sampling import cdf, check_probability_distribution
 this_dir = os.path.realpath(os.path.dirname(__file__))
 ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
@@ -19,35 +19,6 @@ DICTIONARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                'resources', 'dictionaries')
 def recursive_merge(a, b):
     """Recursively merge mapping ``b`` into mapping ``a``, in place.

     For every key in ``b``:

     * if the value in ``b`` and the value already stored in ``a`` are both
       mappings, the two are merged recursively, so keys that only exist
       on the ``a`` side are preserved;
     * otherwise the value from ``b`` replaces (or creates) the entry in
       ``a``. This includes the case where ``a`` holds a non-mapping value
       (e.g. a scalar) and ``b`` overrides it with a mapping.

     Values taken from ``b`` are stored by reference, not copied; callers
     that need independent structures should deep-copy beforehand.

     :param a: mutable mapping that receives the merged result.
     :param b: mapping whose entries take precedence over ``a``.
     :return: ``a`` itself, after mutation.
     """
     # Plain ``.items()`` works on both Python 2 and 3, so no compatibility
     # shim is needed just to iterate.
     for key, value in b.items():
         existing = a.get(key)
         if isinstance(value, Mapping) and isinstance(existing, Mapping):
             # Only descend when both sides are mappings; calling ``.get``
             # on a scalar that happens to live in ``a`` would raise
             # AttributeError.
             a[key] = recursive_merge(existing, value)
         else:
             a[key] = value
     return a
 class DoesNotExist:
     """Sentinel type returned by lookups when a key path is absent.

     The class object itself is used as the marker (it is never
     instantiated in this module), which lets callers distinguish a
     missing value from a legitimately stored ``None``.
     """
 def nested_get(obj, keys):
     """Look up a value in nested mappings by following ``keys`` in order.

     :param obj: the outermost mapping-like object to search.
     :param keys: sequence of keys describing the path to the value.
     :return: ``obj`` itself when ``keys`` is empty, the value found at the
         end of the path, or the ``DoesNotExist`` sentinel when any step
         of the path is missing or is not mapping-like.
     """
     if not len(keys):
         return obj

     try:
         current = obj
         # Walk every key except the last; a missing intermediate key is
         # treated as an empty mapping so the final lookup reports absence.
         for name in keys[:-1]:
             current = current.get(name, {})
             if not hasattr(current, 'items'):
                 return DoesNotExist
         leaf = keys[-1]
         return current.get(leaf, DoesNotExist)
     except AttributeError:
         # Something along the path had no ``.get``; report it as missing.
         return DoesNotExist
 class AddressConfig(object):
    def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
        self.address_configs = {}
@@ -58,17 +29,14 @@ class AddressConfig(object):
                continue
            config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))
            default = config['default']
            countries = config.pop('countries', {})
-            for k in countries.keys():
+            if countries:
-                country_config = countries[k]
+                default['countries'] = countries
                config_copy = copy.deepcopy(config)
                countries[k] = recursive_merge(config_copy, country_config)
            config['countries'] = countries
            lang = filename.strip('.yaml')
-            self.address_configs[lang] = config
+            self.address_configs[lang] = default
        self.sample_phrases = {}
@@ -87,6 +55,7 @@ class AddressConfig(object):
            if country_config:
                config = country_config
        value = nested_get(config, keys)
        if value is not DoesNotExist:
            return value