[fix] Removing YAML inheritance as it doesn't merge nested dictionaries

2016-04-27 15:10:08 -04:00
parent 169f1db3bd
commit dff4a5e76e
2 changed files with 1147 additions and 1149 deletions
--- a/resources/addresses/en.yaml
+++ b/resources/addresses/en.yaml
@@ -10,13 +10,12 @@
 # country overrides section. Each country can create its own copy of the entire top-level
 # structure and it will be recursively merged with the defaults.

-default: &default
-    # Number
-    # ======
-    # Number, No., #, etc. can be used in both floor and apartment numbers,
-    # so we'll define it separately
+# Number
+# ======
+# Number, No., #, etc. can be used in both floor and apartment numbers,
+# so we'll define it separately

-    numbers:
+numbers:
    default: &number
        canonical: number # canonical word in libpostal dictionary
        abbreviated: "no" # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted)
@@ -37,11 +36,11 @@ default: &default
        numeric_probability: 0.4 # With this probability, use the standard numeric
        numeric_affix_probability: 0.6 # With this probability, use e.g. #3 instead of No. 3

-    # And
-    # ===
-    # The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
+# And
+# ===
+# The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.

-    and:
+and:
    default: &and
        canonical: and
        abbreviated: "&"
@@ -51,26 +50,26 @@ default: &default
        sample_probability: 0.05


-    # Floor/level
-    # ===========
-    # OSM doesn't usually concern itself with the address beyond the front door
-    # yet many real-world addresses will have qualifying strings like "6th floor"
-    # and we'd like the parser to handle those.
-    #
-    # When we do get floor numbers in OSM addresses, it's usually in the form of the
-    # addr:floor or level tag, where the value is typically an integer or a half-floor
-    # (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
-    # addresses do have a building:levels tag. If we know there are 20 floors in the
-    # building, we can randomly sample numbers <= the # of floors and come up with plausible
-    # sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
-    #
-    # We're not done yet, because the integer value by itself isn't what people use when
-    # writing addresses. This part of the config helps us rewrite the raw integer floor
-    # numers as the sort of natural language text used in addresses like "Fl #1". The config
-    # is designed to be cross-lingual, so we can use the same structure with different words
-    # and do this for addresses in pretty much any language.
+# Floor/level
+# ===========
+# OSM doesn't usually concern itself with the address beyond the front door
+# yet many real-world addresses will have qualifying strings like "6th floor"
+# and we'd like the parser to handle those.
+#
+# When we do get floor numbers in OSM addresses, it's usually in the form of the
+# addr:floor or level tag, where the value is typically an integer or a half-floor
+# (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
+# addresses do have a building:levels tag. If we know there are 20 floors in the
+# building, we can randomly sample numbers <= the # of floors and come up with plausible
+# sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
+#
+# We're not done yet, because the integer value by itself isn't what people use when
+# writing addresses. This part of the config helps us rewrite the raw integer floor
+# numbers as the sort of natural language text used in addresses like "Fl #1". The config
+# is designed to be cross-lingual, so we can use the same structure with different words
+# and do this for addresses in pretty much any language.

-    levels:
+levels:
    # Numbered floors
    floor: &floor
        canonical: floor
@@ -449,15 +448,15 @@ default: &default
          probability: 0.25


-    # Intersections
-    # =============
-    # For constructing intersections like 5th Avenue & Broadway
-    # In OSM, a node that's part of two ways is an intersection.
-    #
-    # These simple rules make it possible to create training examples
-    # like: 26th/road Street/road and/intersection 6th/road Avenue/road
+# Intersections
+# =============
+# For constructing intersections like 5th Avenue & Broadway
+# In OSM, a node that's part of two ways is an intersection.
+#
+# These simple rules make it possible to create training examples
+# like: 26th/road Street/road and/intersection 6th/road Avenue/road

-    cross_streets:
+cross_streets:
    # 26th & 6th Avenue
    and: *and
    # 26th @ Broadway
@@ -488,20 +487,20 @@ default: &default
        sample: true
        parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)

-    # PO Box addresses
-    # ================
-    # For PO box addresses, there's almost no data in OSM, so we'll need to
-    # generate them somewhat randomly.
-    #
-    # The strategy is: for every amenity=post_office, generate a number of PO box
-    # addresses using random numbers (and some alpha-numerics so we capture patterns
-    # like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
-    # exist, as long as they cover the patterns of digits we expect in real addresses.
-    # The parser cares more about how many digits a number has and the surrounding 
-    # words/phrases than the specific number i.e. numbers in the range 1000-9999
-    # can simply be normalized to DDDD.
+# PO Box addresses
+# ================
+# For PO box addresses, there's almost no data in OSM, so we'll need to
+# generate them somewhat randomly.
+#
+# The strategy is: for every amenity=post_office, generate a number of PO box
+# addresses using random numbers (and some alpha-numerics so we capture patterns
+# like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
+# exist, as long as they cover the patterns of digits we expect in real addresses.
+# The parser cares more about how many digits a number has and the surrounding 
+# words/phrases than the specific number i.e. numbers in the range 1000-9999
+# can simply be normalized to DDDD.

-    po_boxes:
+po_boxes:
    po_box: &po_box
        canonical: post office box
        abbreviated: p.o. box
@@ -593,12 +592,12 @@ default: &default
        - before: house
          probability: 0.2

-    # Categories
-    # ==========
-    # Use the operators "in" and "near" for building category queries
-    # such as "restaurants in Hackney, London"
+# Categories
+# ==========
+# Use the operators "in" and "near" for building category queries
+# such as "restaurants in Hackney, London"

-    categories:
+categories:
    near:
        default:
            canonical: near
@@ -628,13 +627,13 @@ default: &default
    near_me_probability: 0.1
    in_probability: 0.35

-    # Directions
-    # ==========
-    # Unit types, stairways, etc. may have a direction associated
-    # with them whether it's right/left or a cardinal direction
-    # like "East Entrance".
+# Directions
+# ==========
+# Unit types, stairways, etc. may have a direction associated
+# with them whether it's right/left or a cardinal direction
+# like "East Entrance".

-    directions:
+directions:
    right: &right
        canonical: right
        abbreviated: r
@@ -693,7 +692,7 @@ default: &default
        - alternative: *rear
          probability: 0.05

-    cardinal_directions:
+cardinal_directions:
    east: &east
        canonical: east
        abbreviated: e
@@ -761,11 +760,11 @@ default: &default
        - alternative: *west
          probability: 0.25

-    # Entrance
-    # ========
-    # For deriving strings like "North Entrance"
+# Entrance
+# ========
+# For deriving strings like "North Entrance"

-    entrances:
+entrances:
    entrance: &entrance
        canonical: entrance
        abbreviated: ent
@@ -794,11 +793,11 @@ default: &default
                - alternative:
                      canonical: freight

-    # Staircase
-    # =========
-    # For deriving strings like "Staircase A" in apartment buildings
+# Staircase
+# =========
+# For deriving strings like "Staircase A" in apartment buildings

-    staircases:
+staircases:
    stair: &stair
        canonical: stair
        sample: true
@@ -843,13 +842,13 @@ default: &default
                - alternative: *front


-    # Unit types
-    # ==========
-    # Unit information is common in residential addresses, offices, business parks, etc.
-    # Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
-    # refer to the 
+# Unit types
+# ==========
+# Unit information is common in residential addresses, offices, business parks, etc.
+# Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
+# refer to the unit.

-    units:
+units:
    # Units are not part of the global address formats (and are not always standard)
    # This is a list of places in the address where the unit line might go
    order:
@@ -1176,7 +1175,6 @@ default: &default
 countries:
    # United States
    us:
-        <<: *default
        levels:
            storey: &story
                canonical: story
@@ -1263,7 +1261,6 @@ countries:
    # Canada
    # Specifically Canadian English. If the address is in French it will use fr.yaml
    ca:
-        <<: *default
        levels:
            # Note: Canadian English uses "storey" keeping with the British convention, so no need to change that

@@ -1291,7 +1288,6 @@ countries:
            combined_probability: 0.1
    # Australia
    au:
-        <<: *default
        po_boxes: &australia_po_boxes
            alphanumeric:
                default: *po_box
@@ -1334,7 +1330,6 @@ countries:

    # New Zealand - same rules as Australia
    nz:
-        <<: *default
        po_boxes: *australia_po_boxes
        units: *australia_unit_types

--- a/scripts/geodata/addresses/config.py
+++ b/scripts/geodata/addresses/config.py
@@ -10,6 +10,7 @@ from geodata.address_expansions.address_dictionaries import address_phrase_dicti
 from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
 from geodata.math.sampling import cdf, check_probability_distribution

+
 this_dir = os.path.realpath(os.path.dirname(__file__))

 ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
@@ -29,14 +30,17 @@ class AddressConfig(object):
                continue

            config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))
-            default = config['default']
            countries = config.pop('countries', {})

-            if countries:
-                default['countries'] = countries
+            for k in countries.keys():
+                country_config = countries[k]
+                config_copy = copy.deepcopy(config)
+                countries[k] = recursive_merge(config_copy, country_config)
+
+            config['countries'] = countries

            lang = filename.strip('.yaml')
-            self.address_configs[lang] = default
+            self.address_configs[lang] = config

        self.sample_phrases = {}

@@ -55,7 +59,6 @@ class AddressConfig(object):
            if country_config:
                config = country_config

-
        value = nested_get(config, keys)
        if value is not DoesNotExist:
            return value