From dc73465bba0cb98943fc5440a1b3354b682423c6 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Tue, 26 Apr 2016 18:29:05 -0400
Subject: [PATCH] [addresses] Using YAML inheritance instead of baking it into
 the config parser

---
 resources/addresses/en.yaml         | 2267 ++++++++++++++-------------
 scripts/geodata/addresses/config.py |   43 +-
 2 files changed, 1142 insertions(+), 1168 deletions(-)

diff --git a/resources/addresses/en.yaml b/resources/addresses/en.yaml
index 6de4002f..d4954ca5 100644
--- a/resources/addresses/en.yaml
+++ b/resources/addresses/en.yaml
@@ -10,1163 +10,1164 @@
 # country overrides section. Each country can create its own copy of the entire top-level
 # structure and it will be recursively merged with the defaults.
 
-# Number
-# ======
-# Number, No., #, etc. can be used in both floor and apartment numbers,
-# so we'll define it separately
+default: &default
+    # Number
+    # ======
+    # Number, No., #, etc. can be used in both floor and apartment numbers,
+    # so we'll define it separately
 
-numbers:
-    default: &number
-        canonical: number # canonical word in libpostal dictionary
-        abbreviated: "no" # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted)
-        sample: true # Randomly sample other variations (e.g. num, nr)
-        # Probabilities
-        canonical_probability: 0.3 # With this probability, use the canonical
-        abbreviated_probability: 0.5 # With this probability, use the abbreviated form
-        sample_probability: 0.2 # With  this probability, sample other variations
-        sample_exclude:
-            - "#" # Used in numeric affix. Needs to be quoted, otherwise it's a comment
-        numeric:
-            direction: left
-        numeric_affix:
-            affix: "#" # e.g. #3, #2F, etc.
-            direction: left # affix goes on the number's left
+    numbers:
+        default: &number
+            canonical: number # canonical word in libpostal dictionary
+            abbreviated: "no" # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted)
+            sample: true # Randomly sample other variations (e.g. num, nr)
+            # Probabilities
+            canonical_probability: 0.3 # With this probability, use the canonical
+            abbreviated_probability: 0.5 # With this probability, use the abbreviated form
+            sample_probability: 0.2 # With this probability, sample other variations
+            sample_exclude:
+                - "#" # Used in numeric affix. Needs to be quoted, otherwise it's a comment
+            numeric:
+                direction: left
+            numeric_affix:
+                affix: "#" # e.g. #3, #2F, etc.
+                direction: left # affix goes on the number's left
 
-        # Probabilities for numbers
-        numeric_probability: 0.4 # With this probability, use the standard numeric
-        numeric_affix_probability: 0.6 # With this probability, use e.g. #3 instead of No. 3
+            # Probabilities for numbers
+            numeric_probability: 0.4 # With this probability, use the standard numeric
+            numeric_affix_probability: 0.6 # With this probability, use e.g. #3 instead of No. 3
 
-# And
-# ===
-# The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
+    # And
+    # ===
+    # The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
 
-and:
-    default: &and
-        canonical: and
-        abbreviated: "&"
-        canonical_probability: 0.2
-        abbreviated_probability: 0.75
-        sample: true
-        sample_probability: 0.05
+    and:
+        default: &and
+            canonical: and
+            abbreviated: "&"
+            canonical_probability: 0.2
+            abbreviated_probability: 0.75
+            sample: true
+            sample_probability: 0.05
 
 
-# Floor/level
-# ===========
-# OSM doesn't usually concern itself with the address beyond the front door
-# yet many real-world addresses will have qualifying strings like "6th floor"
-# and we'd like the parser to handle those.
-#
-# When we do get floor numbers in OSM addresses, it's usually in the form of the
-# addr:floor or level tag, where the value is typically an integer or a half-floor
-# (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
-# addresses do have a building:levels tag. If we know there are 20 floors in the
-# building, we can randomly sample numbers <= the # of floors and come up with plausible
-# sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
-#
-# We're not done yet, because the integer value by itself isn't what people use when
-# writing addresses. This part of the config helps us rewrite the raw integer floor
-# numers as the sort of natural language text used in addresses like "Fl #1". The config
-# is designed to be cross-lingual, so we can use the same structure with different words
-# and do this for addresses in pretty much any language.
+    # Floor/level
+    # ===========
+    # OSM doesn't usually concern itself with the address beyond the front door
+    # yet many real-world addresses will have qualifying strings like "6th floor"
+    # and we'd like the parser to handle those.
+    #
+    # When we do get floor numbers in OSM addresses, it's usually in the form of the
+    # addr:floor or level tag, where the value is typically an integer or a half-floor
+    # (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
+    # addresses do have a building:levels tag. If we know there are 20 floors in the
+    # building, we can randomly sample numbers <= the # of floors and come up with plausible
+    # sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
+    #
+    # We're not done yet, because the integer value by itself isn't what people use when
+    # writing addresses. This part of the config helps us rewrite the raw integer floor
+    # numbers as the sort of natural language text used in addresses like "Fl #1". The config
+    # is designed to be cross-lingual, so we can use the same structure with different words
+    # and do this for addresses in pretty much any language.
 
-levels:
-    # Numbered floors
-    floor: &floor
-        canonical: floor
-        plural: floors
-        abbreviated: fl
-        canonical_probability: 0.5 # With this probability, use canonical version
-        abbreviated_probability: 0.4 # With this probability, use abbreviated version
-        sample_probability: 0.1 # With this probability, sample from the other forms
-        sample_exclude:
-            - / f # Exclude this abbreviation since it's used as an affix
-        sample: true
-        # e.g. Floor 1
-        numeric:
-            direction: left # Floor/Fl goes to the left of the number
-            direction_probability: 0.8 # With 1 - this probability, Floor/Fl goes on the other side of the number
-            add_number_phrase: true # Occasionally add variation of "number", e.g. Floor No. 1
-            add_number_phrase_probability: 0.4 # With this probability, use Floor No. 1 or Floor #1 vs. Floor 1
-        # e.g. 2/F, 3/F
-        numeric_affix:
-            affix: /f
-            direction: right # affix goes to number's right (always)
-         # e.g. 1st Floor
-        ordinal:
-            direction: right # canonical or abbreviated form goes to the ordinal's right
-        # Probabilities
-        numeric_probability: 0.75 # Use the simple number e.g. Floor 1 (or Floor No. 1)
-        numeric_affix_probability: 0.05 # Use the 2/F (less common)
-        ordinal_probability: 0.2 # Use the ordinal e.g. 1st Floor
-    # The word "level" is also occasionally used
-    level: &level
-        canonical: level
-        plural: levels
-        abbreviated: lvl
-        sample: true
-        canonical_probability: 0.5
-        abbreviated_probability: 0.3
-        sample_probability: 0.2
-        sample_exclude:
-            - / l # Exclude this abbreviation since it's used as an affix
-        numeric:
-            direction: left # Level/Lvl goes to the left of the number
-            direction_probability: 0.8 # With 1 - this probability, Level/Lvl goes on the other side of the number
-            add_number_phrase: true # Occasionally add variation of "number", e.g. Level No. 1
-            add_number_phrase_probability: 0.4 # With this probability, use Level No. 1 or Level #1 vs. Level 1
-        # e.g. 2/L, 3/L (ambiguous with left)
-        numeric_affix:
-            affix: /l
-            direction: right
-        ordinal:
-            direction: right
-        numeric_probability: 0.4
-        numeric_affix_probability: 0.05
-        ordinal_probability: 0.55
-    platform: &platform
-        canonical: platform
-        plural: platforms
-        abbreviated: pf
-        canonical_probability: 0.7
-        abbreviated_probability: 0.3
-        numeric:
-            direction: left
-        ordinal:
-            direction: right
-        numeric_probability: 0.5 # e.g. Platform 1
-        ordinal_probability: 0.5 # e.g. 1st Platform
-    storey: &storey
-        canonical: storey
-        plural: storeys
-        numeric:
-            direction: left
-        ordinal:
-            direction: right
-        numeric_probability: 0.025 # e.g. Storey 2, less common
-        ordinal_probability: 0.975 # e.g. 2nd Storey, more common
-    # Special instructions for ground floor
-    ground_floor: &ground_floor
-        canonical: ground floor
-        abbreviated: g/f
-        canonical_probability: 0.4
-        abbreviated_probability: 0.4
-        sample_probability: 0.2
-        sample: true
-    ground: &ground
-        canonical: ground
-        abbreviated: g
-        sample: true
-        canonical_probability: 0.6
-        abbreviated_probability: 0.1
-        sample_probability: 0.3
-    ground_level: &ground_level
-        canonical: ground level
-        abbreviated: g/l
-        sample: true
-        canonical_probability: 0.4
-        abbreviated_probability: 0.2
-        sample_probability: 0.4
-    # Special instructions for lower ground floor (added randomly, not an alias for a floor number)
-    lower_ground_floor: &lower_ground_floor
-        canonical: lower ground floor
-        abbreviated: lg
-        sample: true
-        # Probabilities
-        canonical_probability: 0.6
-        abbreviated_probability: 0.3
-        sample_probability: 0.1
-    # Special instructions for upper ground floor (added randomly, not an alias for a floor number)
-    upper_ground_floor: &upper_ground_floor
-        canonical: upper ground floor
-        abbreviated: ug
-        sample: true
-        # Probabilities
-        canonical_probability: 0.6
-        abbreviated_probability: 0.2
-        sample_probability: 0.2
-    upper: &upper
-        canonical: upper
-        abbreviated: uppr
-        sample: true
-        canonical_probability: 0.8
-        abbreviated_probability: 0.1
-        sample_probability: 0.1
-    lower_level: &lower_level
-        canonical: lower level
-        abbreviated: lwr lvl
-        sample: true
-        canonical_probability: 0.7
-        abbreviated_probability: 0.1
-        sample_probability: 0.2
-    lobby: &lobby
-        canonical: lobby
-    upstairs: &upstairs
-        canonical: upstairs
-    downstairs: &downstairs
-        canonical: downstairs
-    # Special instructions for podium level (added randomly)
-    podium_level: &podium_level
-        canonical: podium level
-        abbreviated: pd lvl
-        sample: true
-        canonical_probability: 0.6
-        abbreviated_probability: 0.2
-        sample_probability: 0.2
-    podium: &podium
-        canonical: podium
-        abbreviated: pd
-        sample: true
-        canonical_probability: 0.6
-        abbreviated_probability: 0.2
-        sample_probability: 0.2
-    # Used when floor number is < 0 (starts at -1 in all countries)
-    basement: &basement
-        canonical: basement
-        abbreviated: bsmt
-        sample: true
-        # e.g. Basement 1
-        numeric:
-            direction: left
-        # e.g. B1
-        numeric_affix:
-            affix: b
-            direction: left
-        # e.g. 2nd Basement
-        ordinal:
-            direction: right
-        standalone_probability: 0.985
-        number_abs_value: true
-        number_min_abs_value: 1
-        numeric_probability: 0.005
-        numeric_affix_probability: 0.005
-        ordinal_probability: 0.005
-    cellar: &cellar
-        canonical: cellar
-        sample: true
-        canonical_probability: 0.8
-        sample_probability: 0.2
-    # Floor number of <= -2 can be basement 2, sub-basement, sub-basement 1, etc.
-    sub_basement: &sub_basement
-        canonical: sub basement
-        abbreviated: sb
-        sample: true
-        # e.g. Sub-basement 1
-        numeric:
-            direction: left
-        # e.g. SB1
-        numeric_affix:
-            affix: sb
-            direction: left
-        # e.g. 2nd Sub-basement
-        ordinal:
-            direction: right
-        number_abs_value: true
-        number_min_abs_value: 2
-        # Basement 2 == Sub-basement 1
-        number_subtract_abs_value: 1
-        standalone_probability: 0.985
-        numeric_probability: 0.005
-        numeric_affix_probability: 0.005
-        ordinal_probability: 0.005
-    top_floor: &top_floor
-        canonical: top floor
-        abbreviated: tf
-        sample: true
-        canonical_probability: 0.6
-        abbreviated_probability: 0.3
-        sample_probability: 0.1
-    # Mezzanine level (floor number {0.5, 1.5, ...}, also be added at random)
-    mezzanine: &mezzanine
-        canonical: mezzanine
-        abbreviated: mezz
-        sample: true
-        canonical_probability: 0.8
-        abbreviated_probability: 0.1
-        sample_probability: 0.1
-        # Mezzanine/Mezz 2 or Mezzanine/Mezz A
-        numeric:
-            direction: left
-        # M2
-        numeric_affix:
-            affix: m
-            direction: left
-        # 2nd Mezzanine
-        ordinal:
-            direction: right
-        # Floor 0.5 is just plain mezzanine, no number
-        number_abs_value: true
-        number_min_abs_value: 1
-        standalone_probability: 0.5
-        numeric_probability: 0.1
-        numeric_affix_probability: 0.1
-        ordinal_probability: 0.3
-    mezzanine_floor: &mezzanine_floor
-        canonical: mezzanine floor
-        abbreviated: mezz floor
-        sample: true
-        canonical_probability: 0.7
-        abbreviated_probability: 0.2
-        sample_probability: 0.1
-    mezzanine_level: &mezzanine_level
-        canonical: mezzanine level
-        abbreviated: mezz level
-        sample: true
-        canonical_probability: 0.7
-        abbreviated_probability: 0.2
-        sample_probability: 0.1
-    lower_mezzanine: &lower_mezzanine
-        canonical: lower mezzanine
-        abbreviated: lower mezz
-        sample: true
-        canonical_probability: 0.7
-        abbreviated_probability: 0.2
-        sample_probability: 0.1
-    upper_mezzanine: &upper_mezzanine
-        canonical: upper mezzanine
-        abbreviated: upper mezz
-        sample: true
-        canonical_probability: 0.7
-        abbreviated_probability: 0.2
-        sample_probability: 0.1
-        # Should be at least level 1.5
-        number_min_abs_value: 1
-    aliases:
-        "<-1":
-            default: *basement
+    levels:
+        # Numbered floors
+        floor: &floor
+            canonical: floor
+            plural: floors
+            abbreviated: fl
+            canonical_probability: 0.5 # With this probability, use canonical version
+            abbreviated_probability: 0.4 # With this probability, use abbreviated version
+            sample_probability: 0.1 # With this probability, sample from the other forms
+            sample_exclude:
+                - / f # Exclude this abbreviation since it's used as an affix
+            sample: true
+            # e.g. Floor 1
+            numeric:
+                direction: left # Floor/Fl goes to the left of the number
+                direction_probability: 0.8 # With 1 - this probability, Floor/Fl goes on the other side of the number
+                add_number_phrase: true # Occasionally add variation of "number", e.g. Floor No. 1
+                add_number_phrase_probability: 0.4 # With this probability, use Floor No. 1 or Floor #1 vs. Floor 1
+            # e.g. 2/F, 3/F
+            numeric_affix:
+                affix: /f
+                direction: right # affix goes to number's right (always)
+            # e.g. 1st Floor
+            ordinal:
+                direction: right # canonical or abbreviated form goes to the ordinal's right
+            # Probabilities
+            numeric_probability: 0.75 # Use the simple number e.g. Floor 1 (or Floor No. 1)
+            numeric_affix_probability: 0.05 # Use the 2/F (less common)
+            ordinal_probability: 0.2 # Use the ordinal e.g. 1st Floor
+        # The word "level" is also occasionally used
+        level: &level
+            canonical: level
+            plural: levels
+            abbreviated: lvl
+            sample: true
+            canonical_probability: 0.5
+            abbreviated_probability: 0.3
+            sample_probability: 0.2
+            sample_exclude:
+                - / l # Exclude this abbreviation since it's used as an affix
+            numeric:
+                direction: left # Level/Lvl goes to the left of the number
+                direction_probability: 0.8 # With 1 - this probability, Level/Lvl goes on the other side of the number
+                add_number_phrase: true # Occasionally add variation of "number", e.g. Level No. 1
+                add_number_phrase_probability: 0.4 # With this probability, use Level No. 1 or Level #1 vs. Level 1
+            # e.g. 2/L, 3/L (ambiguous with left)
+            numeric_affix:
+                affix: /l
+                direction: right
+            ordinal:
+                direction: right
+            numeric_probability: 0.4
+            numeric_affix_probability: 0.05
+            ordinal_probability: 0.55
+        platform: &platform
+            canonical: platform
+            plural: platforms
+            abbreviated: pf
+            canonical_probability: 0.7
+            abbreviated_probability: 0.3
+            numeric:
+                direction: left
+            ordinal:
+                direction: right
+            numeric_probability: 0.5 # e.g. Platform 1
+            ordinal_probability: 0.5 # e.g. 1st Platform
+        storey: &storey
+            canonical: storey
+            plural: storeys
+            numeric:
+                direction: left
+            ordinal:
+                direction: right
+            numeric_probability: 0.025 # e.g. Storey 2, less common
+            ordinal_probability: 0.975 # e.g. 2nd Storey, more common
+        # Special instructions for ground floor
+        ground_floor: &ground_floor
+            canonical: ground floor
+            abbreviated: g/f
+            canonical_probability: 0.4
+            abbreviated_probability: 0.4
+            sample_probability: 0.2
+            sample: true
+        ground: &ground
+            canonical: ground
+            abbreviated: g
+            sample: true
+            canonical_probability: 0.6
+            abbreviated_probability: 0.1
+            sample_probability: 0.3
+        ground_level: &ground_level
+            canonical: ground level
+            abbreviated: g/l
+            sample: true
+            canonical_probability: 0.4
+            abbreviated_probability: 0.2
+            sample_probability: 0.4
+        # Special instructions for lower ground floor (added randomly, not an alias for a floor number)
+        lower_ground_floor: &lower_ground_floor
+            canonical: lower ground floor
+            abbreviated: lg
+            sample: true
+            # Probabilities
+            canonical_probability: 0.6
+            abbreviated_probability: 0.3
+            sample_probability: 0.1
+        # Special instructions for upper ground floor (added randomly, not an alias for a floor number)
+        upper_ground_floor: &upper_ground_floor
+            canonical: upper ground floor
+            abbreviated: ug
+            sample: true
+            # Probabilities
+            canonical_probability: 0.6
+            abbreviated_probability: 0.2
+            sample_probability: 0.2
+        upper: &upper
+            canonical: upper
+            abbreviated: uppr
+            sample: true
+            canonical_probability: 0.8
+            abbreviated_probability: 0.1
+            sample_probability: 0.1
+        lower_level: &lower_level
+            canonical: lower level
+            abbreviated: lwr lvl
+            sample: true
+            canonical_probability: 0.7
+            abbreviated_probability: 0.1
+            sample_probability: 0.2
+        lobby: &lobby
+            canonical: lobby
+        upstairs: &upstairs
+            canonical: upstairs
+        downstairs: &downstairs
+            canonical: downstairs
+        # Special instructions for podium level (added randomly)
+        podium_level: &podium_level
+            canonical: podium level
+            abbreviated: pd lvl
+            sample: true
+            canonical_probability: 0.6
+            abbreviated_probability: 0.2
+            sample_probability: 0.2
+        podium: &podium
+            canonical: podium
+            abbreviated: pd
+            sample: true
+            canonical_probability: 0.6
+            abbreviated_probability: 0.2
+            sample_probability: 0.2
+        # Used when floor number is < 0 (starts at -1 in all countries)
+        basement: &basement
+            canonical: basement
+            abbreviated: bsmt
+            sample: true
+            # e.g. Basement 1
+            numeric:
+                direction: left
+            # e.g. B1
+            numeric_affix:
+                affix: b
+                direction: left
+            # e.g. 2nd Basement
+            ordinal:
+                direction: right
+            standalone_probability: 0.985
+            number_abs_value: true
+            number_min_abs_value: 1
+            numeric_probability: 0.005
+            numeric_affix_probability: 0.005
+            ordinal_probability: 0.005
+        cellar: &cellar
+            canonical: cellar
+            sample: true
+            canonical_probability: 0.8
+            sample_probability: 0.2
+        # Floor number of <= -2 can be basement 2, sub-basement, sub-basement 1, etc.
+        sub_basement: &sub_basement
+            canonical: sub basement
+            abbreviated: sb
+            sample: true
+            # e.g. Sub-basement 1
+            numeric:
+                direction: left
+            # e.g. SB1
+            numeric_affix:
+                affix: sb
+                direction: left
+            # e.g. 2nd Sub-basement
+            ordinal:
+                direction: right
+            number_abs_value: true
+            number_min_abs_value: 2
+            # Basement 2 == Sub-basement 1
+            number_subtract_abs_value: 1
+            standalone_probability: 0.985
+            numeric_probability: 0.005
+            numeric_affix_probability: 0.005
+            ordinal_probability: 0.005
+        top_floor: &top_floor
+            canonical: top floor
+            abbreviated: tf
+            sample: true
+            canonical_probability: 0.6
+            abbreviated_probability: 0.3
+            sample_probability: 0.1
+        # Mezzanine level (floor number {0.5, 1.5, ...}; can also be added at random)
+        mezzanine: &mezzanine
+            canonical: mezzanine
+            abbreviated: mezz
+            sample: true
+            canonical_probability: 0.8
+            abbreviated_probability: 0.1
+            sample_probability: 0.1
+            # Mezzanine/Mezz 2 or Mezzanine/Mezz A
+            numeric:
+                direction: left
+            # M2
+            numeric_affix:
+                affix: m
+                direction: left
+            # 2nd Mezzanine
+            ordinal:
+                direction: right
+            # Floor 0.5 is just plain mezzanine, no number
+            number_abs_value: true
+            number_min_abs_value: 1
+            standalone_probability: 0.5
+            numeric_probability: 0.1
+            numeric_affix_probability: 0.1
+            ordinal_probability: 0.3
+        mezzanine_floor: &mezzanine_floor
+            canonical: mezzanine floor
+            abbreviated: mezz floor
+            sample: true
+            canonical_probability: 0.7
+            abbreviated_probability: 0.2
+            sample_probability: 0.1
+        mezzanine_level: &mezzanine_level
+            canonical: mezzanine level
+            abbreviated: mezz level
+            sample: true
+            canonical_probability: 0.7
+            abbreviated_probability: 0.2
+            sample_probability: 0.1
+        lower_mezzanine: &lower_mezzanine
+            canonical: lower mezzanine
+            abbreviated: lower mezz
+            sample: true
+            canonical_probability: 0.7
+            abbreviated_probability: 0.2
+            sample_probability: 0.1
+        upper_mezzanine: &upper_mezzanine
+            canonical: upper mezzanine
+            abbreviated: upper mezz
+            sample: true
+            canonical_probability: 0.7
+            abbreviated_probability: 0.2
+            sample_probability: 0.1
+            # Should be at least level 1.5
+            number_min_abs_value: 1
+        aliases:
+            "<-1":
+                default: *basement
+                probability: 0.6
+                alternatives:
+                    - alternative: *sub_basement
+                      probability: 0.3995
+                    - alternative: *floor
+                      probability: 0.0005
+            "-1":
+                default: *basement
+                probability: 0.7
+                alternatives:
+                    - alternative: *cellar
+                      probability: 0.1
+                    - alternative: *lower_ground_floor
+                      probability: 0.1
+                    - alternative: *downstairs
+                      probability: 0.0495
+                    - alternative: *lower_level
+                      probability: 0.05
+                    - alternative: *floor
+                      probability: 0.0005
+            # Special token for half-floors
+            half_floors:
+                default: *mezzanine
+                probability: 0.8
+                alternatives:
+                    - alternative: *mezzanine_floor
+                      probability: 0.1
+                    - alternative: *mezzanine_level
+                      probability: 0.1
+                aliases:
+                    "1":
+                        default: *upper_mezzanine
+                        probability: 0.5
+                        alternatives:
+                            - alternative: *mezzanine
+                              probability: 0.5
+            half_floors_negative:
+                default: *lower_mezzanine
+            "0":
+                default: *ground_floor
+                probability: 0.9
+                alternatives:
+                    - alternative: *ground
+                      probability: 0.02
+                    - alternative: *ground_level
+                      probability: 0.01
+                    - alternative: *lower_ground_floor
+                      probability: 0.025
+                    - alternative: *upper_ground_floor
+                      probability: 0.025
+                    - alternative: *lobby
+                      probability: 0.005
+                    - alternative: *floor
+                      # Floor 0 is uncommon
+                      probability: 0.01
+                    - alternative: *level
+                      probability: 0.005
+            "1":
+                # Most of the time just say 1st Floor
+                default: *floor
+                probability: 0.9
+                alternatives:
+                    - alternative: *upper_ground_floor
+                      probability: 0.075
+                    - alternative: *podium_level
+                      probability: 0.01
+                    - alternative: *podium
+                      probability: 0.005
+                    - alternative: *upstairs
+                      probability: 0.01
+            top:
+                default: *floor
+                probability: 0.85
+                alternatives:
+                    - alternative: *level
+                      probability: 0.1
+                    - alternative: *top_floor
+                      probability: 0.05
+
+        # Integer for whether floors start at 0 or 1
+        numbering_starts_at: 0
+
+        # Associated phrases for alphanumeric floors (Floor 1, Floor A)
+        alphanumeric:
+            default: *floor
+            probability: 0.8
+            add_number_phrase: true
+            add_number_phrase_probability: 0.3
+            alternatives:
+                - alternative: *level
+                  probability: 0.15
+                - alternative: *platform
+                  probability: 0.025
+                - alternative: *storey
+                  probability: 0.025
+            numeric_probability: 0.99 # With this probability, pick an integer
+            alpha_probability: 0.0098 # With this probability, pick a letter e.g. Floor A
+            numeric_plus_alpha_probability: 0.0001 # e.g. Floor 2A
+            alpha_plus_numeric_probability: 0.0001 # e.g. Floor A2
+
+
+        # Floors are not part of the global address formats (and are not always standard)
+        # This is a list of places in the address where the floor number might go
+        order:
+            # e.g. 123 East 45th St, 6th Floor, NYC
+            - after: road
+              probability: 0.5
+            # e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London
+            - before: house
+              probability: 0.25
+            # e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London
+            - before: road
+              probability: 0.25
+
+
+    # Intersections
+    # =============
+    # For constructing intersections like 5th Avenue & Broadway
+    # In OSM, a node that's part of two ways is an intersection.
+    #
+    # These simple rules make it possible to create training examples
+    # like: 26th/road Street/road and/intersection 6th/road Avenue/road
+
+    cross_streets:
+        # 26th & 6th Avenue
+        and: *and
+        # 26th @ Broadway
+        at: &at
+            canonical: at
+            abbreviated: "@"
+            canonical_probability: 0.7
+            abbreviated_probability: 0.3
+            sample: true
+        corner_of: &corner_of
+            canonical: corner of
+
+        intersection:
+            default: *and
+            probability: 0.7
+            alternatives:
+                - alternative: *at
+                  probability: 0.15
+                - alternative: *corner_of
+                  probability: 0.15
+
+        # 26th betw 5th Ave and 6th Ave
+        between:
+            canonical: between
+            abbreviated: betw
+            canonical_probability: 0.5
+            abbreviated_probability: 0.5
+            sample: true
+            parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)
+
+    # PO Box addresses
+    # ================
+    # For PO box addresses, there's almost no data in OSM, so we'll need to
+    # generate them somewhat randomly.
+    #
+    # The strategy is: for every amenity=post_office, generate a number of PO box
+    # addresses using random numbers (and some alpha-numerics so we capture patterns
+    # like PO Box 1Q, etc.). It doesn't matter if the post boxes themselves actually
+    # exist, as long as they cover the patterns of digits we expect in real addresses.
+    # The parser cares more about how many digits a number has and the surrounding
+    # words/phrases than about the specific number, i.e. numbers in the range
+    # 1000-9999 can simply be normalized to DDDD.
+
+    po_boxes:
+        po_box: &po_box
+            canonical: post office box
+            abbreviated: p.o. box
+            sample: true
+            canonical_probability: 0.01
+            abbreviated_probability: 0.95
+            sample_probability: 0.04
+
+            numeric:
+                direction: left
+                add_number_phrase: true
+                add_number_phrase_probability: 0.4 # PO Box #1234
+
+            numeric_probability: 1.0
+
+        box: &box
+            canonical: box
+            sample: true
+            canonical_probability: 0.8
+            sample_probability: 0.2
+            numeric:
+                direction: left
+                add_number_phrase: true
+                add_number_phrase_probability: 0.4 # Box #1234
+
+            numeric_probability: 1.0
+
+        private_mail_box: &private_mail_box
+            canonical: private mail box
+            abbreviated: pmb
+            prefer_abbreviated: true
+            sample: true
+            canonical_probability: 0.01
+            abbreviated_probability: 0.95
+            sample_probability: 0.04
+
+            numeric:
+                direction: left
+                add_number_phrase: true
+                add_number_phrase_probability: 0.4 # PMB #1234
+
+            numeric_probability: 1.0
+
+        alphanumeric:
+            # Don't sample all the forms in post_office.txt as many of the PO box
+            # phrases appear only in Australia
+            sample: false
+            default: *po_box
+            probability: 0.995
+            alternatives:
+                - alternative: *box
+                  probability: 0.005
+
+            numeric_probability: 0.9 # PO Box 123
+            alpha_probability: 0.05 # PO Box A
+            numeric_plus_alpha_probability: 0.04 # PO Box 123G
+            alpha_plus_numeric_probability: 0.01 # PO Box A123
+            alpha_plus_numeric_whitespace_probability: 0.1
+            numeric_plus_alpha_whitespace_probability: 0.1
+
+        digits:
+            - length: 1
+              probability: 0.05
+            - length: 2
+              probability: 0.1
+            - length: 3
+              probability: 0.2
+            - length: 4
+              probability: 0.5
+            - length: 5
+              probability: 0.1
+            - length: 6
+              probability: 0.05
+
+        zones:
+            # Overrides for commercial/office areas (landuse=commercial in OSM)
+            commercial:
+                default: *po_box
+                probability: 0.7
+                alternatives:
+                    - alternative: *private_mail_box
+                      probability: 0.2
+                    - alternative: *box
+                      probability: 0.1
+
+        order:
+            - after: house
+              probability: 0.8
+            - before: house
+              probability: 0.2
+
+    # Categories
+    # ==========
+    # Use operators such as "in", "near", "nearby" and "near me" for building
+    # category queries such as "restaurants in Hackney, London"
+
+    categories:
+        near:
+            default:
+                canonical: near
+            probability: 0.8
+            alternatives:
+                - alternative:
+                      canonical: around
+                  probability: 0.2
+        nearby:
+            default:
+                canonical: nearby
             probability: 0.6
             alternatives:
-                - alternative: *sub_basement
-                  probability: 0.3995
-                - alternative: *floor
-                  probability: 0.0005
-        "-1":
-            default: *basement
-            probability: 0.7
-            alternatives:
-                - alternative: *cellar
-                  probability: 0.1
-                - alternative: *lower_ground_floor
-                  probability: 0.1
-                - alternative: *downstairs
-                  probability: 0.0495
-                - alternative: *lower_level
-                  probability: 0.05
-                - alternative: *floor
-                  probability: 0.0005
-        # Special token for half-floors
-        half_floors:
-            default: *mezzanine
-            probability: 0.8
-            alternatives:
-                - alternative: *mezzanine_floor
-                  probability: 0.1
-                - alternative: *mezzanine_level
-                  probability: 0.1
-            aliases:
-                "1":
-                    default: *upper_mezzanine
-                    probability: 0.5
-                    alternatives:
-                        - alternative: *mezzanine
-                          probability: 0.5
-        half_floors_negative:
-            default: *lower_mezzanine
-        "0":
-            default: *ground_floor
-            probability: 0.9
-            alternatives:
-                - alternative: *ground
-                  probability: 0.02
-                - alternative: *ground_level
-                  probability: 0.01
-                - alternative: *lower_ground_floor
-                  probability: 0.025
-                - alternative: *upper_ground_floor
-                  probability: 0.025
-                - alternative: *lobby
-                  probability: 0.005
-                - alternative: *floor
-                  # Floor 0 is uncommon
-                  probability: 0.01
-                - alternative: *level
-                  probability: 0.005
-        "1":
-            # Most of the time just say 1st Floor
-            default: *floor
-            probability: 0.9
-            alternatives:
-                - alternative: *upper_ground_floor
-                  probability: 0.075
-                - alternative: *podium_level
-                  probability: 0.01
-                - alternative: *podium
-                  probability: 0.005
-                - alternative: *upstairs
-                  probability: 0.01
-        top:
-            default: *floor
-            probability: 0.85
-            alternatives:
-                - alternative: *level
-                  probability: 0.1
-                - alternative: *top_floor
-                  probability: 0.05
-
-    # Integer for whether floors start at 0 or 1
-    numbering_starts_at: 0
-
-    # Associated phrases for alphanumeric floors (Floor 1, Floor A)
-    alphanumeric:
-        default: *floor
-        probability: 0.8
-        add_number_phrase: true
-        add_number_phrase_probability: 0.3
-        alternatives:
-            - alternative: *level
-              probability: 0.15
-            - alternative: *platform
-              probability: 0.025
-            - alternative: *storey
-              probability: 0.025
-        numeric_probability: 0.99 # With this probability, pick an integer
-        alpha_probability: 0.0098 # With this probability, pick a letter e.g. Floor A
-        numeric_plus_alpha_probability: 0.0001 # e.g. Floor 2A
-        alpha_plus_numeric_probability: 0.0001 # e.g. Floor A2
-
-
-    # Floors are not part of the global address formats (and are not always standard)
-    # This is a list of places in the address where the floor number might go
-    order:
-        # e.g. 123 East 45th St, 6th Floor, NYC
-        - after: road
-          probability: 0.5
-        # e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London
-        - before: house
-          probability: 0.25
-        # e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London
-        - before: road
-          probability: 0.25
-
-
-# Intersections
-# =============
-# For constructing intersections like 5th Avenue & Broadway
-# In OSM, a node that's part of two ways is an intersection.
-#
-# These simple rules make it possible to create training examples
-# like: 26th/road Street/road and/intersection 6th/road Avenue/road
-
-cross_streets:
-    # 26th & 6th Avenue
-    and: *and
-    # 26th @ Broadway
-    at: &at
-        canonical: at
-        abbreviated: "@"
-        canonical_probability: 0.7
-        abbreviated_probability: 0.3
-        sample: true
-    corner_of: &corner_of
-        canonical: corner of
-
-    intersection:
-        default: *and
-        probability: 0.7
-        alternatives:
-            - alternative: *at
-              probability: 0.15
-            - alternative: *corner_of
-              probability: 0.15
-
-    # 26th betw 5th Ave and 6th Ave
-    between:
-        canonical: between
-        abbreviated: betw
-        canonical_probability: 0.5
-        abbreviated_probability: 0.5
-        sample: true
-        parentheses_probability: 0.5 # Probability of using parentheses e.g. (between 5th and 6th)
-
-# PO Box addresses
-# ================
-# For PO box addresses, there's almost no data in OSM, so we'll need to
-# generate them somewhat randomly.
-#
-# The strategy is: for every amenity=post_office, generate a number of PO box
-# addresses using random numbers (and some alpha-numerics so we capture patterns
-# like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
-# exist, as long as they cover the patterns of digits we expect in real addresses.
-# The parser cares more about how many digits a number has and the surrounding 
-# words/phrases than the specific number i.e. numbers in the range 1000-9999
-# can simply be normalized to DDDD.
-
-po_boxes:
-    po_box: &po_box
-        canonical: post office box
-        abbreviated: p.o. box
-        sample: true
-        canonical_probability: 0.01
-        abbreviated_probability: 0.95
-        sample_probability: 0.04
-
-        numeric:
-            direction: left
-            add_number_phrase: true
-            add_number_phrase_probability: 0.4 # PO Box #1234
-
-        numeric_probability: 1.0
-
-    box: &box
-        canonical: box
-        sample: true
-        canonical_probability: 0.8
-        sample_probability: 0.2
-        numeric:
-            direction: left
-            add_number_phrase: true
-            add_number_phrase_probability: 0.4 # Box #1234
-
-        numeric_probability: 1.0
-
-    private_mail_box: &private_mail_box
-        canonical: private mail box
-        abbreviated: pmb
-        prefer_abbreviated: true
-        sample: true
-        canonical_probability: 0.01
-        abbreviated_probability: 0.95
-        sample_probability: 0.04
-
-        numeric:
-            direction: left
-            add_number_phrase: true
-            add_number_phrase_probability: 0.4 # PMB #1234
-
-        numeric_probability: 1.0
-
-    alphanumeric:
-        # Don't sample all the forms in post_office.txt as many of the PO box
-        # phrases appear only in Australia
-        sample: false
-        default: *po_box
-        probability: 0.995
-        alternatives:
-            - alternative: *box
-              probability: 0.005
-
-        numeric_probability: 0.9 # PO Box 123
-        alpha_probability: 0.05 # PO Box A
-        numeric_plus_alpha_probability: 0.04 # PO Box 123G
-        alpha_plus_numeric_probability: 0.01 # PO Box A123
-        alpha_plus_numeric_whitespace_probability: 0.1
-        numeric_plus_alpha_whitespace_probability: 0.1
-
-    digits:
-        - length: 1
-          probability: 0.05
-        - length: 2
-          probability: 0.1
-        - length: 3
-          probability: 0.2
-        - length: 4
-          probability: 0.5
-        - length: 5
-          probability: 0.1
-        - length: 6
-          probability: 0.05
-
-    zones:
-        # Overrides for commercial/office areas (landuse=commercial in OSM)
-        commercial:
-            default: *po_box
-            probability: 0.7
-            alternatives:
-                - alternative: *private_mail_box
-                  probability: 0.2
-                - alternative: *box
-                  probability: 0.1
-
-    order:
-        - after: house
-          probability: 0.8
-        - before: house
-          probability: 0.2
-
-# Categories
-# ==========
-# Use the operators "in" and "near" for building category queries
-# such as "restaurants in Hackney, London"
-
-categories:
-    near:
-        default:
-            canonical: near
-        probability: 0.8
-        alternatives:
-            - alternative:
-                  canonical: around
-              probability: 0.2
-    nearby:
-        default:
-            canonical: nearby
-        probability: 0.6
-        alternatives:
-            - alternative:
-                  canonical: near here
-              probability: 0.3
-            - alternative:
-                  canonical: around here
-              probability: 0.1
-    near_me:
-        canonical: near me
-    in:
-        canonical: in
-    # Probabilities of each phrase
-    near_probability: 0.35
-    nearby_probability: 0.2
-    near_me_probability: 0.1
-    in_probability: 0.35
-
-# Directions
-# ==========
-# Unit types, stairways, etc. may have a direction associated
-# with them whether it's right/left or a cardinal direction
-# like "East Entrance".
-
-directions:
-    right: &right
-        canonical: right
-        abbreviated: r
-        canonical_probability: 0.7
-        abbreviated_probability: 0.3
-        numeric:
-            direction: right
-        numeric_affix:
-            affix: r
-            direction: right
-        numeric_probability: 0.2
-        numeric_affix_probability: 0.8
-    left: &left
-        canonical: left
-        abbreviated: l
-        canonical_probability: 0.7
-        abbreviated_probability: 0.3
-        numeric:
-            direction: right
-        numeric_affix:
-            affix: l
-            direction: right
-        numeric_probability: 0.2
-        numeric_affix_probability: 0.8
-    rear: &rear
-        canonical: rear
-        abbreviated: r
-        canonical_probability: 0.8
-        abbreviated_probability: 0.2
-        numeric:
-            direction: right
-        numeric_affix:
-            affix: r
-            direction: right
-        numeric_probability: 0.2
-        numeric_affix_probability: 0.8
-    front: &front
-        canonical: front
-        abbreviated: frnt
-        canonical_probability: 0.8
-        abbreviated_probability: 0.2
-        numeric:
-            direction: right
-        numeric_affix:
-            affix: f
-            direction: right
-        numeric_probability: 0.2
-        numeric_affix_probability: 0.8
-    alternatives:
-        - alternative: *right
-          probability: 0.45
-        - alternative: *left
-          probability: 0.45
-        - alternative: *front
-          probability: 0.05
-        - alternative: *rear
-          probability: 0.05
-
-cardinal_directions:
-    east: &east
-        canonical: east
-        abbreviated: e
-        sample: true
-        canonical_probability: 0.5
-        abbreviated_probability: 0.3
-        sample_probability: 0.2
-        numeric:
-            direction: right
-        numeric_affix:
-            affix: e
-            direction: right
-        numeric_probability: 0.6
-        numeric_affix_probability: 0.4
-    west: &west
-        canonical: west
-        abbreviated: w
-        sample: true
-        canonical_probability: 0.5
-        abbreviated_probability: 0.3
-        sample_probability: 0.2
-        numeric:
-            direction: right
-        numeric_affix:
-            affix: w
-            direction: right
-        numeric_probability: 0.6
-        numeric_affix_probability: 0.4
-    north: &north
-        canonical: north
-        abbreviated: n
-        sample: true
-        canonical_probability: 0.5
-        abbreviated_probability: 0.3
-        sample_probability: 0.2
-        numeric:
-            direction: right
-        numeric_affix:
-            affix: n
-            direction: right
-        numeric_probability: 0.6
-        numeric_affix_probability: 0.4
-    south: &south
-        canonical: south
-        abbreviated: s
-        sample: true
-        canonical_probability: 0.5
-        abbreviated_probability: 0.3
-        sample_probability: 0.2
-        numeric:
-            direction: right
-        numeric_affix:
-            affix: s
-            direction: right
-        numeric_probability: 0.6
-        numeric_affix_probability: 0.4
-
-    alternatives:
-        - alternative: *north
-          probability: 0.25
-        - alternative: *east
-          probability: 0.25
-        - alternative: *south
-          probability: 0.25
-        - alternative: *west
-          probability: 0.25
-
-# Entrance
-# ========
-# For deriving strings like "North Entrance"
-
-entrances:
-    entrance: &entrance
-        canonical: entrance
-        abbreviated: ent
-        sample: true
-        canonical_probability: 0.8
-        abbreviated_probability: 0.2
-
-    # Entrance 1, Entrance A, etc.
-    alphanumeric: &entrance_alphanumeric
-        default: *entrance
-
-    directional:
-        base: *entrance_alphanumeric
-        modifier:
-            direction: left # e.g. North Entrance
-            direction_probability: 0.9
-            alternatives:
-                - alternative: *north
-                - alternative: *south
-                - alternative: *east
-                - alternative: *west
-                - alternative: *right
-                - alternative: *left
-                - alternative: *rear
-                - alternative: *front
                 - alternative:
-                      canonical: freight
+                      canonical: near here
+                  probability: 0.3
+                - alternative:
+                      canonical: around here
+                  probability: 0.1
+        near_me:
+            canonical: near me
+        in:
+            canonical: in
+        # Probabilities of each phrase
+        near_probability: 0.35
+        nearby_probability: 0.2
+        near_me_probability: 0.1
+        in_probability: 0.35
 
-# Staircase
-# =========
-# For deriving strings like "Staircase A" in apartment buildings
+    # Directions
+    # ==========
+    # Unit types, stairways, etc. may have a direction associated
+    # with them, whether it's relative (right/left/front/rear) or a
+    # cardinal direction as in "East Entrance".
 
-staircases:
-    stair: &stair
-        canonical: stair
-        sample: true
-
-    staircase: &staircase
-        canonical: staircase
-        sample: true
-
-    stairway: &stairway
-        canonical: stairway
-        sample: true
-
-    stairwell: &stairwell
-        canonical: stairwell
-        sample: true
-
-    alphanumeric: &staircase_alphanumeric
-        # For alphanumerics, Stair A, Stair 1, etc.
-        default: *stair
-        probability: 0.4
-        alternatives:
-            - alternative: *staircase
-              probability: 0.2
-            - alternative: *stairway
-              probability: 0.2
-            - alternative: *stairwell
-              probability: 0.2
-
-    directional:
-        base: *staircase_alphanumeric
-        modifier:
-            direction: left # e.g. Left Staircase
-            direction_probability: 0.7
-            alternatives:
-                - alternative: *north
-                - alternative: *south
-                - alternative: *east
-                - alternative: *west
-                - alternative: *right
-                - alternative: *left
-                - alternative: *rear
-                - alternative: *front
-
-
-# Unit types
-# ==========
-# Unit information is common in residential addresses, offices, business parks, etc.
-# Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
-# refer to the 
-
-units:
-    # Units are not part of the global address formats (and are not always standard)
-    # This is a list of places in the address where the unit line might go
-    order:
-        # e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London
-        - before: house
-          probability: 0.4
-        # e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London
-        - before: road
-          probability: 0.2
-        # e.g. Floor 5, Apt 6
-        - after: level
-          probability: 0.3
-        # e.g. Apt. 6, 5/F (less common)
-        - before: level
-          probability: 0.1
-
-    # Special terms
-    suite: &suite
-        canonical: suite
-        abbreviated: ste
-        sample: true        
-        canonical_probability: 0.4
-        abbreviated_probability: 0.4
-        sample_probability: 0.2
-        plural:
-            canonical: suites
-            abbreviated: stes
-            canonical_probability: 0.6
-            abbreviated_probability: 0.4
-        numeric:
-            direction: left
-        # Suite #101 and Suite No. 101 as opposed to Suite 101
-        add_number_phrase: true
-        add_number_phrase_probability: 0.5
-    penthouse: &penthouse
-        canonical: penthouse
-        abbreviated: ph
-        sample: true
-        canonical_probability: 0.5
-        abbreviated_probability: 0.3
-        sample_probability: 0.2
-        plural:
-            canonical: penthouses
-        numeric:
-            direction: left
-        numeric_probability: 0.2
-        standalone_probability: 0.8
-        # Penthouse #1 and Penthouse No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.2
-    top_left: &top_left
-        canonical: top left
-        abbreviated: t/l
-        sample: true
-        canonical_probability: 0.4
-        abbreviated_probability: 0.3
-        sample_probability: 0.3
-    top_right: &top_right
-        canonical: top right
-        abbreviated: t/r
-        sample: true
-        canonical_probability: 0.4
-        abbreviated_probability: 0.3
-        sample_probability: 0.3
-    top_floor_right: &top_floor_right
-        canonical: top floor right
-        abbreviated: tfr
-        sample: true
-        canonical_probability: 0.2
-        abbreviated_probability: 0.5
-        sample_probability: 0.3
-    top_floor_left: &top_floor_left
-        canonical: top floor left
-        abbreviated: tfl
-        sample: true
-        canonical_probability: 0.2
-        abbreviated_probability: 0.5
-        sample_probability: 0.3
-    office: &office
-        canonical: office
-        abbreviated: ofc
-        sample: true
-        canonical_probability: 0.5
-        abbreviated_probability: 0.3
-        sample_probability: 0.2
-        plural:
-            canonical: offices
-            abbreviated: ofcs
-            canonical_probability: 0.4
-            abbreviated_probability: 0.6
-        numeric:
-            direction: left
-        # Office #1 and Office No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.7
-    door: &door
-        canonical: door
-        sample: true
-        canonical_probability: 0.8
-        sample_probability: 0.2
-        plural:
-            canonical: doors
-        numeric:
-            direction: left
-        # Door #1 and Door No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.2
-    room: &room
-        canonical: room
-        abbreviated: rm
-        sample: true
-        canonical_probability: 0.5
-        abbreviated_probability: 0.5
-        plural:
-            canonical: rooms
-            abbreviated: rms
-            canonical_probability: 0.6
-            abbreviated_probability: 0.4
-        numeric:
-            direction: left
-        # Room #1 and Room No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.6
-    hall: &hall
-        canonical: hall
-        plural:
-            canonical: halls
-        numeric:
-            direction: left
-        # Room #1 and Room No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.6
-    apartment: &apartment
-        canonical: apartment
-        abbreviated: apt
-        prefer_abbreviated: true
-        sample: true
-        canonical_probability: 0.15
-        abbreviated_probability: 0.6
-        sample_probability: 0.25
-        plural:
-            canonical: apartments
-            abbreviated: apts
-            canonical_probability: 0.2
-            abbreviated: 0.8
-        numeric:
-            direction: left
-        # Apt #1 and Apt No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.4
-    flat: &flat
-        canonical: flat
-        abbreviated: flt
-        sample: true
-        canonical_probability: 0.8
-        abbreviated_probability: 0.15
-        sample_probability: 0.05
-        plural:
-            canonical: flats
-            abbreviated: flts
+    directions:
+        right: &right
+            canonical: right
+            abbreviated: r
+            canonical_probability: 0.7
+            abbreviated_probability: 0.3
+            numeric:
+                direction: right
+            numeric_affix:
+                affix: r
+                direction: right
+            numeric_probability: 0.2
+            numeric_affix_probability: 0.8
+        left: &left
+            canonical: left
+            abbreviated: l
+            canonical_probability: 0.7
+            abbreviated_probability: 0.3
+            numeric:
+                direction: right
+            numeric_affix:
+                affix: l
+                direction: right
+            numeric_probability: 0.2
+            numeric_affix_probability: 0.8
+        rear: &rear
+            canonical: rear
+            abbreviated: r
             canonical_probability: 0.8
             abbreviated_probability: 0.2
-        numeric:
-            direction: left
-        # Flat #1 and Flat No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.4
-    lot: &lot
-        canonical: lot
-        sample: true
-        canonical_probability: 0.9
-        sample_probability: 0.1
-        plural:
-            canonical: lots
-        numeric:
-            direction: left
-        # Lot #1 and Lot No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.6
-    parcel: &parcel
-        canonical: parcel
-        sample: true
-        canonical_probability: 0.9
-        sample_probability: 0.1
-        plural:
-            canonical: parcels
-        numeric:
-            direction: left
-        add_number_phrase: true
-        add_number_phrase_probability: 0.6
-    unit: &unit
-        canonical: unit
-        abbreviated: u
-        sample: true
-        canonical_probability: 0.8
-        abbreviated_probability: 0.1
-        sample_probability: 0.1
-        plural:
-            canonical: units
-        numeric:
-            direction: left
-        # Unit #1 and Unit No. 1
-        add_number_phrase: true
-        add_number_phrase_probability: 0.4
-    alphanumeric: &unit_alphanumeric
-        # Many unit types that apply only in Australia
-        # For most English-speaking countries, only use the terms defined above
-        sample: false
-        default: *flat
-        probability: 0.4
+            numeric:
+                direction: right
+            numeric_affix:
+                affix: r
+                direction: right
+            numeric_probability: 0.2
+            numeric_affix_probability: 0.8
+        front: &front
+            canonical: front
+            abbreviated: frnt
+            canonical_probability: 0.8
+            abbreviated_probability: 0.2
+            numeric:
+                direction: right
+            numeric_affix:
+                affix: f
+                direction: right
+            numeric_probability: 0.2
+            numeric_affix_probability: 0.8
         alternatives:
-            - alternative: *unit
+            - alternative: *right
+              probability: 0.45
+            - alternative: *left
+              probability: 0.45
+            - alternative: *front
+              probability: 0.05
+            - alternative: *rear
+              probability: 0.05
+
+    cardinal_directions:
+        east: &east
+            canonical: east
+            abbreviated: e
+            sample: true
+            canonical_probability: 0.5
+            abbreviated_probability: 0.3
+            sample_probability: 0.2
+            numeric:
+                direction: right
+            numeric_affix:
+                affix: e
+                direction: right
+            numeric_probability: 0.6
+            numeric_affix_probability: 0.4
+        west: &west
+            canonical: west
+            abbreviated: w
+            sample: true
+            canonical_probability: 0.5
+            abbreviated_probability: 0.3
+            sample_probability: 0.2
+            numeric:
+                direction: right
+            numeric_affix:
+                affix: w
+                direction: right
+            numeric_probability: 0.6
+            numeric_affix_probability: 0.4
+        north: &north
+            canonical: north
+            abbreviated: n
+            sample: true
+            canonical_probability: 0.5
+            abbreviated_probability: 0.3
+            sample_probability: 0.2
+            numeric:
+                direction: right
+            numeric_affix:
+                affix: n
+                direction: right
+            numeric_probability: 0.6
+            numeric_affix_probability: 0.4
+        south: &south
+            canonical: south
+            abbreviated: s
+            sample: true
+            canonical_probability: 0.5
+            abbreviated_probability: 0.3
+            sample_probability: 0.2
+            numeric:
+                direction: right
+            numeric_affix:
+                affix: s
+                direction: right
+            numeric_probability: 0.6
+            numeric_affix_probability: 0.4
+
+        alternatives:
+            - alternative: *north
+              probability: 0.25
+            - alternative: *east
+              probability: 0.25
+            - alternative: *south
+              probability: 0.25
+            - alternative: *west
               probability: 0.25
-              # e.g. just plain #3 or No. 4
-            - alternative: *number
-              probability: 0.2
-            - alternative: *door
-              probability: 0.04
-            - alternative: *penthouse
-              probability: 0.01
-            - alternative: *apartment
-              probability: 0.1
-        numeric_probability: 0.9 # e.g. Flat 1
-        numeric_plus_alpha_probability: 0.03 # e.g. 1A
-        alpha_plus_numeric_probability: 0.03 # e.g. A1
-        alpha_probability: 0.04 # e.g. Flat A
-        alpha_plus_numeric_whitespace_probability: 0.1
-        numeric_plus_alpha_whitespace_probability: 0.1
 
-        # Separate random probability for adding directions like 2L, 2R, etc.
-        add_direction: true
-        add_direction_probability: 0.1
-        # Add directions only for plain numbers
-        add_direction_numeric: true
-        add_direction_standalone: true
+    # Entrance
+    # ========
+    # For deriving strings like "North Entrance"
 
-    zones:
-        residential: *unit_alphanumeric
-        commercial:
-            default: *suite
-            probability: 0.8
+    entrances:
+        entrance: &entrance
+            canonical: entrance
+            abbreviated: ent
+            sample: true
+            canonical_probability: 0.8
+            abbreviated_probability: 0.2
+
+        # Entrance 1, Entrance A, etc.
+        alphanumeric: &entrance_alphanumeric
+            default: *entrance
+
+        directional:
+            base: *entrance_alphanumeric
+            modifier:
+                direction: left # e.g. North Entrance
+                direction_probability: 0.9
+                alternatives:
+                    - alternative: *north
+                    - alternative: *south
+                    - alternative: *east
+                    - alternative: *west
+                    - alternative: *right
+                    - alternative: *left
+                    - alternative: *rear
+                    - alternative: *front
+                    - alternative:
+                          canonical: freight
+
+    # Staircase
+    # =========
+    # For deriving strings like "Staircase A" in apartment buildings
+
+    staircases:
+        stair: &stair
+            canonical: stair
+            sample: true
+
+        staircase: &staircase
+            canonical: staircase
+            sample: true
+
+        stairway: &stairway
+            canonical: stairway
+            sample: true
+
+        stairwell: &stairwell
+            canonical: stairwell
+            sample: true
+
+        alphanumeric: &staircase_alphanumeric
+            # For alphanumerics, Stair A, Stair 1, etc.
+            default: *stair
+            probability: 0.4
             alternatives:
-                - alternative: *office
+                - alternative: *staircase
                   probability: 0.2
-        industrial:
-            default: *lot
-            probability: 0.5
+                - alternative: *stairway
+                  probability: 0.2
+                - alternative: *stairwell
+                  probability: 0.2
+
+        directional:
+            base: *staircase_alphanumeric
+            modifier:
+                direction: left # e.g. Left Staircase
+                direction_probability: 0.7
+                alternatives:
+                    - alternative: *north
+                    - alternative: *south
+                    - alternative: *east
+                    - alternative: *west
+                    - alternative: *right
+                    - alternative: *left
+                    - alternative: *rear
+                    - alternative: *front
+
+
+    # Unit types
+    # ==========
+    # Unit information is common in residential addresses, offices, business parks, etc.
+    # Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
+    # refer to the individual units within a building (Apartment, Flat, Suite, etc.).
+
+    units:
+        # Units are not part of the global address formats (and are not always standard)
+        # This is a list of places in the address where the unit line might go
+        order:
+            # e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London
+            - before: house
+              probability: 0.4
+            # e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London
+            - before: road
+              probability: 0.2
+            # e.g. Floor 5, Apt 6
+            - after: level
+              probability: 0.3
+            # e.g. Apt. 6, 5/F (less common)
+            - before: level
+              probability: 0.1
+
+        # Special terms
+        suite: &suite
+            canonical: suite
+            abbreviated: ste
+            sample: true        
+            canonical_probability: 0.4
+            abbreviated_probability: 0.4
+            sample_probability: 0.2
+            plural:
+                canonical: suites
+                abbreviated: stes
+                canonical_probability: 0.6
+                abbreviated_probability: 0.4
+            numeric:
+                direction: left
+            # Suite #101 and Suite No. 101 as opposed to Suite 101
+            add_number_phrase: true
+            add_number_phrase_probability: 0.5
+        penthouse: &penthouse
+            canonical: penthouse
+            abbreviated: ph
+            sample: true
+            canonical_probability: 0.5
+            abbreviated_probability: 0.3
+            sample_probability: 0.2
+            plural:
+                canonical: penthouses
+            numeric:
+                direction: left
+            numeric_probability: 0.2
+            standalone_probability: 0.8
+            # Penthouse #1 and Penthouse No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.2
+        top_left: &top_left
+            canonical: top left
+            abbreviated: t/l
+            sample: true
+            canonical_probability: 0.4
+            abbreviated_probability: 0.3
+            sample_probability: 0.3
+        top_right: &top_right
+            canonical: top right
+            abbreviated: t/r
+            sample: true
+            canonical_probability: 0.4
+            abbreviated_probability: 0.3
+            sample_probability: 0.3
+        top_floor_right: &top_floor_right
+            canonical: top floor right
+            abbreviated: tfr
+            sample: true
+            canonical_probability: 0.2
+            abbreviated_probability: 0.5
+            sample_probability: 0.3
+        top_floor_left: &top_floor_left
+            canonical: top floor left
+            abbreviated: tfl
+            sample: true
+            canonical_probability: 0.2
+            abbreviated_probability: 0.5
+            sample_probability: 0.3
+        office: &office
+            canonical: office
+            abbreviated: ofc
+            sample: true
+            canonical_probability: 0.5
+            abbreviated_probability: 0.3
+            sample_probability: 0.2
+            plural:
+                canonical: offices
+                abbreviated: ofcs
+                canonical_probability: 0.4
+                abbreviated_probability: 0.6
+            numeric:
+                direction: left
+            # Office #1 and Office No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.7
+        door: &door
+            canonical: door
+            sample: true
+            canonical_probability: 0.8
+            sample_probability: 0.2
+            plural:
+                canonical: doors
+            numeric:
+                direction: left
+            # Door #1 and Door No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.2
+        room: &room
+            canonical: room
+            abbreviated: rm
+            sample: true
+            canonical_probability: 0.5
+            abbreviated_probability: 0.5
+            plural:
+                canonical: rooms
+                abbreviated: rms
+                canonical_probability: 0.6
+                abbreviated_probability: 0.4
+            numeric:
+                direction: left
+            # Room #1 and Room No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.6
+        hall: &hall
+            canonical: hall
+            plural:
+                canonical: halls
+            numeric:
+                direction: left
+            # Hall #1 and Hall No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.6
+        apartment: &apartment
+            canonical: apartment
+            abbreviated: apt
+            prefer_abbreviated: true
+            sample: true
+            canonical_probability: 0.15
+            abbreviated_probability: 0.6
+            sample_probability: 0.25
+            plural:
+                canonical: apartments
+                abbreviated: apts
+                canonical_probability: 0.2
+                abbreviated_probability: 0.8 # must not reuse the "abbreviated" key, which would overwrite "apts"
+            numeric:
+                direction: left
+            # Apt #1 and Apt No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.4
+        flat: &flat
+            canonical: flat
+            abbreviated: flt
+            sample: true
+            canonical_probability: 0.8
+            abbreviated_probability: 0.15
+            sample_probability: 0.05
+            plural:
+                canonical: flats
+                abbreviated: flts
+                canonical_probability: 0.8
+                abbreviated_probability: 0.2
+            numeric:
+                direction: left
+            # Flat #1 and Flat No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.4
+        lot: &lot
+            canonical: lot
+            sample: true
+            canonical_probability: 0.9
+            sample_probability: 0.1
+            plural:
+                canonical: lots
+            numeric:
+                direction: left
+            # Lot #1 and Lot No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.6
+        parcel: &parcel
+            canonical: parcel
+            sample: true
+            canonical_probability: 0.9
+            sample_probability: 0.1
+            plural:
+                canonical: parcels
+            numeric:
+                direction: left
+            add_number_phrase: true
+            add_number_phrase_probability: 0.6
+        unit: &unit
+            canonical: unit
+            abbreviated: u
+            sample: true
+            canonical_probability: 0.8
+            abbreviated_probability: 0.1
+            sample_probability: 0.1
+            plural:
+                canonical: units
+            numeric:
+                direction: left
+            # Unit #1 and Unit No. 1
+            add_number_phrase: true
+            add_number_phrase_probability: 0.4
+        alphanumeric: &unit_alphanumeric
+            # Many unit types that apply only in Australia
+            # For most English-speaking countries, only use the terms defined above
+            sample: false
+            default: *flat
+            probability: 0.4
             alternatives:
-                - alternative: *suite
-                  probability: 0.3
                 - alternative: *unit
-                  probability: 0.19
-                - alternative: *parcel
+                  probability: 0.25
+                  # e.g. just plain #3 or No. 4
+                - alternative: *number
+                  probability: 0.2
+                - alternative: *door
+                  probability: 0.04
+                - alternative: *penthouse
                   probability: 0.01
-        university:
-            default: *room
-            probability: 0.9
+                - alternative: *apartment
+                  probability: 0.1
+            numeric_probability: 0.9 # e.g. Flat 1
+            numeric_plus_alpha_probability: 0.03 # e.g. 1A
+            alpha_plus_numeric_probability: 0.03 # e.g. A1
+            alpha_probability: 0.04 # e.g. Flat A
+            alpha_plus_numeric_whitespace_probability: 0.1
+            numeric_plus_alpha_whitespace_probability: 0.1
+
+            # Separate random probability for adding directions like 2L, 2R, etc.
+            add_direction: true
+            add_direction_probability: 0.1
+            # Add directions only for plain numbers
+            add_direction_numeric: true
+            add_direction_standalone: true
+
+        zones:
+            residential: *unit_alphanumeric
+            commercial:
+                default: *suite
+                probability: 0.8
+                alternatives:
+                    - alternative: *office
+                      probability: 0.2
+            industrial:
+                default: *lot
+                probability: 0.5
+                alternatives:
+                    - alternative: *suite
+                      probability: 0.3
+                    - alternative: *unit
+                      probability: 0.19
+                    - alternative: *parcel
+                      probability: 0.01
+            university:
+                default: *room
+                probability: 0.9
+                alternatives:
+                    - alternative: *hall
+                      probability: 0.1
+
+        allotments:
+            lot:
+                default: *lot
+                numeric_probability: 0.8
+                alphanumeric_probability: 0.1
+                alpha_probability: 0.1
+            parcel:
+                default: *parcel
+                numeric_probability: 0.3
+                alphanumeric_probability: 0.3
+                alpha_probability: 0.4
+            lot_probability: 0.9
+            parcel_probability: 0.06
+            lot_plus_parcel_probability: 0.02
+            parcel_plus_lot_probability: 0.02
+
+        directional:
+            modifier:
+                direction: right # e.g. 1R (affix) or 1 Right
+                numeric_probability: 0.1
+                numeric_affix_probability: 0.9
+                alternatives:
+                    - alternative: *right
+                    - alternative: *left
+                    - alternative: *rear
+                    - alternative: *front
+
+        standalone:
+            sample: false
+            default: *penthouse
+            probability: 0.4
             alternatives:
-                - alternative: *hall
+                - alternative: *top_right
+                  probability: 0.15
+                - alternative: *top_left
+                  probability: 0.15
+                - alternative: *top_floor_left
+                  probability: 0.15
+                - alternative: *top_floor_right
+                  probability: 0.15
+
+        # For unit types like 2/34 (more common in Canada and Australia)
+        combined:
+            component: house_number
+            direction: right
+            separators:
+                - separator: /
+                  probability: 0.8
+                - separator: "-"
+                  probability: 0.1
+                - separator: " - "
                   probability: 0.1
 
-    allotments:
-        lot:
-            default: *lot
-            numeric_probability: 0.8
-            alphanumeric_probability: 0.1
-            alpha_probability: 0.1
-        parcel:
-            default: *parcel
-            numeric_probability: 0.3
-            alphanumeric_probability: 0.3
-            alpha_probability: 0.4
-        lot_probability: 0.9
-        parcel_probability: 0.06
-        lot_plus_parcel_probability: 0.02
-        parcel_plus_lot_probability: 0.02
-
-    directional:
-        modifier:
-            direction: right # e.g. 1
-            numeric_probability: 0.1
-            numeric_affix_probability: 0.9
-            alternatives:
-                - alternative: *right
-                - alternative: *left
-                - alternative: *rear
-                - alternative: *front
-
-    standalone:
-        sample: false
-        default: *penthouse
-        probability: 0.4
-        alternatives:
-            - alternative: *top_right
-              probability: 0.15
-            - alternative: *top_left
-              probability: 0.15
-            - alternative: *top_floor_left
-              probability: 0.15
-            - alternative: *top_floor_right
-              probability: 0.15
-
-    # For unit types like 2/34 (more common in Canada and Australia)
-    combined:
-        component: house_number
-        direction: right
-        separators:
-            - separator: /
-              probability: 0.8
-            - separator: "-"
-              probability: 0.1
-            - separator: " - "
-              probability: 0.1
-
-    # If no unit number is specified
-    alphanumeric_probability: 0.75
-    standalone_probability: 0.2495
-    combined_probability: 0.005
+        # If no unit number is specified
+        alphanumeric_probability: 0.75
+        standalone_probability: 0.2495
+        combined_probability: 0.005
 
 # Country-specific overrides
 # ==========================
@@ -1175,6 +1176,7 @@ units:
 countries:
     # United States
     us:
+        <<: *default
         levels:
             storey: &story
                 canonical: story
@@ -1261,6 +1263,7 @@ countries:
     # Canada
     # Specifically Canadian English. If the address is in French it will use fr.yaml
     ca:
+        <<: *default
         levels:
             # Note: Canadian English uses "storey" keeping with the British convention, so no need to change that
 
@@ -1288,6 +1291,7 @@ countries:
             combined_probability: 0.1
     # Australia
     au:
+        <<: *default
         po_boxes: &australia_po_boxes
             alphanumeric:
                 default: *po_box
@@ -1330,6 +1334,7 @@ countries:
 
     # New Zealand - same rules as Australia
     nz:
+        <<: *default
         po_boxes: *australia_po_boxes
         units: *australia_unit_types
 
diff --git a/scripts/geodata/addresses/config.py b/scripts/geodata/addresses/config.py
index a5e743e7..592768b3 100644
--- a/scripts/geodata/addresses/config.py
+++ b/scripts/geodata/addresses/config.py
@@ -7,9 +7,9 @@ import yaml
 from collections import Mapping
 
 from geodata.address_expansions.address_dictionaries import address_phrase_dictionaries
+from geodata.configs.utils import nested_get, DoesNotExist, recursive_merge
 from geodata.math.sampling import cdf, check_probability_distribution
 
-
 this_dir = os.path.realpath(os.path.dirname(__file__))
 
 ADDRESS_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
@@ -19,35 +19,6 @@ DICTIONARIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                 'resources', 'dictionaries')
 
 
-def recursive_merge(a, b):
-    for k, v in six.iteritems(b):
-        if isinstance(v, Mapping):
-            existing = a.get(k, v)
-            merged = recursive_merge(existing, v)
-            a[k] = merged
-        else:
-            a[k] = b[k]
-    return a
-
-
-class DoesNotExist:
-    pass
-
-
-def nested_get(obj, keys):
-    if len(keys) == 0:
-        return obj
-    try:
-        for key in keys[:-1]:
-            obj = obj.get(key, {})
-            if not hasattr(obj, 'items'):
-                return DoesNotExist
-        key = keys[-1]
-        return obj.get(key, DoesNotExist)
-    except AttributeError:
-        return DoesNotExist
-
-
 class AddressConfig(object):
     def __init__(self, config_dir=ADDRESS_CONFIG_DIR, dictionaries_dir=DICTIONARIES_DIR):
         self.address_configs = {}
@@ -58,17 +29,14 @@ class AddressConfig(object):
                 continue
 
             config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))
+            default = config['default']
             countries = config.pop('countries', {})
 
-            for k in countries.keys():
-                country_config = countries[k]
-                config_copy = copy.deepcopy(config)
-                countries[k] = recursive_merge(config_copy, country_config)
-
-            config['countries'] = countries
+            if countries:
+                default['countries'] = countries
 
             lang = filename.strip('.yaml')
-            self.address_configs[lang] = config
+            self.address_configs[lang] = default
 
         self.sample_phrases = {}
 
@@ -87,6 +55,7 @@ class AddressConfig(object):
             if country_config:
                 config = country_config
 
+
         value = nested_get(config, keys)
         if value is not DoesNotExist:
             return value