Files
2025-09-06 22:03:29 -04:00

1469 lines
47 KiB
YAML

# en.yaml
# -------
# Supplement to the per-country address formats for English around the world.
# These configs are mostly used to generate training data we don't have from OSM
# like flat/apartment numbers, intersections, etc. The configs aren't directly used by
# the parser model itself, but can influence it as they affect its input.
# Note: by default, we use the UK conventions for English as they cover more countries.
# US/Canada-specific conventions and any others (e.g. Hong Kong, Australia) go in the
# country overrides section. Each country can create its own copy of the entire top-level
# structure and it will be recursively merged with the defaults.
# Components
# ==========
# How likely we are to generate a component at random (on its own or conditional on other components)
components:
  entrance:
    null_probability: 0.9995
    alphanumeric_probability: 0.0005
    conditional:
      - component: staircase
        probabilities:
          null_probability: 0.99995
          alphanumeric_probability: 0.00005
      - component: level
        probabilities:
          null_probability: 0.9995
          alphanumeric_probability: 0.0005
  staircase:
    null_probability: 0.999
    alphanumeric_probability: 0.001
  level:
    null_probability: 0.85  # Probability of doing nothing if no floor number is specified
    alphanumeric_probability: 0.15  # Probability of generating an alphanumeric floor if none was specified
    # Conditional probabilities
    conditional:
      # e.g. given that we have unit already (natural or generated)
      - component: unit
        probabilities:
          null_probability: 0.95
          alphanumeric_probability: 0.05
      - component: staircase
        probabilities:
          null_probability: 0.6
          alphanumeric_probability: 0.4
  unit:
    # If no unit number is specified
    null_probability: 0.4
    alphanumeric_probability: 0.55
    standalone_probability: 0.05
    conditional:
      - component: level
        probabilities:
          null_probability: 0.95
          alphanumeric_probability: 0.05
      - component: staircase
        probabilities:
          null_probability: 0.7
          alphanumeric_probability: 0.3
  combinations:
    # For unit types like 2/34 (more common in Canada and Australia)
    - components:
        - house_number
        - unit
      label: house_number
      separators:
        - separator: /
          probability: 0.8
        - separator: "-"
          probability: 0.1
        - separator: " - "
          probability: 0.1
      probability: 0.005
# Number
# ======
# Number, No., #, etc. can be used in both floor and apartment numbers,
# so we'll define it separately
numbers:
  default: &number
    canonical: number  # canonical word in libpostal dictionary
    abbreviated: "no"  # most common abbreviated form ("no" is a boolean in YAML, needs to be quoted)
    sample: true  # Randomly sample other variations (e.g. num, nr)
    # Probabilities
    canonical_probability: 0.3  # With this probability, use the canonical
    abbreviated_probability: 0.5  # With this probability, use the abbreviated form
    sample_probability: 0.2  # With this probability, sample other variations
    sample_exclude:
      - "#"  # Used in numeric affix. Needs to be quoted, otherwise it's a comment
    numeric:
      direction: left
    numeric_affix:
      affix: "#"  # e.g. #3, #2F, etc.
      direction: left  # affix goes on the number's left
    # Probabilities for numbers
    numeric_probability: 0.4  # With this probability, use the standard numeric
    numeric_affix_probability: 0.6  # With this probability, use e.g. #3 instead of No. 3
# And
# ===
# The word for "and". Used both in intersections and phrases like "Units 1 & 2", etc.
and:
  default: &and
    canonical: and
    abbreviated: "&"
    canonical_probability: 0.2
    abbreviated_probability: 0.75
    sample: true
    sample_probability: 0.05
# Floor/level
# ===========
# OSM doesn't usually concern itself with the address beyond the front door
# yet many real-world addresses will have qualifying strings like "6th floor"
# and we'd like the parser to handle those.
#
# When we do get floor numbers in OSM addresses, it's usually in the form of the
# addr:floor or level tag, where the value is typically an integer or a half-floor
# (to indicate mezzanines). Those tags are relatively scarce in OSM, but many OSM
# addresses do have a building:levels tag. If we know there are 20 floors in the
# building, we can randomly sample numbers <= the # of floors and come up with plausible
# sounding addresses (i.e. a Floor 20 address is not as likely outside major cities).
#
# We're not done yet, because the integer value by itself isn't what people use when
# writing addresses. This part of the config helps us rewrite the raw integer floor
# numbers as the sort of natural language text used in addresses like "Fl #1". The config
# is designed to be cross-lingual, so we can use the same structure with different words
# and do this for addresses in pretty much any language.
levels:
  # Numbered floors
  floor: &floor
    canonical: floor
    abbreviated: fl
    canonical_probability: 0.5  # With this probability, use canonical version
    abbreviated_probability: 0.4  # With this probability, use abbreviated version
    sample_probability: 0.1  # With this probability, sample from the other forms
    sample_exclude:
      - / f  # Exclude this abbreviation since it's used as an affix
    sample: true
    plural:
      canonical: floors
      abbreviated: fls
    # e.g. Floor 1
    numeric:
      direction: left  # Floor/Fl goes to the left of the number
      direction_probability: 0.8  # With 1 - this probability, Floor/Fl goes on the other side of the number
      add_number_phrase: true  # Occasionally add variation of "number", e.g. Floor No. 1
      add_number_phrase_probability: 0.4  # With this probability, use Floor No. 1 or Floor #1 vs. Floor 1
    # e.g. 2/F, 3/F
    numeric_affix:
      affix: /f
      direction: right  # affix goes to number's right (always)
    # e.g. 1st Floor
    ordinal:
      direction: right  # canonical or abbreviated form goes to the ordinal's right
      # NOTE(review): "digits" is assumed to nest under "ordinal" (it directly
      # follows it in the flattened source) - confirm against the consumer.
      digits:
        ascii_probability: 0.8
        spellout_probability: 0.2
    # Probabilities
    numeric_probability: 0.75  # Use the simple number e.g. Floor 1 (or Floor No. 1)
    numeric_affix_probability: 0.05  # Use the 2/F (less common)
    ordinal_probability: 0.2  # Use the ordinal e.g. 1st Floor

  # The word "level" is also occasionally used
  level: &level
    canonical: level
    plural: levels
    abbreviated: lvl
    sample: true
    canonical_probability: 0.5
    abbreviated_probability: 0.3
    sample_probability: 0.2
    sample_exclude:
      - / l  # Exclude this abbreviation since it's used as an affix
    numeric:
      direction: left  # Level/Lvl goes to the left of the number
      direction_probability: 0.8  # With 1 - this probability, Level/Lvl goes on the other side of the number
      add_number_phrase: true  # Occasionally add variation of "number", e.g. Level No. 1
      add_number_phrase_probability: 0.4  # With this probability, use Level No. 1 or Level #1 vs. Level 1
    # e.g. 2/L, 3/L (ambiguous with left)
    numeric_affix:
      affix: /l
      direction: right
    ordinal:
      direction: right
    numeric_probability: 0.4
    numeric_affix_probability: 0.05
    ordinal_probability: 0.55

  platform: &platform
    canonical: platform
    plural: platforms
    abbreviated: pf
    canonical_probability: 0.7
    abbreviated_probability: 0.3
    numeric:
      direction: left
    ordinal:
      direction: right
    numeric_probability: 0.5  # e.g. Platform 1
    ordinal_probability: 0.5  # e.g. 1st Platform

  storey: &storey
    canonical: storey
    plural: storeys
    numeric:
      direction: left
    ordinal:
      direction: right
    numeric_probability: 0.025  # e.g. Storey 2, less common
    ordinal_probability: 0.975  # e.g. 2nd Storey, more common

  # Special instructions for ground floor
  ground_floor: &ground_floor
    canonical: ground floor
    abbreviated: g/f
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    sample: true

  ground: &ground
    canonical: ground
    abbreviated: g
    sample: true
    canonical_probability: 0.6
    abbreviated_probability: 0.1
    sample_probability: 0.3

  ground_level: &ground_level
    canonical: ground level
    abbreviated: g/l
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.2
    sample_probability: 0.4

  # Special instructions for lower ground floor (added randomly, not an alias for a floor number)
  lower_ground_floor: &lower_ground_floor
    canonical: lower ground floor
    abbreviated: lg
    sample: true
    # Probabilities
    canonical_probability: 0.6
    abbreviated_probability: 0.3
    sample_probability: 0.1

  # Special instructions for upper ground floor (added randomly, not an alias for a floor number)
  upper_ground_floor: &upper_ground_floor
    canonical: upper ground floor
    abbreviated: ug
    sample: true
    # Probabilities
    canonical_probability: 0.6
    abbreviated_probability: 0.2
    sample_probability: 0.2

  upper: &upper
    canonical: upper
    abbreviated: uppr
    sample: true
    canonical_probability: 0.8
    abbreviated_probability: 0.1
    sample_probability: 0.1

  lower_level: &lower_level
    canonical: lower level
    abbreviated: lwr lvl
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.1
    sample_probability: 0.2

  lobby: &lobby
    canonical: lobby

  upstairs: &upstairs
    canonical: upstairs

  downstairs: &downstairs
    canonical: downstairs

  # Special instructions for podium level (added randomly)
  podium_level: &podium_level
    canonical: podium level
    abbreviated: pd lvl
    sample: true
    canonical_probability: 0.6
    abbreviated_probability: 0.2
    sample_probability: 0.2

  podium: &podium
    canonical: podium
    abbreviated: pd
    sample: true
    canonical_probability: 0.6
    abbreviated_probability: 0.2
    sample_probability: 0.2

  # Used when floor number is < 0 (starts at -1 in all countries)
  basement: &basement
    canonical: basement
    abbreviated: bsmt
    sample: true
    # e.g. Basement 1
    numeric:
      direction: left
    # e.g. B1
    numeric_affix:
      affix: b
      direction: left
    # e.g. 2nd Basement
    ordinal:
      direction: right
    standalone_probability: 0.985
    number_abs_value: true
    number_min_abs_value: 1
    numeric_probability: 0.005
    numeric_affix_probability: 0.005
    ordinal_probability: 0.005

  cellar: &cellar
    canonical: cellar
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2

  # Floor number of <= -2 can be basement 2, sub-basement, sub-basement 1, etc.
  sub_basement: &sub_basement
    canonical: sub basement
    abbreviated: sb
    sample: true
    # e.g. Sub-basement 1
    numeric:
      direction: left
    # e.g. SB1
    numeric_affix:
      affix: sb
      direction: left
    # e.g. 2nd Sub-basement
    ordinal:
      direction: right
    number_abs_value: true
    number_min_abs_value: 2
    # Basement 2 == Sub-basement 1
    number_subtract_abs_value: 1
    standalone_probability: 0.985
    numeric_probability: 0.005
    numeric_affix_probability: 0.005
    ordinal_probability: 0.005

  top_floor: &top_floor
    canonical: top floor
    abbreviated: tf
    sample: true
    canonical_probability: 0.6
    abbreviated_probability: 0.3
    sample_probability: 0.1

  # Mezzanine level (floor number {0.5, 1.5, ...}, can also be added at random)
  mezzanine: &mezzanine
    canonical: mezzanine
    abbreviated: mezz
    sample: true
    canonical_probability: 0.8
    abbreviated_probability: 0.1
    sample_probability: 0.1
    # Mezzanine/Mezz 2 or Mezzanine/Mezz A
    numeric:
      direction: left
    # M2
    numeric_affix:
      affix: m
      direction: left
    # 2nd Mezzanine
    ordinal:
      direction: right
    # Floor 0.5 is just plain mezzanine, no number
    number_abs_value: true
    number_min_abs_value: 1
    standalone_probability: 0.5
    numeric_probability: 0.1
    numeric_affix_probability: 0.1
    ordinal_probability: 0.3

  mezzanine_floor: &mezzanine_floor
    canonical: mezzanine floor
    abbreviated: mezz floor
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.2
    sample_probability: 0.1

  mezzanine_level: &mezzanine_level
    canonical: mezzanine level
    abbreviated: mezz level
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.2
    sample_probability: 0.1

  lower_mezzanine: &lower_mezzanine
    canonical: lower mezzanine
    abbreviated: lower mezz
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.2
    sample_probability: 0.1

  upper_mezzanine: &upper_mezzanine
    canonical: upper mezzanine
    abbreviated: upper mezz
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.2
    sample_probability: 0.1
    # Should be at least level 1.5
    number_min_abs_value: 1

  # NOTE(review): half_floors, half_floors_negative and top are assumed to be
  # entries of this aliases map alongside the numeric keys - confirm.
  aliases:
    "<-1":
      default: *basement
      probability: 0.6
      alternatives:
        - alternative: *sub_basement
          probability: 0.3995
        - alternative: *floor
          probability: 0.0005
    "-1":
      default: *basement
      probability: 0.7
      alternatives:
        - alternative: *cellar
          probability: 0.1
        - alternative: *lower_ground_floor
          probability: 0.1
        - alternative: *downstairs
          probability: 0.0495
        - alternative: *lower_level
          probability: 0.05
        - alternative: *floor
          probability: 0.0005
    # Special token for half-floors
    half_floors:
      default: *mezzanine
      probability: 0.8
      alternatives:
        - alternative: *mezzanine_floor
          probability: 0.1
        - alternative: *mezzanine_level
          probability: 0.1
      aliases:
        "1":
          default: *upper_mezzanine
          probability: 0.5
          alternatives:
            - alternative: *mezzanine
              probability: 0.5
    half_floors_negative:
      default: *lower_mezzanine
    "0":
      default: *ground_floor
      probability: 0.9
      alternatives:
        - alternative: *ground
          probability: 0.02
        - alternative: *ground_level
          probability: 0.01
        - alternative: *lower_ground_floor
          probability: 0.025
        - alternative: *upper_ground_floor
          probability: 0.025
        - alternative: *lobby
          probability: 0.005
        - alternative: *floor
          # Floor 0 is uncommon
          probability: 0.01
        - alternative: *level
          probability: 0.005
    "1":
      # Most of the time just say 1st Floor
      default: *floor
      probability: 0.9
      alternatives:
        - alternative: *upper_ground_floor
          probability: 0.075
        - alternative: *podium_level
          probability: 0.01
        - alternative: *podium
          probability: 0.005
        - alternative: *upstairs
          probability: 0.01
    top:
      default: *floor
      probability: 0.85
      alternatives:
        - alternative: *level
          probability: 0.1
        - alternative: *top_floor
          probability: 0.05

  # Integer for whether floors start at 0 or 1
  numbering_starts_at: 0

  # Associated phrases for alphanumeric floors (Floor 1, Floor A)
  alphanumeric:
    default: *floor
    probability: 0.8
    add_number_phrase: true
    add_number_phrase_probability: 0.3
    alternatives:
      - alternative: *level
        probability: 0.15
      - alternative: *platform
        probability: 0.025
      - alternative: *storey
        probability: 0.025
    numeric_probability: 0.96  # With this probability, pick an integer
    alpha_probability: 0.0098  # With this probability, pick a letter e.g. Floor A
    numeric_plus_alpha_probability: 0.0001  # e.g. Floor 2A
    alpha_plus_numeric_probability: 0.0001  # e.g. Floor A2
    hyphenated_number_probability: 0.03  # e.g. 11-10
    numeric_plus_alpha:
      whitespace_probability: 0.1
    alpha_plus_numeric:
      whitespace_probability: 0.1
    hyphenated_number:
      range_probability: 0.5
      direction: right
      direction_probability: 0.6
# Intersections
# =============
# For constructing intersections like 5th Avenue & Broadway
# In OSM, a node that's part of two ways is an intersection.
#
# These simple rules make it possible to create training examples
# like: 26th/road Street/road and/intersection 6th/road Avenue/road
cross_streets:
  # 26th & 6th Avenue
  and: *and
  # 26th @ Broadway
  at: &at
    canonical: at
    abbreviated: "@"
    canonical_probability: 0.7
    abbreviated_probability: 0.3
    sample: true
  corner_of: &corner_of
    canonical: corner of
  at_the_corner_of: &at_the_corner_of
    canonical: at the corner of
  x: &x
    canonical: x
  intersection:
    default: *and
    probability: 0.7
    alternatives:
      - alternative: *at
        probability: 0.125
      - alternative: *x
        probability: 0.025
      - alternative: *corner_of
        probability: 0.1
      - alternative: *at_the_corner_of
        probability: 0.05
  # 26th betw 5th Ave and 6th Ave
  between:
    canonical: between
    abbreviated: betw
    canonical_probability: 0.5
    abbreviated_probability: 0.5
    sample: true
    parentheses_probability: 0.5  # Probability of using parentheses e.g. (between 5th and 6th)
# PO Box addresses
# ================
# For PO box addresses, there's almost no data in OSM, so we'll need to
# generate them somewhat randomly.
#
# The strategy is: for every amenity=post_office, generate a number of PO box
# addresses using random numbers (and some alpha-numerics so we capture patterns
# like PO Box 1Q, etc.) It doesn't matter if the post boxes themselves actually
# exist, as long as they cover the patterns of digits we expect in real addresses.
# The parser cares more about how many digits a number has and the surrounding
# words/phrases than the specific number i.e. numbers in the range 1000-9999
# can simply be normalized to DDDD.
po_boxes:
  po_box: &po_box
    canonical: post office box
    abbreviated: p.o. box
    sample: true
    canonical_probability: 0.01
    abbreviated_probability: 0.95
    sample_probability: 0.04
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.4  # PO Box #1234
  box: &box
    canonical: box
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.4  # Box #1234
  private_mail_box: &private_mail_box
    canonical: private mail box
    abbreviated: pmb
    prefer_abbreviated: true
    sample: true
    canonical_probability: 0.01
    abbreviated_probability: 0.95
    sample_probability: 0.04
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.4  # PMB #1234
  alphanumeric:
    # Don't sample all the forms in post_office.txt as many of the PO box
    # phrases appear only in Australia
    sample: false
    default: *po_box
    probability: 0.995
    alternatives:
      - alternative: *box
        probability: 0.005
    numeric_probability: 0.9  # PO Box 123
    alpha_probability: 0.05  # PO Box A
    numeric_plus_alpha_probability: 0.04  # PO Box 123G
    alpha_plus_numeric_probability: 0.01  # PO Box A123
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1
  # NOTE(review): "digits" and "zones" are assumed to be direct children of
  # po_boxes (siblings of "alphanumeric") - confirm against the consumer.
  digits:
    - length: 1
      probability: 0.05
    - length: 2
      probability: 0.1
    - length: 3
      probability: 0.2
    - length: 4
      probability: 0.5
    - length: 5
      probability: 0.1
    - length: 6
      probability: 0.05
  zones:
    # Overrides for commercial/office areas (landuse=commercial in OSM)
    commercial:
      default: *po_box
      probability: 0.7
      alternatives:
        - alternative: *private_mail_box
          probability: 0.2
        - alternative: *box
          probability: 0.1
# Categories
# ==========
# Use the operators "in" and "near" for building category queries
# such as "restaurants in Hackney, London"
categories:
  near:
    default:
      canonical: near
    probability: 0.8
    alternatives:
      - alternative:
          canonical: around
        probability: 0.2
  nearby:
    default:
      canonical: nearby
    probability: 0.6
    alternatives:
      - alternative:
          canonical: near here
        probability: 0.3
      - alternative:
          canonical: around here
        probability: 0.1
  near_me:
    default:
      canonical: near me
  in:
    default:
      canonical: in
  # Probabilities of each phrase
  near_probability: 0.35
  nearby_probability: 0.2
  near_me_probability: 0.1
  in_probability: 0.35
# Directions
# ==========
# Unit types, stairways, etc. may have a direction associated
# with them whether it's right/left or a cardinal direction
# like "East Entrance".
directions:
  right: &right
    canonical: right
    abbreviated: r
    canonical_probability: 0.7
    abbreviated_probability: 0.3
    numeric:
      direction: right
    numeric_affix:
      affix: r
      direction: right
      whitespace_probability: 0.05
    numeric_probability: 0.05
    numeric_affix_probability: 0.95
  left: &left
    canonical: left
    abbreviated: l
    canonical_probability: 0.7
    abbreviated_probability: 0.3
    numeric:
      direction: right
    numeric_affix:
      affix: l
      direction: right
      whitespace_probability: 0.05
    numeric_probability: 0.05
    numeric_affix_probability: 0.95
  rear: &rear
    canonical: rear
    abbreviated: r
    canonical_probability: 0.8
    abbreviated_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: r
      direction: right
      whitespace_probability: 0.05
    numeric_probability: 0.05
    numeric_affix_probability: 0.95
  front: &front
    canonical: front
    abbreviated: frnt
    canonical_probability: 0.8
    abbreviated_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: f
      direction: right
      whitespace_probability: 0.05
    numeric_probability: 0.05
    numeric_affix_probability: 0.95
  alternatives:
    - alternative: *right
      probability: 0.45
    - alternative: *left
      probability: 0.45
    - alternative: *front
      probability: 0.05
    - alternative: *rear
      probability: 0.05
  anteroposterior:
    alternatives:
      - alternative: *front
        probability: 0.5
      - alternative: *rear
        probability: 0.5
  lateral:
    alternatives:
      - alternative: *left
        probability: 0.5
      - alternative: *right
        probability: 0.5
# Cardinal directions (north/south/east/west), referenced by alias from the
# entrance and staircase "directional" modifiers.
cardinal_directions:
  east: &east
    canonical: east
    abbreviated: e
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.2
    sample_probability: 0.1
    numeric:
      direction: right
    numeric_affix:
      affix: e
      direction: right
    numeric_probability: 0.6
    numeric_affix_probability: 0.4
  west: &west
    canonical: west
    abbreviated: w
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.2
    sample_probability: 0.1
    numeric:
      direction: right
    numeric_affix:
      affix: w
      direction: right
    numeric_probability: 0.6
    numeric_affix_probability: 0.4
  north: &north
    canonical: north
    # Quoted: a bare n is a boolean under the YAML 1.1 type rules
    abbreviated: "n"
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.2
    sample_probability: 0.1
    numeric:
      direction: right
    numeric_affix:
      affix: "n"
      direction: right
    numeric_probability: 0.6
    numeric_affix_probability: 0.4
  south: &south
    canonical: south
    abbreviated: s
    sample: true
    canonical_probability: 0.7
    abbreviated_probability: 0.2
    sample_probability: 0.1
    numeric:
      direction: right
    numeric_affix:
      affix: s
      direction: right
    numeric_probability: 0.6
    numeric_affix_probability: 0.4
  alternatives:
    - alternative: *north
      probability: 0.25
    - alternative: *east
      probability: 0.25
    - alternative: *south
      probability: 0.25
    - alternative: *west
      probability: 0.25
# Entrance
# ========
# For deriving strings like "North Entrance"
entrances:
  entrance: &entrance
    canonical: entrance
    abbreviated: ent
    sample: true
    canonical_probability: 0.8
    abbreviated_probability: 0.2
    numeric:
      direction: left
  # Entrance 1, Entrance A, etc.
  alphanumeric: &entrance_alphanumeric
    default: *entrance
    numeric_probability: 0.1  # e.g. Entrance 1
    alpha_probability: 0.85  # e.g. Entrance A
    numeric_plus_alpha_probability: 0.025  # e.g. 1A
    alpha_plus_numeric_probability: 0.025  # e.g. A1
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1
  # NOTE(review): nesting restored from a flattened copy. "directional" is
  # assumed to be a sibling of "alphanumeric", and direction/direction_probability
  # are kept under "modifier" because they follow it in the source; the
  # staircases section lists them before "modifier" instead - confirm which
  # layout the consumer expects.
  directional:
    modifier:
      direction: left  # e.g. North Entrance
      direction_probability: 0.9
      alternatives:
        - alternative: *north
        - alternative: *south
        - alternative: *east
        - alternative: *west
        - alternative: *right
        - alternative: *left
        - alternative: *rear
        - alternative: *front
        - alternative:
            canonical: freight
# Staircase
# =========
# For deriving strings like "Staircase A" in apartment buildings
staircases:
  stair: &stair
    canonical: stair
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
    numeric:
      direction: left
  staircase: &staircase
    canonical: staircase
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
    numeric:
      direction: left
  stairway: &stairway
    canonical: stairway
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
    numeric:
      direction: left
  stairwell: &stairwell
    canonical: stairwell
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
    numeric:
      direction: left
  alphanumeric: &staircase_alphanumeric
    # For alphanumerics, Stair A, Stair 1, etc.
    default: *stair
    probability: 0.4
    alternatives:
      - alternative: *staircase
        probability: 0.2
      - alternative: *stairway
        probability: 0.2
      - alternative: *stairwell
        probability: 0.2
    numeric_probability: 0.1  # e.g. Staircase 1
    alpha_probability: 0.85  # e.g. Staircase A
    numeric_plus_alpha_probability: 0.025  # e.g. 1A
    alpha_plus_numeric_probability: 0.025  # e.g. A1
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1
  # NOTE(review): "directional" is assumed to be a sibling of "alphanumeric";
  # nesting restored from a flattened copy - confirm.
  directional:
    direction: left  # e.g. Left Staircase, North Tower
    direction_probability: 0.7
    modifier:
      alternatives:
        - alternative: *north
        - alternative: *south
        - alternative: *east
        - alternative: *west
        - alternative: *right
        - alternative: *left
        - alternative: *rear
        - alternative: *front
# Unit types
# ==========
# Unit information is common in residential addresses, offices, business parks, etc.
# Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
# refer to the type of unit (Apartment, Flat, Suite, etc.)
units:
  # Special terms
  suite: &suite
    canonical: suite
    abbreviated: ste
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    plural:
      canonical: suites
      abbreviated: stes
      canonical_probability: 0.6
      abbreviated_probability: 0.4
    numeric:
      direction: left
      # Suite #101 and Suite No. 101 as opposed to Suite 101
      add_number_phrase: true
      add_number_phrase_probability: 0.5

  penthouse: &penthouse
    canonical: penthouse
    abbreviated: ph
    sample: true
    canonical_probability: 0.5
    abbreviated_probability: 0.3
    sample_probability: 0.2
    plural:
      canonical: penthouses
    standalone_probability: 1.0

  # Numbered variant: merges every key of *penthouse, then overrides below
  penthouse_numeric: &penthouse_numeric
    <<: *penthouse
    numeric:
      direction: left
      # Penthouse #1 and Penthouse No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.2
    numeric_probability: 1.0
    standalone_probability: 0.0

  top_left: &top_left
    canonical: top left
    abbreviated: t/l
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.3
    sample_probability: 0.3

  top_right: &top_right
    canonical: top right
    abbreviated: t/r
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.3
    sample_probability: 0.3

  top_floor_right: &top_floor_right
    canonical: top floor right
    abbreviated: tfr
    sample: true
    canonical_probability: 0.2
    abbreviated_probability: 0.5
    sample_probability: 0.3

  top_floor_left: &top_floor_left
    canonical: top floor left
    abbreviated: tfl
    sample: true
    canonical_probability: 0.2
    abbreviated_probability: 0.5
    sample_probability: 0.3

  office: &office
    canonical: office
    abbreviated: ofc
    sample: true
    canonical_probability: 0.5
    abbreviated_probability: 0.3
    sample_probability: 0.2
    plural:
      canonical: offices
      abbreviated: ofcs
      canonical_probability: 0.4
      abbreviated_probability: 0.6
    numeric:
      direction: left
      # Office #1 and Office No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.7

  door: &door
    canonical: door
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    plural:
      canonical: doors
    numeric:
      direction: left
      # Door #1 and Door No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.2

  room: &room
    canonical: room
    abbreviated: rm
    sample: true
    canonical_probability: 0.5
    abbreviated_probability: 0.5
    plural:
      canonical: rooms
      abbreviated: rms
      canonical_probability: 0.6
      abbreviated_probability: 0.4
    numeric:
      direction: left
      # Room #1 and Room No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.6

  hall: &hall
    canonical: hall
    plural:
      canonical: halls
    numeric:
      direction: left
      # Hall #1 and Hall No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.6

  apartment: &apartment
    canonical: apartment
    abbreviated: apt
    prefer_abbreviated: true
    sample: true
    canonical_probability: 0.15
    abbreviated_probability: 0.6
    sample_probability: 0.25
    plural:
      canonical: apartments
      abbreviated: apts
      canonical_probability: 0.2
      # Was a second "abbreviated" key (duplicate) holding 0.8; every sibling
      # plural block names this field abbreviated_probability.
      abbreviated_probability: 0.8
    numeric:
      direction: left
      # Apt #1 and Apt No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.4

  flat: &flat
    canonical: flat
    abbreviated: flt
    sample: true
    canonical_probability: 0.8
    abbreviated_probability: 0.15
    sample_probability: 0.05
    plural:
      canonical: flats
      abbreviated: flts
      canonical_probability: 0.8
      abbreviated_probability: 0.2
    numeric:
      direction: left
      # Flat #1 and Flat No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.4

  lot: &lot
    canonical: lot
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
    plural:
      canonical: lots
    numeric:
      direction: left
      # Lot #1 and Lot No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.6

  parcel: &parcel
    canonical: parcel
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
    plural:
      canonical: parcels
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.6

  unit: &unit
    canonical: unit
    abbreviated: u
    sample: true
    canonical_probability: 0.8
    abbreviated_probability: 0.1
    sample_probability: 0.1
    plural:
      canonical: units
    numeric:
      direction: left
      # Unit #1 and Unit No. 1
      add_number_phrase: true
      add_number_phrase_probability: 0.4

  alphanumeric: &unit_alphanumeric
    # Many unit types that apply only in Australia
    # For most English-speaking countries, only use the terms defined above
    sample: false
    default: *flat
    probability: 0.4
    alternatives:
      - alternative: *unit
        probability: 0.25
      # e.g. just plain #3 or No. 4
      - alternative: *number
        probability: 0.2
      - alternative: *lot
        probability: 0.03
      - alternative: *door
        probability: 0.01
      - alternative: *penthouse_numeric
        probability: 0.01
      - alternative: *apartment
        probability: 0.1
    numeric_probability: 0.87  # e.g. Flat 1
    numeric_plus_alpha_probability: 0.03  # e.g. 1A
    alpha_plus_numeric_probability: 0.03  # e.g. A1
    alpha_probability: 0.04  # e.g. Flat A
    hyphenated_number_probability: 0.03  # e.g. 11-10
    alpha_plus_numeric:
      whitespace_probability: 0.2
      hyphen_probability: 0.2
    numeric_plus_alpha:
      whitespace_probability: 0.2
      hyphen_probability: 0.2
    hyphenated_number:
      range_probability: 0.5
      direction: right
      direction_probability: 0.6
    # Separate random probability for adding directions like 2L, 2R, etc.
    add_direction: true
    add_direction_probability: 0.1
    # Add directions for plain numbers
    add_direction_numeric: true
    # Add direction only e.g. Unit Left
    add_direction_standalone: true
    # Separate random probability for adding quadrant units like 2RF 2RR 2LF 2LR
    add_quadrant: true
    add_quadrant_probability: 0.001
    add_quadrant_first_direction: lateral
    add_quadrant_numeric: true
    add_quadrant_standalone: true
    # If there are 10 floors, create unit numbers like #301 or #1032
    use_floor_probability: 0.35

  zones:
    residential: *unit_alphanumeric
    commercial:
      default: *office
      probability: 0.6
      alternatives:
        - alternative: *number
          probability: 0.2
        - alternative: *suite
          probability: 0.2
    industrial:
      default: *lot
      probability: 0.5
      alternatives:
        - alternative: *suite
          probability: 0.3
        - alternative: *unit
          probability: 0.19
        - alternative: *parcel
          probability: 0.01
    university:
      default: *room
      probability: 0.9
      alternatives:
        - alternative: *hall
          probability: 0.1

  # NOTE(review): nesting restored from a flattened copy; "allotments" is
  # assumed to be a direct child of units (sibling of zones) - confirm.
  allotments:
    lot:
      default: *lot
      numeric_probability: 0.8
      alphanumeric_probability: 0.1
      alpha_probability: 0.1
    parcel:
      default: *parcel
      numeric_probability: 0.3
      alphanumeric_probability: 0.3
      alpha_probability: 0.4
    lot_probability: 0.9
    parcel_probability: 0.06
    lot_plus_parcel_probability: 0.02
    parcel_plus_lot_probability: 0.02

  standalone:
    sample: false
    default: *penthouse
    probability: 0.4
    alternatives:
      - alternative: *top_right
        probability: 0.15
      - alternative: *top_left
        probability: 0.15
      - alternative: *top_floor_left
        probability: 0.15
      - alternative: *top_floor_right
        probability: 0.15
# Country-specific overrides
# ==========================
# For each country, we allow a copy of the structures listed above
# in order to override the default values
countries:
  # United States
  us:
    levels:
      storey: &story
        canonical: story
        numeric:
          direction: left
        ordinal:
          direction: right
        numeric_probability: 0.025  # e.g. Story 2, less common
        ordinal_probability: 0.975  # e.g. 2nd Story, more common
      alphanumeric:
        default: *floor
        probability: 0.8
        alternatives:
          - alternative: *level
            probability: 0.15
          - alternative: *platform
            probability: 0.025
          - alternative: *story
            probability: 0.025
      numbering_starts_at: 1
      aliases: &us_floor_aliases
        "1":
          default: *floor
          probability: 0.6
          alternatives:
            - alternative: *ground_floor
              probability: 0.3
            - alternative: *upper_ground_floor
              probability: 0.1
        "2":
          # Most of the time just say 2nd Floor
          default: *floor
          probability: 0.9
          alternatives:
            - alternative: *upstairs
              probability: 0.1
    po_boxes:
      concatenate_postcode:
        po_box_max_digits: 4  # For PO boxes with max n digits
        direction: left  # Concatenate on the left side of the PO box
        postcode_digits:
          length: 2  # use this many digits from the postal code
          direction: right
      concatenate_postcode_probability: 0.01
    postcodes:
      # NOTE(review): nesting below restored from a flattened copy; "digits" is
      # assumed to be a sibling of "append" and "pad" a child of "digits" -
      # confirm against the consumer.
      concatenate_po_box:
        append:
          separator: "-"  # Use a hyphen separator
          direction: right  # To the right of the postcode
        digits:
          length: 4  # number of digits to append to the ZIP code
          pad:
            direction: left  # left pad
            character: "0"  # pad with 0s, e.g. for PO Box 52, use -0052
      concatenate_po_box_probability: 0.1
    units:
      alphanumeric: &us_units_alphanumeric
        default: *apartment
        probability: 0.6
        alternatives:
          - alternative: *unit
            probability: 0.15
          - alternative: *number
            probability: 0.2
          - alternative: *lot
            probability: 0.03
          - alternative: *door
            probability: 0.005
          - alternative: *penthouse_numeric
            probability: 0.005
          - alternative: *flat
            probability: 0.01  # See this e.g. in Milwaukee with Polish flats
      zones: &us_zones
        residential: *us_units_alphanumeric
        commercial:
          # Suite is much more common in the US and Canada
          default: *suite
          probability: 0.5
          alternatives:
            - alternative: *number
              probability: 0.2
            - alternative: *office
              probability: 0.3

  # Canada
  # Specifically Canadian English. If the address is in French it will use fr.yaml
  ca:
    components:
      combinations:
        - components:
            - unit
            - house_number
          label: house_number
          separators:
            - separator: /
              probability: 0.04
            - separator: "-"
              probability: 0.95
            - separator: " - "
              probability: 0.01
          probability: 0.1
    levels:
      # Note: Canadian English uses "storey" keeping with the British convention, so no need to change that
      # In Canada first floor is the ground floor, as in the US
      numbering_starts_at: 1
      aliases: *us_floor_aliases
    # For (English-speaking) Canada, use the same unit types as in the US
    units:
      alphanumeric: *us_units_alphanumeric
      zones: *us_zones
      # For unit types like 2/34
      combined:
        component: house_number
        direction: left  # Apartment number goes to the left of the house number
        separators:
          - separator: /
            probability: 0.2
          - separator: "-"
            probability: 0.4
          - separator: " - "
            probability: 0.4
      standalone_probability: 0.15
      combined_probability: 0.1

  # Australia
  au:
    po_boxes: &australia_po_boxes
      alphanumeric:
        default: *po_box
        alternatives: []
        probability: 0.95
        # Australia has many strings for this e.g. Roadside Mail Bag
        sample: true
        sample_probability: 0.05
        numeric:
          direction: left
    units: &australia_unit_types
      alphanumeric:
        # Australia has all kinds of unit types (e.g. Marine Berth) not used elsewhere
        sample: true
        default: *flat
        # Reduce the default's probability to make room for sampling
        probability: 0.39
        sample_probability: 0.01
        numeric:
          direction: left
      standalone:
        default: *penthouse
        sample: true
        # Reduce the default's probability to make room for sampling
        probability: 0.39
        sample_probability: 0.01
        standalone_probability: 1.0
      # For unit types like 2/34
      combined:
        component: house_number
        direction: right  # Apartment number goes to the right of the house number
        separators:
          - separator: /
            probability: 0.8
          - separator: "-"
            probability: 0.1
          - separator: " - "
            probability: 0.1
      standalone_probability: 0.15
      combined_probability: 0.1

  # New Zealand - same rules as Australia
  nz:
    po_boxes: *australia_po_boxes
    units: *australia_unit_types