Reviewer note (text before the "diff --git" line is ignored by patch tools):
  * Re-flowed from a whitespace-collapsed copy of the patch. Line order, +/- markers and
    per-hunk line counts were checked against the hunk headers; indentation (4 spaces
    assumed) and nesting are inferred from key names and must be confirmed against the
    real file. Least certain placements: po_boxes.alphanumeric.digits, the
    "alternatives" lists under directional.modifier, and countries.us.postcodes.
  * Fixes relative to the submitted patch (added lines only):
      - units.zone.industrial: "alternatives: 0.05" -> "probability: 0.05"
      - countries.us.levels.alphanumeric: duplicate key "alternative: 0.05" -> "probability: 0.05"
      - units.unit: dropped the first of two duplicate add_number_phrase /
        add_number_phrase_probability pairs (kept the last-wins value 0.6);
        hunk headers adjusted (+627,249 -> +627,247 and +878 -> +876)
      - countries.us / countries.ca alias "2": comment said "1st Floor", now "2nd Floor"

diff --git a/resources/addresses/en.yaml b/resources/addresses/en.yaml
index 061756fc..b27d3550 100644
--- a/resources/addresses/en.yaml
+++ b/resources/addresses/en.yaml
@@ -1,26 +1,43 @@
 # en.yaml
 # -------
 # Supplement to the per-country address formats for English around the world.
-# Note: by default, we use the UK conventions as they cover more countries
-# US/Canada-specific conventions and any others (e.g. Hong Kong, Australia)
-# go in country overrides
-#
-# Remember, these rules only get applied sometimes with random probabilities
-# unless specified using "frequency: always" (which should almost never be used)
+# These configs are mostly used to generate training data we don't have from OSM
+# like flat/apartment numbers, intersections, etc. The configs aren't directly used by
+# the parser model itself, but can influence it as they affect its input.
+
+# Note: by default, we use the UK conventions for English as they cover more countries.
+# US/Canada-specific conventions and any others (e.g. Hong Kong, Australia) go in the
+# country overrides section. Each country can create its own copy of the entire top-level
+# structure and it will be recursively merged with the defaults.

 # Number
 # ======
 # Number, No., #, etc. can be used in both floor and apartment numbers,
 # so we'll define it separately

-number:
-    canonical: number  # canonical word in libpostal dictionary
-    abbreviated: "no"  # abbreviated form. no is a boolean in YAML, needs to be quoted
-    prefer_abbreviated: true  # Use abbreviated form more often than the canonical form
-    sample: true  # Randomly sample other variations (e.g. num, nr)
-numeric_abbreviated:
-    affix: "#"  # e.g. #3, #2F, etc.
-    direction: left  # affix goes on the number's left
+numbers:
+    default:
+        canonical: number  # canonical word in libpostal dictionary
+        abbreviated: "no"  # abbreviated form. no is a boolean in YAML, needs to be quoted
+        direction: left  # The phrase "number/no" goes to the left of the number
+        sample: true  # Randomly sample other variations (e.g. num, nr)
+        # Probabilities
+        canonical_probability: 0.3  # With this probability, use the canonical
+        abbreviated_probability: 0.5  # With this probability, use the abbreviated form
+        sample_probability: 0.2  # With this probability, sample other variations
+        sample_exclude:
+            - "#"  # Used in numeric affix
+        numeric:
+            direction: left
+        numeric_affix:
+            affix: "#"  # e.g. #3, #2F, etc.
+            direction: left  # affix goes on the number's left
+
+    # Probabilities for numbers
+    numeric_probability: 0.4  # With this probability, use the standard numeric
+    numeric_affix_probability: 0.6  # With this probability, use e.g. #3 instead of No. 3
+    affix_integers_only: false
+

 # Floor/level
 # ===========
@@ -41,95 +58,268 @@ numeric_abbreviated:
 # is designed to be cross-lingual, so we can use the same structure with different words
 # and do this for addresses in pretty much any language.

-level:
+levels:
     # Numbered floors
-    floor:
+    floor: &floor
         canonical: floor
         abbreviated: fl
+        canonical_probability: 0.5  # With this probability, use canonical version
+        abbreviated_probability: 0.4  # With this probability, use abbreviated version
+        sample_probability: 0.1  # With this probability, sample from the other forms
+        sample_exclude:
+            - / f  # Exclude this abbreviation since it's used as an affix
         sample: true
         # e.g. Floor 1
         numeric:
             direction: left  # Floor/Fl goes to the left of the number
-            use_number_phrase: true  # Occasionally add variation of "number", e.g. Floor #1, Floor No. 1
+            direction_probability: 0.8  # With 1 - this probability, Floor/Fl goes on the other side of the number
+            add_number_phrase: true  # Occasionally add variation of "number", e.g. Floor No. 1
+            add_number_phrase_probability: 0.4  # With this probability, use Floor No. 1 or Floor #1 vs. Floor 1
         # e.g. 2/F, 3/F
-        numeric_abbreviated:
+        numeric_affix:
             affix: /f
-            direction: right  # affix goes to number's right
+            direction: right  # affix goes to number's right (always)
         # e.g. 1st Floor
         ordinal:
+            direction: right  # canonical or abbreviated form goes to the ordinal's right
+        # Probabilities
+        numeric_probability: 0.75  # Use the simple number e.g. Floor 1 (or Floor No. 1)
+        numeric_affix_probability: 0.05  # Use the 2/F (less common)
+        ordinal_probability: 0.2  # Use the ordinal e.g. 1st Floor
+    # The word "level" is also occasionally used
+    level: &level
+        canonical: level
+        abbreviated: lvl
+        sample: true
+        canonical_probability: 0.5
+        abbreviated_probability: 0.3
+        sample_probability: 0.2
+        sample_exclude:
+            - / l  # Exclude this abbreviation since it's used as an affix
+        numeric:
+            direction: left  # Level/Lvl goes to the left of the number
+            direction_probability: 0.8  # With 1 - this probability, Level/Lvl goes on the other side of the number
+            add_number_phrase: true  # Occasionally add variation of "number", e.g. Level No. 1
+            add_number_phrase_probability: 0.4  # With this probability, use Level No. 1 or Level #1 vs. Level 1
+        # e.g. 2/L, 3/L (ambiguous with left)
+        numeric_affix:
+            affix: /l
             direction: right
+        ordinal:
+            direction: right
+        numeric_probability: 0.4
+        numeric_affix_probability: 0.05
+        ordinal_probability: 0.55
+    platform: &platform
+        canonical: platform
+        abbreviated: pf
+        canonical_probability: 0.7
+        abbreviated_probability: 0.3
+        numeric:
+            direction: left
+        ordinal:
+            direction: right
+        numeric_probability: 0.5  # e.g. Platform 1
+        ordinal_probability: 0.5  # e.g. 1st Platform
+    storey: &storey
+        canonical: storey
+        numeric:
+            direction: left
+        ordinal:
+            direction: right
+        numeric_probability: 0.1  # e.g. Storey 2, less common
+        ordinal_probability: 0.9  # e.g. 2nd Storey, more common
     # Special instructions for ground floor
-    ground_floor:
-        number: 0  # the 0th floor is typically the ground level in the UK/Commonwealth
+    ground_floor: &ground_floor
         canonical: ground floor
         abbreviated: g/f
+        canonical_probability: 0.4
+        abbreviated_probability: 0.4
+        sample_probability: 0.2
         sample: true
     # Special instructions for lower ground floor (added randomly, not an alias for a floor number)
-    lower_ground_floor:
+    lower_ground_floor: &lower_ground_floor
         canonical: lower ground floor
         abbreviated: lg
         sample: true
+        # Probabilities
+        canonical_probability: 0.6
+        abbreviated_probability: 0.3
+        sample_probability: 0.1
     # Special instructions for upper ground floor (added randomly, not an alias for a floor number)
-    upper_ground_floor:
+    upper_ground_floor: &upper_ground_floor
         canonical: upper ground floor
         abbreviated: ug
         sample: true
+        # Probabilities
+        canonical_probability: 0.6
+        abbreviated_probability: 0.2
+        sample_probability: 0.2
+    upstairs: &upstairs
+        canonical: upstairs
+    downstairs: &downstairs
+        canonical: downstairs
     # Special instructions for podium level (added randomly)
-    podium_level:
-        canonical: podium level
-        abbreviated: pd lvl
-        sample: true
-        alternates:
-            - canonical: podium
-              abbreviated: pd
-              sample: true
+    podium_level: &podium_level
+        default:
+            canonical: podium level
+            abbreviated: pd lvl
+            sample: true
+            canonical_probability: 0.6
+            abbreviated_probability: 0.2
+            sample_probability: 0.2
+        probability: 0.8  # Probability of using the default form
+        alternatives:
+            - alternative:
+                  canonical: podium
+                  abbreviated: pd
+                  sample: true
+                  canonical_probability: 0.6
+                  abbreviated_probability: 0.2
+                  sample_probability: 0.2
+              probability: 0.2  # Probability of using this alternative
     # Used when floor number is < 0 (starts at -1 in all countries)
-    basement:
+    basement: &basement
         canonical: basement
         abbreviated: bsmt
         sample: true
         # e.g. Basement 1
-        numeric:
+        alphanumeric:
             direction: left
         # e.g. B1
-        numeric_abbreviated:
+        numeric_affix:
             affix: b
             direction: left
         # e.g. 2nd Basement
         ordinal:
             direction: right
+    cellar: &cellar
+        canonical: cellar
+        sample: true
+        canonical_probability: 0.8
+        sample_probability: 0.2
     # Floor number of <= -2 can be basement 2, sub-basement, sub-basement 1, etc.
-    sub_basement:
+    sub_basement: &sub_basement
         canonical: sub basement
         abbreviated: sb
         sample: true
         # e.g. Sub-basement 1
-        numeric:
+        alphanumeric:
             direction: left
         # e.g. SB1
-        numeric_abbreviated:
+        numeric_affix:
             affix: sb
             direction: left
         # e.g. 2nd Sub-basement
         ordinal:
             direction: right
+    top_floor: &top_floor
+        canonical: top floor
+        abbreviated: tf
+        sample: true
+        canonical_probability: 0.6
+        abbreviated_probability: 0.3
+        sample_probability: 0.1
     # Mezzanine level (floor number {0.5, 1.5, ...}, also be added at random)
-    mezzanine:
+    mezzanine: &mezzanine
         # Floor 0.5 is just plain mezzanine, no number
         canonical: mezzanine
         abbreviated: mezz
         half_floors: true
         sample: true
-        # Mezzanine/Mezz 2
-        numeric:
+        canonical_probability: 0.6
+        abbreviated_probability: 0.3
+        sample_probability: 0.1
+        # Mezzanine/Mezz 2 or Mezzanine/Mezz A
+        alphanumeric:
             direction: left
         # M2
-        numeric_abbreviated:
+        numeric_affix:
             affix: m
             direction: left
         # 2nd Mezzanine
         ordinal:
             direction: right
+        alphanumeric_probability: 0.3
+        numeric_affix_probability: 0.2
+        ordinal_probability: 0.5
+    aliases:
+        "-1":
+            default: *basement
+            probability: 0.7
+            alternatives:
+                - alternative: *cellar
+                  probability: 0.125
+                - alternative: *lower_ground_floor
+                  probability: 0.1
+                - alternative: *downstairs
+                  probability: 0.05
+                - alternative: *floor
+                  probability: 0.025
+        # Special token for half-floors
+        "*.5":
+            default: *mezzanine
+        "0":
+            default: *ground_floor
+            probability: 0.8
+            alternatives:
+                - alternative: *upper_ground_floor
+                  probability: 0.1
+                - alternative: *downstairs
+                  probability: 0.05
+                - alternative: *podium_level
+                  probability: 0.025
+                - alternative: *floor
+                  # Floor 0 is uncommon
+                  probability: 0.025
+        "1":
+            # Most of the time just say 1st Floor
+            default: *floor
+            probability: 0.9
+            alternatives:
+                - alternative: *upstairs
+                  probability: 0.1
+        top:
+            default: *floor
+            probability: 0.5
+            alternatives:
+                - alternative: *top_floor
+                  probability: 0.3
+
+    # Associated phrases for alphanumeric floors (Floor 1, Floor A)
+    alphanumeric:
+        default: *floor
+        probability: 0.8
+        add_number_phrase: true
+        add_number_phrase_probability: 0.3
+        alternatives:
+            - alternative: *level
+              probability: 0.15
+            - alternative: *platform
+              probability: 0.025
+            - alternative: *storey
+              probability: 0.025
+        numeric_probability: 0.99  # With this probability, pick an integer
+        alpha_probability: 0.0098  # With this probability, pick a letter e.g. Floor A
+        numeric_plus_alpha_probability: 0.0001  # e.g. Floor 2A
+        alpha_plus_numeric_probability: 0.0001  # e.g. Floor A2
+
+    alphanumeric_probability: 0.5  # Probability of using simple alphanumeric
+    alias_probability: 0.5  # Probability of using aliases
+
+    # Floors are not part of the global address formats (and are not always standard)
+    # This is a list of places in the address where the floor number might go
+    order:
+        # e.g. 123 East 45th St, 6th Floor, NYC
+        - after: road
+          probability: 0.5
+        # e.g. Floor 1, Da Vinci House, 44 Saffron Hill, London
+        - before: house
+          probability: 0.25
+        # e.g. Da Vinci House, 1st Floor, 44 Saffron Hill, London
+        - before: road
+          probability: 0.25
+

 # Intersections
 # =============
@@ -144,17 +334,24 @@ intersections:
     and:
         canonical: and
         abbreviated: "&"
+        canonical_probability: 0.4
+        abbreviated_probability: 0.6
         sample: true
     # 26th @ Broadway
     at:
         canonical: at
         abbreviated: "@"
+        canonical_probability: 0.7
+        abbreviated_probability: 0.3
         sample: true
     # 26th betw 5th Ave and 6th Ave
     between:
         canonical: between
         abbreviated: betw
+        canonical_probability: 0.5
+        abbreviated_probability: 0.5
         sample: true
+        parentheses_probability: 0.5  # Probability of using parentheses e.g. (between 5th and 6th)

 # PO Box addresses
 # ================
@@ -169,24 +366,63 @@ intersections:
 # words/phrases than the specific number i.e. numbers in the range 1000-9999
 # can simply be normalized to DDDD.

-po_box:
-    numeric:
+po_boxes:
+    po_box: &po_box
+        canonical: post office box
+        abbreviated: po box
+        sample: true
+        prefer_abbreviated: true
+
+    box: &box
+        canonical: box
+        sample: true
+
+    private_mail_box: &private_mail_box
+        canonical: private mail box
+        abbreviated: pmb
+        prefer_abbreviated: true
+        sample: true
+
+    alphanumeric:
         # Don't sample all the forms in post_office.txt as many of the PO box
         # phrases appear only in Australia
         sample: false
-        default:
-            canonical: post office box
-            abbreviated: po box
-            sample: true
-            prefer_abbreviated: true
-        # Alternative phrases to use
-        alternates:
-            - canonical: box
-              sample: true
-            - canonical: private mail box
-              abbreviated: pmb
-              prefer_abbreviated: true
-              sample: true
+        default: *po_box
+        probability: 0.95
+        alternatives:
+            - alternative: *box
+              probability: 0.05
+
+        add_number_phrase: true
+        add_number_phrase_probability: 0.4  # PO Box #1234
+
+        numeric_probability: 0.9  #
+        alpha_probability: 0.01  # PO Box A
+        numeric_plus_alpha_probability: 0.04  # PO Box 123G
+
+        digits:
+            - length: 1
+              probability: 0.1
+            - length: 2
+              probability: 0.1
+            - length: 3
+              probability: 0.1
+            - length: 4
+              probability: 0.5
+            - length: 5
+              probability: 0.1
+            - length: 6
+              probability: 0.05
+
+    # Overrides for commercial/office areas (landuse=commercial in OSM)
+    commercial:
+        default: *po_box
+        probability: 0.7
+        alternatives:
+            - alternative: *private_mail_box
+              probability: 0.2
+            - alternative: *box
+              probability: 0.1

 # Categories
 # ==========
@@ -195,18 +431,33 @@ po_box:

 categories:
     near:
-        canonical: near
-        alternates:
-            - around
+        default:
+            canonical: near
+        probability: 0.8
+        alternatives:
+            - alternative:
+                  canonical: around
+              probability: 0.2
     nearby:
-        canonical: nearby
-        alternates:
-            - near here
-            - around here
+        default:
+            canonical: nearby
+        probability: 0.6
+        alternatives:
+            - alternative:
+                  canonical: near here
+              probability: 0.3
+            - alternative:
+                  canonical: around here
+              probability: 0.1
     near_me:
         canonical: near me
     in:
         canonical: in
+    # Probabilities of each phrase
+    near_probability: 0.35
+    nearby_probability: 0.2
+    near_me_probability: 0.1
+    in_probability: 0.35

 # Directions
 # ==========
@@ -215,44 +466,76 @@ categories:
 # like "East Entrance".

 directions:
-    right:
+    right: &right
         canonical: right
-        numeric_abbreviated:
+        abbreviated: r
+        canonical_probability: 0.7
+        abbreviated_probability: 0.3
+        numeric_affix:
             affix: r
             direction: right
-    left:
+    left: &left
         canonical: left
-        numeric_abbreviated:
+        abbreviated: l
+        canonical_probability: 0.7
+        abbreviated_probability: 0.3
+        numeric_affix:
             affix: l
             direction: right
-    rear:
+    rear: &rear
         canonical: rear
-        numeric_abbreviated:
+        abbreviated: r
+        canonical_probability: 0.8
+        abbreviated_probability: 0.2
+        numeric_affix:
             affix: r
             direction: right
-    front:
+    front: &front
         canonical: front
-        numeric_abbreviated:
+        abbreviated: frnt
+        canonical_probability: 0.8
+        abbreviated_probability: 0.2
+        numeric_affix:
             affix: f
             direction: right
-    east:
+    east: &east
         canonical: east
-        numeric_abbreviated:
+        abbreviated: e
+        sample: true
+        canonical_probability: 0.5
+        abbreviated_probability: 0.3
+        sample_probability: 0.2
+        numeric_affix:
             affix: e
             direction: right
-    west:
+    west: &west
         canonical: west
-        numeric_abbreviated:
+        abbreviated: w
+        sample: true
+        canonical_probability: 0.5
+        abbreviated_probability: 0.3
+        sample_probability: 0.2
+        numeric_affix:
             affix: w
             direction: right
-    north:
+    north: &north
         canonical: north
-        numeric_abbreviated:
+        abbreviated: n
+        sample: true
+        canonical_probability: 0.5
+        abbreviated_probability: 0.3
+        sample_probability: 0.2
+        numeric_affix:
             affix: n
             direction: right
-    south:
+    south: &south
         canonical: south
-        numeric_abbreviated:
+        abbreviated: s
+        sample: true
+        canonical_probability: 0.5
+        abbreviated_probability: 0.3
+        sample_probability: 0.2
+        numeric_affix:
             affix: s
             direction: right

@@ -260,23 +543,82 @@ directions:
 # ========
 # For deriving strings like "North Entrance"

-entrance:
-    canonical: entrance
-    abbreviated: ent
-    sample: true
+entrances:
+    entrance: &entrance
+        canonical: entrance
+        abbreviated: ent
+        sample: true
+        canonical_probability: 0.8
+        abbreviated_probability: 0.2
+
+    # Entrance 1, Entrance A, etc.
+    alphanumeric: &entrance_alphanumeric
+        default: *entrance
+
+    directional:
+        base: *entrance_alphanumeric
+        modifier:
+            direction: left  # e.g. North Entrance
+            direction_probability: 0.9
+            alternatives:
+                - alternative: *north
+                - alternative: *south
+                - alternative: *east
+                - alternative: *west
+                - alternative: *right
+                - alternative: *left
+                - alternative: *rear
+                - alternative: *front
+                - alternative:
+                      canonical: freight

 # Staircase
 # =========
 # For deriving strings like "Staircase A" in apartment buildings

-staircase:
-    canonical: stair
-    sample: true
-    alternates:
-        - canonical: stairway
-          sample: true
-        - canonical: staircase
-          sample: true
+staircases:
+    stair: &stair
+        canonical: stair
+        sample: true
+
+    staircase: &staircase
+        canonical: staircase
+        sample: true
+
+    stairway: &stairway
+        canonical: stairway
+        sample: true
+
+    stairwell: &stairwell
+        canonical: stairwell
+        sample: true
+
+    alphanumeric: &staircase_alphanumeric
+        # For alphanumerics, Stair A, Stair 1, etc.
+        default: *stair
+        probability: 0.4
+        alternatives:
+            - alternative: *staircase
+              probability: 0.2
+            - alternative: *stairway
+              probability: 0.2
+            - alternative: *stairwell
+              probability: 0.2
+
+    directional:
+        base: *staircase_alphanumeric
+        modifier:
+            direction: left  # e.g. Left Staircase
+            direction_probability: 0.7
+            alternatives:
+                - alternative: *north
+                - alternative: *south
+                - alternative: *east
+                - alternative: *west
+                - alternative: *right
+                - alternative: *left
+                - alternative: *rear
+                - alternative: *front


 # Unit types
@@ -285,68 +627,247 @@ staircase:
 # Just like thoroughfare types (Street, Avenue, etc.), there are many common ways to
 # refer to the

-unit_types:
+units:
     # Units are not part of the global address formats (and are not always standard)
     # This is a list of places in the address where the unit line might go
     order:
         # e.g. Flat 18, Da Vinci House, 44 Saffron Hill, London
         - before: house
+          probability: 0.4
         # e.g. Da Vinci House, Flat 18, 44 Saffron Hill, London
         - before: road
-        # e.g. 123 East 45th St, Apt 6, NYC
-        - after: road
+          probability: 0.2
+        # e.g. Floor 5, Apt 6
+        - after: level
+          probability: 0.3
+        # e.g. Apt. 6, 5/F (less common)
+        - before: level
+          probability: 0.1

+    # Special terms
     suite: &suite
         canonical: suite
+        plural: suites
         abbreviated: ste
-        sample: true
+        sample: true
+        canonical_probability: 0.4
+        abbreviated_probability: 0.4
+        sample_probability: 0.2
+        # Suite #101 and Suite No. 101 as opposed to Suite 101
+        add_number_phrase: true
+        add_number_phrase_probability: 0.5
     penthouse: &penthouse
         canonical: penthouse
         abbreviated: ph
         sample: true
+        canonical_probability: 0.5
+        abbreviated_probability: 0.3
+        sample_probability: 0.2
+        # Penthouse #1 and Penthouse No. 1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.2
+    top_left: &top_left
+        canonical: top left
+        abbreviated: t/l
+        sample: true
+        canonical_probability: 0.4
+        abbreviated_probability: 0.3
+        sample_probability: 0.3
+    top_right: &top_right
+        canonical: top right
+        abbreviated: t/r
+        sample: true
+        canonical_probability: 0.4
+        abbreviated_probability: 0.3
+        sample_probability: 0.3
+    top_floor_right: &top_floor_right
+        canonical: top floor right
+        abbreviated: tfr
+        sample: true
+        canonical_probability: 0.2
+        abbreviated_probability: 0.5
+        sample_probability: 0.3
+    top_floor_left: &top_floor_left
+        canonical: top floor left
+        abbreviated: tfl
+        sample: true
+        canonical_probability: 0.2
+        abbreviated_probability: 0.5
+        sample_probability: 0.3
     office: &office
         canonical: office
+        plural: offices
         abbreviated: ofc
         sample: true
+        canonical_probability: 0.5
+        abbreviated_probability: 0.3
+        sample_probability: 0.2
+        # Office #1 and Office No. 1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.7
     door: &door
         canonical: door
+        plural: doors
         sample: true
+        canonical_probability: 0.8
+        sample_probability: 0.2
+        # Door #1 and Door No. 1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.2
     room: &room
         canonical: room
+        plural: rooms
         abbreviated: rm
         sample: true
+        canonical_probability: 0.5
+        abbreviated_probability: 0.5
+        # Room #1 and Room No. 1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.6
     apartment: &apartment
         canonical: apartment
+        plural: apartments
         abbreviated: apt
         prefer_abbreviated: true
         sample: true
+        canonical_probability: 0.15
+        abbreviated_probability: 0.6
+        sample_probability: 0.25
+        # Apt #1 and Apt No. 1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.4
     flat: &flat
         canonical: flat
+        plural: flats
         abbreviated: fl
         sample: true
+        canonical_probability: 0.8
+        abbreviated_probability: 0.1
+        sample_probability: 0.1
+        # Flat #1 and Flat No. 1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.4
     lot: &lot
         canonical: lot
+        plural: lots
         sample: true
+        canonical_probability: 0.9
+        sample_probability: 0.1
+        # Lot #1 and Lot No. 1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.6
+    parcel: &parcel
+        canonical: parcel
+        canonical_probability: 0.9
+        sample: true
+        sample_probability: 0.1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.6
     unit: &unit
         canonical: unit
         abbreviated: u
-        use_number_phrase: true
         sample: true
-    numeric:
+        canonical_probability: 0.8
+        abbreviated_probability: 0.1
+        sample_probability: 0.1
+        # Unit #1 and Unit No. 1
+        add_number_phrase: true
+        add_number_phrase_probability: 0.6
+    alphanumeric: &unit_alphanumeric
         # Many unit types that apply only in Australia
         # For most English-speaking countries, only use the terms defined above
         sample: false
-        use_number_phrase: true
-        use_direction_phrase: true
         default: *flat
-        alternates:
-            - *apartment
-            - *unit
-            - *door
-            - *room
-            - *office
-            - *penthouse
-            - *lot
+        probability: 0.6
+        alternatives:
+            - alternative: *unit
+              probability: 0.25
+            - alternative: *door
+              probability: 0.04
+            - alternative: *penthouse
+              probability: 0.01
+            - alternative: *apartment
+              probability: 0.1
+        numeric_probability: 0.8  # e.g. Flat 1
+        numeric_plus_alpha_probability: 0.05  # e.g. 1A
+        alpha_plus_numeric_probability: 0.05  # e.g. A1
+        alpha_probability: 0.1  # e.g. Flat A
+
+        # Separate random probability for adding directions like 2L, 2R, etc.
+        add_direction: true
+        add_direction_probability: 0.1
+        add_direction_numeric_only: true  # Only for numbers
+
+    zone:
+        residential: *unit_alphanumeric
+        commercial:
+            default: *suite
+            probability: 0.8
+            alternatives:
+                - alternative: *office
+                  probability: 0.2
+        industrial:
+            default: *lot
+            probability: 0.7
+            alternatives:
+                - alternative: *suite
+                  probability: 0.15
+                - alternative: *unit
+                  probability: 0.1
+                - alternative: *parcel
+                  probability: 0.05
+        university:
+            default: *room
+    allotments:
+        lot:
+            default: *lot
+            numeric_probability: 0.8
+            alphanumeric_probability: 0.1
+            alpha_probability: 0.1
+        parcel:
+            default: *parcel
+            numeric_probability: 0.3
+            alphanumeric_probability: 0.3
+            alpha_probability: 0.4
+        lot_probability: 0.9
+        parcel_probability: 0.06
+        lot_plus_parcel_probability: 0.02
+        parcel_plus_lot_probability: 0.02
+
+    # For unit types like 2/34
+    combined:
+        component: house_number
+        direction: left
+
+    directional:
+        modifier:
+            direction: right  # e.g. 1
+            numeric_probability: 0.1
+            numeric_affix_probability: 0.9
+            alternatives:
+                - alternative: *right
+                - alternative: *left
+                - alternative: *rear
+                - alternative: *front
+
+    standalone:
+        sample: false
+        default: *penthouse
+        probability: 0.4
+        alternatives:
+            - alternative: *top_right
+              probability: 0.15
+            - alternative: *top_left
+              probability: 0.15
+            - alternative: *top_floor_left
+              probability: 0.15
+            - alternative: *top_floor_right
+              probability: 0.15
+
+    # If no unit number is specified
+    alphanumeric_probability: 0.75
+    standalone_probability: 0.15
+    combined_probability: 0.1

 # Country-specific overrides
 # ==========================
@@ -355,32 +876,126 @@ unit_types:
 countries:
     # United States
     us:
-        level:
-            ground_floor:
-                number: 1
-        unit_types:
-            numeric:
+        levels:
+            storey: &story
+                canonical: story
+                numeric:
+                    direction: left
+                ordinal:
+                    direction: right
+                numeric_probability: 0.1  # e.g. Story 2, less common
+                ordinal_probability: 0.9  # e.g. 2nd Story, more common
+            alphanumeric:
+                default: *floor
+                probability: 0.6
+                alternatives:
+                    - alternative: *level
+                      probability: 0.3
+                    - alternative: *platform
+                      probability: 0.05
+                    - alternative: *story
+                      probability: 0.05
+            aliases:
+                "1":
+                    default: *floor
+                    probability: 0.6
+                    alternatives:
+                        - alternative: *ground_floor
+                          probability: 0.3
+                        - alternative: *upper_ground_floor
+                          probability: 0.1
+                "2":
+                    # Most of the time just say 2nd Floor
+                    default: *floor
+                    probability: 0.8
+                    alternatives:
+                        - alternative: *upstairs
+                          probability: 0.1
+        po_boxes:
+            concatenate_postcode:
+                po_box_max_digits: 4  # For PO boxes with max n digits
+                direction: left  # Concatenate on the left side of the PO box
+                postcode_digits:
+                    length: 2  # use this many digits from the postal code
+                    direction: right
+            concatenate_postcode_probability: 0.01
+        postcodes:
+            concatenate_po_box:
+                append:
+                    separator: "-"  # Use a hyphen separator
+                    direction: right  # To the right of the postcode
+                digits:
+                    length: 4  # number of digits to append to the ZIP code
+                    pad:
+                        direction: left  # left pad
+                        character: "0"  # pad with 0s, e.g. for PO Box 52, use -0052
+            concatenate_po_box_probability: 0.1
+        units: &us_unit_types
+            alphanumeric: &us_units_alphanumeric
                 default: *apartment
-                country_alternates:
-                    - *flat
+                probability: 0.6
+                alternatives:
+                    - alternative: *unit
+                      probability: 0.3
+                    - alternative: *door
+                      probability: 0.02
+                    - alternative: *penthouse
+                      probability: 0.07
+                    - alternative: *flat
+                      probability: 0.01  # See this e.g. in Milwaukee with Polish flats
+
+            zone:
+                residential: *us_units_alphanumeric
+                commercial:
+                    default: *office
+                    probability: 0.5
+                    alternatives:
+                        - alternative: *suite  # Suite is much more common in the US and Canada
+                          probability: 0.5
+                industrial:
+                    default: *lot
+                    probability: 0.6
+                university:
+                    default: *room
+
     # Canada
     # Note: this is Canadian English only. If the address is in French it will use the French config
     ca:
-        level:
-            ground_floor:
-                number: 1
-        unit_types:
-            numeric:
-                default: *apartment
-                country_alternates:
-                    - *flat
+        levels:
+            aliases:
+                "1":
+                    default: *floor
+                    probability: 0.6
+                    alternatives:
+                        - alternative: *ground_floor
+                          probability: 0.3
+                        - alternative: *upper_ground_floor
+                          probability: 0.1
+                "2":
+                    # Most of the time just say 2nd Floor
+                    default: *floor
+                    probability: 0.8
+                    alternatives:
+                        - alternative: *upstairs
+                          probability: 0.1
+        # For (English-speaking) Canada, use the same unit types as in the US
+        units: *us_unit_types
     # Australia
     au:
-        po_box:
-            numeric:
+        po_boxes: &australia_po_boxes
+            alphanumeric:
                 # Australia has many strings for this e.g. Roadside Mail Bag
                 sample: true
-        unit_types:
-            numeric:
+                sample_probability: 0.05
+        units: &australia_unit_types
+            alphanumeric:
                 # Australia has all kinds of unit types (e.g. Marine Berth) not used elsewhere
                 sample: true
+                sample_probability: 0.2
+            standalone:
+                sample: true
+                sample_probability: 0.2
+    # New Zealand - same rules as Australia
+    nz:
+        po_boxes: *australia_po_boxes
+        units: *australia_unit_types