# eu.yaml
# -------
# Basque language specification

# Per-component probabilities (level, staircase, entrance, unit) and the
# rule for combining level + unit into a single "unit" label.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; verify against the consumer's schema.
components:
  level:
    # If no floor number is specified
    null_probability: 0.8
    alphanumeric_probability: 0.2

  staircase:
    null_probability: 0.99
    alphanumeric_probability: 0.01

  entrance:
    null_probability: 0.999
    alphanumeric_probability: 0.001

  unit:
    # If no unit number is specified
    null_probability: 0.4
    alphanumeric_probability: 0.6

  combinations:
    - components:
        - level
        - unit
      label: unit
      # Separator placed between the combined components
      separators:
        - separator: "-"
          probability: 0.85
        - separator: "/"
          probability: 0.15
      probability: 0.7

# Conjunction phrase. The mapping is anchored as &eta so other sections can
# alias it (cross_streets does).
and:
  default: &eta
    canonical: eta
    # Quoted because a leading "&" would otherwise be read as an anchor
    abbreviated: "&"
    sample: true
    canonical_probability: 0.5
    abbreviated_probability: 0.4
    sample_probability: 0.1

# Phrases used when an address has no house number.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped. In particular, confirm whether no_number_probability belongs
# here (sibling of no_number) or inside no_number.
house_numbers:
  # zenbakirik gabe (zk.g) addresses
  no_number:
    default:
      canonical: zenbakirik gabe
      abbreviated: zk.g
      sample: true
      canonical_probability: 0.1
      abbreviated_probability: 0.6
      sample_probability: 0.3

  # With this probability, use "zenbakirik gabe" if no house_number is specified
  no_number_probability: 0.1

# Floor/level phrases, ground-floor aliases for level "0", and the
# alphanumeric level settings.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; verify against the consumer's schema.
levels:
  floor: &solairua
    canonical: solairua
    abbreviated: sol
    sample: true
    canonical_probability: 0.5
    abbreviated_probability: 0.3
    sample_probability: 0.2
    numeric:
      direction: left
    # e.g. 2. solairua
    ordinal:
      direction: right
    numeric_probability: 0.25
    ordinal_probability: 0.75

  # Ground floor
  beheko_solairua: &beheko_solairua
    canonical: beheko solairua
    abbreviated: beheko sol
    sample: true
    canonical_probability: 0.6
    abbreviated_probability: 0.3
    sample_probability: 0.1

  behe_solairua: &behe_solairua
    canonical: behe-solairua
    abbreviated: behe-sol
    sample: true
    canonical_probability: 0.6
    abbreviated_probability: 0.3
    sample_probability: 0.1

  # Phrases that may stand in for a specific level number.
  # The key is quoted so it stays a string rather than the integer 0.
  aliases:
    "0":
      default: *beheko_solairua
      probability: 0.5
      alternatives:
        - alternative: *behe_solairua
          probability: 0.4
        - alternative: *solairua
          probability: 0.1

  numbering_starts_at: 0

  alphanumeric:
    default: *solairua
    numeric_probability: 0.99
    alpha_probability: 0.01

# Building-block phrase ("blokea") with its numeric and ordinal placement.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; numeric/ordinal are assumed to belong to the default phrase.
blocks:
  alphanumeric:
    default:
      canonical: blokea
      abbreviated: bl
      sample: true
      canonical_probability: 0.6
      abbreviated_probability: 0.2
      sample_probability: 0.2
      numeric:
        direction: left
      ordinal:
        direction: right
      numeric_probability: 0.2
      ordinal_probability: 0.8

# "Near"-style phrases for category queries, plus the probability of each
# phrase family.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; verify against the consumer's schema.
categories:
  near:
    default:
      canonical: gertu

  nearby:
    default:
      canonical: gertuko
    probability: 0.7
    alternatives:
      - alternative:
          canonical: hemen gertu
        probability: 0.2
      - alternative:
          canonical: hemen
        probability: 0.1

  near_me:
    default:
      # NOTE(review): "me gertu" looks like a partial translation of
      # "near me" — confirm the intended Basque phrase.
      canonical: me gertu

  # Probabilities of each phrase
  near_probability: 0.7
  nearby_probability: 0.2
  near_me_probability: 0.1

# Phrases for intersections ("X eta Y", "txoko") and for "between" two
# streets. Aliases the &eta anchor defined under the top-level `and` key.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; verify against the consumer's schema.
cross_streets:
  and: *eta
  txoko: &txoko
    canonical: txoko
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3

  intersection:
    default: *eta
    probability: 0.8
    alternatives:
      - alternative: *txoko
        probability: 0.2

  between:
    canonical: arteko
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    # NOTE(review): the key below is spelled "probabililty". It is kept
    # byte-identical here; confirm which spelling the consumer reads before
    # renaming it, and confirm it belongs under `between`.
    parentheses_probabililty: 0.5

# PO box phrase ("posta-kutxa"), the mix of numeric/alphabetic box
# identifiers, and the distribution of digit lengths for generated numbers.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; verify against the consumer's schema.
po_boxes:
  posta_kutxa: &posta_kutxa
    canonical: posta-kutxa
    abbreviated: p.-ku
    sample: true
    canonical_probability: 0.2
    abbreviated_probability: 0.4
    sample_probability: 0.4
    numeric:
      direction: left
    numeric_probability: 1.0

  alphanumeric:
    sample: false
    default: *posta_kutxa
    numeric_probability: 0.9  # P.-Ku 123
    alpha_probability: 0.05  # P.-Ku A
    numeric_plus_alpha_probability: 0.04  # P.-Ku 123G
    alpha_plus_numeric_probability: 0.01  # P.-Ku A123
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1

  # Probability of each digit count (the listed values sum to 1.0)
  digits:
    - length: 1
      probability: 0.05
    - length: 2
      probability: 0.1
    - length: 3
      probability: 0.2
    - length: 4
      probability: 0.5
    - length: 5
      probability: 0.1
    - length: 6
      probability: 0.05

# Postal-code phrase ("posta-kodea" / "p.-k") and how often it is attached
# to the code.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped. The numeric settings are assumed to belong to the default
# phrase; verify against the consumer's schema.
postcodes:
  alphanumeric:
    default:
      canonical: posta-kodea
      abbreviated: p.-k
      sample: true
      canonical_probability: 0.01
      abbreviated_probability: 0.9
      sample_probability: 0.09

      numeric:
        direction: left

      numeric_affix:
        affix: p.-k.
        direction: left
      # null_probability means the chance of doing nothing e.g. just the postal code
      null_probability: 0.7
      numeric_probability: 0.18
      numeric_affix_probability: 0.12
      strict_numeric: true

# Left/right direction phrases. Each is anchored so that the entrance and
# staircase sections can alias it as a modifier.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; verify against the consumer's schema.
directions:
  right: &eskuina
    canonical: eskuina
    abbreviated: esk
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.4
    sample_probability: 0.3
    numeric:
      direction: right
    numeric_affix:
      affix: esk.
      direction: right
      whitespace_probability: 0.1
    numeric_probability: 0.9
    numeric_affix_probability: 0.1

  left: &ezkerkada
    canonical: ezkerkada
    abbreviated: ezk
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.4
    sample_probability: 0.3
    numeric:
      direction: right
    numeric_affix:
      affix: ezk.
      direction: right
      whitespace_probability: 0.1
    numeric_probability: 0.9
    numeric_affix_probability: 0.1

  ezkerreko: &ezkerreko
    canonical: ezkerreko
    abbreviated: ezk.-ko
    sample: true
    canonical_probability: 0.2
    abbreviated_probability: 0.5
    sample_probability: 0.3
    numeric:
      direction: left

  # Right and left are weighted equally
  alternatives:
    - alternative: *eskuina
      probability: 0.5
    - alternative: *ezkerkada
      probability: 0.5

# Entrance phrase ("sarrera") and the mix of numeric/alphabetic entrance
# identifiers. Aliases &eskuina and &ezkerreko from the directions section.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped. `directional` is assumed to sit inside `alphanumeric`;
# verify against the consumer's schema.
entrances:
  sarrera: &sarrera
    canonical: sarrera
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left

  # Sarrera 1, Sarrera A, etc.
  alphanumeric:
    default: *sarrera
    numeric_probability: 0.1  # e.g. Sarrera 1
    alpha_probability: 0.85  # e.g. Sarrera A
    numeric_plus_alpha_probability: 0.025  # e.g. 1A
    alpha_plus_numeric_probability: 0.025  # e.g. A1

    alpha_plus_numeric:
      whitespace_probability: 0.1

    numeric_plus_alpha:
      whitespace_probability: 0.1

    directional:
      direction: left
      modifier:
        alternatives:
          - alternative: *eskuina
          - alternative: *ezkerreko

# Staircase phrase ("eskailera") and the mix of numeric/alphabetic staircase
# identifiers. Aliases &eskuina and &ezkerreko from the directions section.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped. `directional` is assumed to sit inside `alphanumeric`;
# verify against the consumer's schema.
staircases:
  eskailera: &eskailera
    canonical: eskailera
    abbreviated: eskra
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.4
    sample_probability: 0.3
    numeric:
      direction: left

  alphanumeric:
    # For alphanumerics, Eskra A, Eskra 1, etc.
    default: *eskailera
    numeric_probability: 0.6  # e.g. Eskra 1
    alpha_probability: 0.35  # e.g. Eskra A
    numeric_plus_alpha_probability: 0.025  # e.g. 1A
    alpha_plus_numeric_probability: 0.025  # e.g. A1

    alpha_plus_numeric:
      whitespace_probability: 0.1

    numeric_plus_alpha:
      whitespace_probability: 0.1

    directional:
      direction: left  # e.g. Ezk.-ko Eskra
      modifier:
        alternatives:
          - alternative: *eskuina
          - alternative: *ezkerreko

# Unit (flat/apartment) phrase and the mix of numeric/alphabetic unit
# identifiers, including optional direction suffixes.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped. null_phrase_probability is assumed to belong to `numeric`;
# verify against the consumer's schema.
units:
  flat: &apartamentu
    canonical: apartamentu
    abbreviated: aptu
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.4
    sample_probability: 0.3
    numeric:
      direction: left
      # Often only the identifier is written, without the unit word. The
      # example carried over from the Spanish template is "3o B" for
      # "tercero piso puerta B".
      # NOTE(review): replace with a Basque example.
      null_phrase_probability: 0.15
    ordinal:
      direction: right
    numeric_probability: 0.6
    ordinal_probability: 0.4

  alphanumeric: &unit_alphanumeric
    default: *apartamentu

    # Separate random probability for adding directions like 2. Ezk, 2 Esk, etc.
    add_direction: true
    add_direction_probability: 0.1
    add_direction_numeric: true  # Only for numbers
    add_direction_standalone: true  # A unit can be as simple as "D"

    numeric_probability: 0.7  # e.g. 1
    numeric_plus_alpha_probability: 0.01  # e.g. 1A
    alpha_plus_numeric_probability: 0.01  # e.g. A1
    alpha_probability: 0.28  # e.g. A

    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1