Files
2016-07-21 17:04:57 -04:00

954 lines
28 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# sr.yaml
# -------
# Serbian language specification
#
# Phrases in this file are written in Cyrillic; Latin-script counterparts
# are defined alongside them under keys/anchors ending in `_latin`.

# The 30 letters of the Serbian Cyrillic alphabet, in alphabetical order.
alphabet: абвгдђежзијклљмнњопрстћуфхцчџш
# NOTE(review): presumably the weight given to alphanumeric (rather than
# purely numeric) values when sampling - confirm against the consumer.
alphanumeric_probability: 0.7
# Per-component weights, plus the ways several components can be fused
# into a single token labelled `house_number`.
components:
  # For each component, null_probability + alphanumeric_probability = 1.0.
  level:
    null_probability: 0.8
    alphanumeric_probability: 0.2
  staircase:
    null_probability: 0.99
    alphanumeric_probability: 0.01
  entrance:
    null_probability: 0.999
    alphanumeric_probability: 0.001
  unit:
    null_probability: 0.7
    alphanumeric_probability: 0.3

  # NOTE(review): `combinations` is placed under `components`; confirm
  # against the consumer's lookup path.
  # Each entry lists the components joined, the label of the result, the
  # separator choices (weights sum to 1.0) and the entry's own weight.
  combinations:
    - components: [house_number, staircase, level, unit]
      label: house_number
      separators:
        - separator: '/'
          probability: 0.95
        - separator: '-'
          probability: 0.05
      probability: 0.005

    - components: [house_number, level, unit]
      label: house_number
      separators:
        - separator: '/'
          probability: 0.95
        - separator: '-'
          probability: 0.05
      probability: 0.005

    - components: [house_number, level]
      label: house_number
      separators:
        - separator: '/'
          probability: 0.95
        - separator: '-'
          probability: 0.05
      probability: 0.1

    # For unit types like 2/34
    - components: [house_number, unit]
      label: house_number
      separators:
        - separator: '/'
          probability: 0.95
        - separator: '-'
          probability: 0.05
      probability: 0.005
# The word for "number" (број / broj), its abbreviation, and the affix
# form ("бр." / "br.") placed to the left of the numeral.
# NOTE(review): unlike `and`, neither the default nor the alternative
# below carries a `probability` key - confirm the consumer tolerates this.
numbers:
  default: &broj
    canonical: број
    abbreviated: бр
    sample: true
    # canonical + abbreviated + sample = 1.0
    canonical_probability: 0.3
    abbreviated_probability: 0.6
    sample_probability: 0.1
    numeric:
      direction: left
    numeric_affix:
      affix: 'бр.'
      direction: left
    # numeric + numeric_affix = 1.0
    numeric_probability: 0.4
    numeric_affix_probability: 0.6

  alternatives:
    # Latin-script twin of the default above.
    - alternative: &broj_latin
        canonical: broj
        abbreviated: br
        sample: true
        canonical_probability: 0.3
        abbreviated_probability: 0.6
        sample_probability: 0.1
        numeric:
          direction: left
        numeric_affix:
          affix: 'br.'
          direction: left
        numeric_probability: 0.4
        numeric_affix_probability: 0.6
# The conjunction "and": Cyrillic default (weight 0.9) with a Latin-script
# alternative (weight 0.1). The anchors &i / &i_latin are reused elsewhere
# in this file.
and:
  default: &i
    canonical: и
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  probability: 0.9

  alternatives:
    - alternative: &i_latin
        canonical: i
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
      probability: 0.1
# Phrases that join two street names, followed by the weighted choice
# among them (`intersection`) and the "between" phrases (`between`).
cross_streets:
  # Reuse the "and" phrases (anchors &i / &i_latin).
  i: *i
  i_latin: *i_latin

  at: &na
    canonical: на
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  na_latin: &na_latin
    canonical: na
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2

  corner: &ugao
    canonical: угао
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  ugao_latin: &ugao_latin
    canonical: ugao
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2

  na_uglu: &na_uglu
    canonical: на углу
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  na_uglu_latin: &na_uglu_latin
    canonical: na uglu
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2

  # Default weight plus all alternative weights sum to 1.0.
  intersection:
    default: *i
    probability: 0.65
    alternatives:
      - alternative: *i_latin
        probability: 0.05
      - alternative: *na
        probability: 0.075
      - alternative: *na_latin
        probability: 0.025
      - alternative: *ugao
        probability: 0.1
      - alternative: *ugao_latin
        probability: 0.05
      - alternative: *na_uglu
        probability: 0.025
      - alternative: *na_uglu_latin
        probability: 0.025

  izmedu: &izmedu
    canonical: између
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    parentheses_probability: 0.5
  izmedu_latin: &izmedu_latin
    canonical: između
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    parentheses_probability: 0.5

  # Cyrillic default 0.9 + Latin alternative 0.1 = 1.0.
  between:
    default: *izmedu
    probability: 0.9
    alternatives:
      - alternative: *izmedu_latin
        probability: 0.1
# Floor/level phrases. Each numbered phrase may be rendered with a plain
# numeral (`numeric`) or an ordinal (`ordinal`); the two selection weights
# at the end of every phrase sum to 1.0 unless a standalone weight is
# also present.
levels:
  sprat: &sprat
    canonical: спрат
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6

  sprat_latin: &sprat_latin
    canonical: sprat
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6

  kat: &kat
    canonical: кат
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6

  kat_latin: &kat_latin
    canonical: kat
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6

  etaza: &etaza
    canonical: етажа
    abbreviated: ет
    sample: true
    # canonical + abbreviated + sample = 1.0
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6

  etaza_latin: &etaza_latin
    canonical: etaža
    abbreviated: et
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.3
        roman_numeral_probability: 0.7
      add_number_phrase: true
      add_number_phrase_probability: 0.1
    numeric_probability: 0.4
    ordinal_probability: 0.6

  # Ground-floor phrases: no numeric/ordinal forms are defined for these.
  prizemlje: &prizemlje
    canonical: приземље
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
  prizemlje_latin: &prizemlje_latin
    canonical: prizemlje
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
  parter: &parter
    canonical: партер
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
  parter_latin: &parter_latin
    canonical: parter
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1

  # Basement phrases: standalone + numeric + ordinal weights = 1.0.
  podrum: &podrum
    canonical: подрум
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3
    # e.g. подрум 1
    numeric:
      direction: left
      direction_probability: 0.8
    # e.g. 1. подрум
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
    standalone_probability: 0.99
    number_abs_value: true
    number_min_abs_value: 1
    numeric_probability: 0.005
    ordinal_probability: 0.005

  podrum_latin: &podrum_latin
    canonical: podrum
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3
    # e.g. podrum 1
    numeric:
      direction: left
      direction_probability: 0.8
    # e.g. 1. podrum
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.7
        roman_numeral_probability: 0.3
    standalone_probability: 0.99
    number_abs_value: true
    number_min_abs_value: 1
    numeric_probability: 0.005
    ordinal_probability: 0.005

  # Phrases used for specific level numbers. Keys are strings, so they
  # stay quoted. Weights within each alias sum to 1.0.
  aliases:
    '<-1':
      default: *podrum
      probability: 0.8
      alternatives:
        - alternative: *podrum_latin
          probability: 0.2
    '-1':
      default: *podrum
      probability: 0.8
      alternatives:
        - alternative: *podrum_latin
          probability: 0.2
    '0':
      default: *prizemlje
      probability: 0.45
      alternatives:
        - alternative: *prizemlje_latin
          probability: 0.05
        - alternative: *parter
          probability: 0.35
        - alternative: *parter_latin
          probability: 0.05
        - alternative: *sprat
          probability: 0.04
        - alternative: *sprat_latin
          probability: 0.01
        - alternative: *kat
          probability: 0.04
        - alternative: *kat_latin
          probability: 0.01

  numbering_starts_at: 0

  # Generic numbered-floor phrase choice (weights sum to 1.0), followed
  # by the weights for the kind of value generated (also sum to 1.0).
  alphanumeric:
    default: *sprat
    probability: 0.65
    alternatives:
      - alternative: *sprat_latin
        probability: 0.1
      - alternative: *kat
        probability: 0.15
      - alternative: *kat_latin
        probability: 0.05
      - alternative: *etaza
        probability: 0.04
      - alternative: *etaza_latin
        probability: 0.01
    numeric_probability: 0.69  # With this probability, pick an integer
    roman_numeral_probability: 0.3  # Pick a Roman numeral for the actual value
    alpha_probability: 0.0098  # With this probability, pick a letter e.g. A
    numeric_plus_alpha_probability: 0.0001  # e.g. 2A
    alpha_plus_numeric_probability: 0.0001  # e.g. A2
# Relative directions (right / left) in Cyrillic and Latin script, and
# the weighted choice among them (weights sum to 1.0).
directions:
  right: &desno
    canonical: десно
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  desno_latin: &desno_latin
    canonical: desno
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right

  left: &levo
    canonical: лево
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  levo_latin: &levo_latin
    canonical: levo
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right

  alternatives:
    - alternative: *desno
      probability: 0.45
    - alternative: *desno_latin
      probability: 0.05
    - alternative: *levo
      probability: 0.45
    - alternative: *levo_latin
      probability: 0.05
# Compass directions in Cyrillic and Latin script. Each has a one-letter
# abbreviation that doubles as its numeric affix; the `alternatives`
# weights at the end sum to 1.0.
cardinal_directions:
  east: &istok
    canonical: исток
    abbreviated: и
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: и
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5
  istok_latin: &istok_latin
    canonical: istok
    abbreviated: i
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: i
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  west: &zapad
    canonical: запад
    abbreviated: з
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: з
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5
  zapad_latin: &zapad_latin
    canonical: zapad
    abbreviated: z
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: z
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  north: &sever
    canonical: север
    abbreviated: с
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: с
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5
  sever_latin: &sever_latin
    canonical: sever
    abbreviated: s
    canonical_probability: 0.95
    abbreviated_probability: 0.05
    numeric:
      direction: right
    numeric_affix:
      affix: s
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  # Only the south entries set `sample: true` with a sample weight.
  south: &jug
    canonical: југ
    abbreviated: ј
    sample: true
    canonical_probability: 0.75
    abbreviated_probability: 0.1
    sample_probability: 0.15
    numeric:
      direction: right
    numeric_affix:
      affix: ј
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5
  jug_latin: &jug_latin
    canonical: jug
    abbreviated: j
    sample: true
    canonical_probability: 0.75
    abbreviated_probability: 0.1
    sample_probability: 0.15
    numeric:
      direction: right
    numeric_affix:
      affix: j
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  alternatives:
    - alternative: *sever
      probability: 0.23
    - alternative: *sever_latin
      probability: 0.02
    - alternative: *istok
      probability: 0.23
    - alternative: *istok_latin
      probability: 0.02
    - alternative: *jug
      probability: 0.23
    - alternative: *jug_latin
      probability: 0.02
    - alternative: *zapad
      probability: 0.23
    - alternative: *zapad_latin
      probability: 0.02
# Entrance phrase (улаз / ulaz), placed to the left of its identifier.
entrances:
  ulaz: &ulaz
    canonical: улаз
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
  ulaz_latin: &ulaz_latin
    canonical: ulaz
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left

  # Ulaz 1, Ulaz A, etc.
  # Phrase weights (0.8 + 0.2) and value-kind weights each sum to 1.0.
  alphanumeric: &entrance_alphanumeric
    default: *ulaz
    probability: 0.8
    alternatives:
      - alternative: *ulaz_latin
        probability: 0.2
    numeric_probability: 0.1  # e.g. Ulaz 1
    alpha_probability: 0.85  # e.g. Ulaz A
    numeric_plus_alpha_probability: 0.025  # e.g. 1A
    alpha_plus_numeric_probability: 0.025  # e.g. A1
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1
# Staircase phrase (степениште / stepenište), placed to the left of its
# identifier, plus an optional directional modifier.
staircases:
  stepeniste: &stepeniste
    canonical: степениште
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
  stepeniste_latin: &stepeniste_latin
    canonical: stepenište
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left

  # Phrase weights (0.8 + 0.2) and value-kind weights each sum to 1.0.
  alphanumeric: &staircase_alphanumeric
    default: *stepeniste
    probability: 0.8
    alternatives:
      - alternative: *stepeniste_latin
        probability: 0.2
    numeric_probability: 0.75
    alpha_probability: 0.2
    numeric_plus_alpha_probability: 0.025
    alpha_plus_numeric_probability: 0.025
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1

  # NOTE(review): `directional` is placed as a sibling of `alphanumeric`;
  # confirm against the consumer's lookup path.
  # Modifier weights sum to 1.0.
  directional:
    direction: right
    direction_probability: 0.85
    modifier:
      alternatives:
        - alternative: *desno
          probability: 0.19
        - alternative: *desno_latin
          probability: 0.01
        - alternative: *levo
          probability: 0.19
        - alternative: *levo_latin
          probability: 0.01
        - alternative: *sever
          probability: 0.14
        - alternative: *sever_latin
          probability: 0.01
        - alternative: *jug
          probability: 0.14
        - alternative: *jug_latin
          probability: 0.01
        - alternative: *istok
          probability: 0.14
        - alternative: *istok_latin
          probability: 0.01
        - alternative: *zapad
          probability: 0.14
        - alternative: *zapad_latin
          probability: 0.01
# Post-office-box phrases in Cyrillic and Latin script. Every phrase puts
# the box number to its right (`direction: left`) and may insert the
# "number" phrase before it, e.g. "poštanski fah br. 1234".
po_boxes:
  postanski_fah: &postanski_fah
    canonical: поштански фах
    abbreviated: пф
    sample: true
    # canonical + abbreviated + sample = 1.0
    canonical_probability: 0.3
    abbreviated_probability: 0.4
    sample_probability: 0.3
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2  # poštanski fah br. 1234
  postanski_fah_latin: &postanski_fah_latin
    canonical: poštanski fah
    abbreviated: pf
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.4
    sample_probability: 0.3
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2  # poštanski fah br. 1234

  postanski_pretinac: &postanski_pretinac
    canonical: поштански претинац
    sample: true
    # Was 0.6 / 0.5 (sum 1.1). Aligned with the Latin twin below so that
    # canonical + sample = 1.0.
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2
  postanski_pretinac_latin: &postanski_pretinac_latin
    canonical: poštanski pretinac
    sample: true
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2

  postanski_pregradak: &postanski_pregradak
    canonical: поштански преградак
    sample: true
    # Was 0.6 / 0.5 (sum 1.1). Aligned with the Latin twin below so that
    # canonical + sample = 1.0.
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2
  postanski_pregradak_latin: &postanski_pregradak_latin
    canonical: poštanski pregradak
    sample: true
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.2

  # Phrase weights, value-kind weights and digit-length weights each sum
  # to 1.0.
  alphanumeric:
    default: *postanski_fah
    probability: 0.7
    alternatives:
      - alternative: *postanski_fah_latin
        probability: 0.05
      - alternative: *postanski_pretinac
        probability: 0.1
      - alternative: *postanski_pretinac_latin
        probability: 0.05
      - alternative: *postanski_pregradak
        probability: 0.075
      - alternative: *postanski_pregradak_latin
        probability: 0.025
    numeric_probability: 0.9  # pf 123
    alpha_probability: 0.05  # pf A
    numeric_plus_alpha_probability: 0.04  # pf 123G
    alpha_plus_numeric_probability: 0.01  # pf A123
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1
    # NOTE(review): `digits` is placed under `alphanumeric`; confirm
    # against the consumer's lookup path.
    digits:
      - length: 1
        probability: 0.05
      - length: 2
        probability: 0.1
      - length: 3
        probability: 0.2
      - length: 4
        probability: 0.5
      - length: 5
        probability: 0.1
      - length: 6
        probability: 0.05
# Unit phrases (flat, apartment, room, office) in Cyrillic and Latin
# script. Every phrase puts the unit number to its right and may insert
# the "number" phrase before it.
units:
  stan: &stan
    canonical: стан
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  stan_latin: &stan_latin
    canonical: stan
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1

  apartman: &apartman
    canonical: апартман
    abbreviated: ап
    sample: true
    # canonical + abbreviated + sample = 1.0
    canonical_probability: 0.4
    abbreviated_probability: 0.2
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  apartman_latin: &apartman_latin
    canonical: apartman
    abbreviated: ap
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.2
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1

  soba: &soba
    canonical: соба
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  soba_latin: &soba_latin
    canonical: soba
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1

  kancelarija: &kancelarija
    canonical: канцеларија
    sample: true
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1
  kancelarija_latin: &kancelarija_latin
    canonical: kancelarija
    sample: true
    canonical_probability: 0.6
    sample_probability: 0.4
    numeric:
      direction: left
      add_number_phrase: true
      add_number_phrase_probability: 0.1

  # General unit phrase choice; phrase weights and value-kind weights
  # each sum to 1.0.
  alphanumeric: &unit_alphanumeric
    default: *stan
    probability: 0.5
    alternatives:
      - alternative: *stan_latin
        probability: 0.1
      - alternative: *apartman
        probability: 0.2
      - alternative: *apartman_latin
        probability: 0.05
      - alternative: *soba
        probability: 0.1
      - alternative: *soba_latin
        probability: 0.05
    numeric_probability: 0.9  # e.g. stan 1
    numeric_plus_alpha_probability: 0.03  # e.g. 1A
    alpha_plus_numeric_probability: 0.03  # e.g. A1
    alpha_probability: 0.04  # e.g. stan A
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1
    # If there are 10 floors, create unit numbers like #301 or #1032
    use_floor_probability: 0.01

  # NOTE(review): `zones` is placed as a sibling of `alphanumeric`;
  # confirm against the consumer's lookup path.
  # Zone-specific phrase choices; weights within each zone sum to 1.0.
  zones:
    commercial: &commercial_unit_types
      default: *soba
      probability: 0.55
      alternatives:
        - alternative: *soba_latin
          probability: 0.05
        - alternative: *kancelarija
          probability: 0.35
        - alternative: *kancelarija_latin
          probability: 0.05
      numeric_probability: 0.95  # e.g. soba 1
      numeric_plus_alpha_probability: 0.01  # e.g. soba 1A
      alpha_plus_numeric_probability: 0.01  # e.g. soba A1
      alpha_probability: 0.03  # e.g. soba A
      alpha_plus_numeric:
        whitespace_probability: 0.1
      numeric_plus_alpha:
        whitespace_probability: 0.1

    university:
      default: *soba
      probability: 0.9
      alternatives:
        - alternative: *soba_latin
          probability: 0.1
      numeric_probability: 0.95  # e.g. soba 1
      numeric_plus_alpha_probability: 0.01  # e.g. soba 1A
      alpha_plus_numeric_probability: 0.01  # e.g. soba A1
      alpha_probability: 0.03  # e.g. soba A
      alpha_plus_numeric:
        whitespace_probability: 0.1
      numeric_plus_alpha:
        whitespace_probability: 0.1