Files
2016-07-31 22:50:48 -04:00

1001 lines
28 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# bg.yaml
# -------
# Bulgarian language specification
# The 30 lowercase letters of the Bulgarian Cyrillic alphabet, in order.
alphabet: абвгдежзийклмнопрстуфхцчшщъьюя
# NOTE(review): presumably the chance that a generated identifier contains
# letters rather than digits only — confirm against the consumer.
alphanumeric_probability: 0.7
# Settings for the secondary address parts (level, staircase, entrance, unit)
# and the ways several parts may be fused into one token that is labelled as
# a house number. For every part below, null_probability and
# alphanumeric_probability add up to 1.0.
components:
  level:
    null_probability: 0.8
    alphanumeric_probability: 0.2
  staircase:
    null_probability: 0.99
    alphanumeric_probability: 0.01
  # Entrance more common in Bulgarian addresses
  entrance:
    null_probability: 0.9
    alphanumeric_probability: 0.1
  unit:
    null_probability: 0.7
    alphanumeric_probability: 0.3
  # Every combination is labelled house_number and joined with "/" (95%) or
  # "-" (5%).
  combinations:
    # house number + staircase + level + unit
    - label: house_number
      components:
        - house_number
        - staircase
        - level
        - unit
      separators:
        - separator: '/'
          probability: 0.95
        - separator: '-'
          probability: 0.05
      probability: 0.005
    # house number + level + unit
    - label: house_number
      components:
        - house_number
        - level
        - unit
      separators:
        - separator: '/'
          probability: 0.95
        - separator: '-'
          probability: 0.05
      probability: 0.005
    # house number + level
    - label: house_number
      components:
        - house_number
        - level
      separators:
        - separator: '/'
          probability: 0.95
        - separator: '-'
          probability: 0.05
      probability: 0.1
    # For unit types like 2/34
    - label: house_number
      components:
        - house_number
        - unit
      separators:
        - separator: '/'
          probability: 0.95
        - separator: '-'
          probability: 0.05
      probability: 0.005
# Phrases meaning "number". The Cyrillic form is the default (95%); the
# Latin transliteration is the only alternative (5%).
numbers:
  default: &nomer
    canonical: номер
    # Must be a non-empty string: it is selected with
    # abbreviated_probability (0.4) below. The numero sign is the customary
    # abbreviation of "номер".
    abbreviated: "№"
    sample: true
    # canonical + abbreviated + sample = 1.0
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    numeric:
      direction: left
  probability: 0.95
  alternatives:
    - alternative: &nomer_latin
        canonical: nomer
        # Quoted so YAML 1.1 loaders do not read it as boolean false.
        abbreviated: "no"
        sample: true
        canonical_probability: 0.4
        abbreviated_probability: 0.4
        sample_probability: 0.2
        numeric:
          direction: left
      probability: 0.05
# House-number phrases; reuses the "number" phrases through their anchors
# (Cyrillic 95%, Latin 5%).
house_numbers:
  alphanumeric:
    default: *nomer
    probability: 0.95
    alternatives:
      - alternative: *nomer_latin
        probability: 0.05
  alphanumeric_phrase_probability: 0.2
# The conjunction "and": Cyrillic "и" by default (90%), Latin "i" otherwise
# (10%). Both phrases are anchored for reuse elsewhere in the file.
and:
  default: &i
    canonical: и
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  probability: 0.9
  alternatives:
    - alternative: &i_latin
        canonical: i
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
      probability: 0.1
# Connective phrases for cross streets. Named phrase entries come first;
# `intersection` and `between` then weight them through aliases.
cross_streets:
  i: *i
  i_latin: *i_latin

  # "Corner" phrases, each with a Latin transliteration.
  corner: &ugul
    canonical: ъгъл
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  ugul_latin: &ugul_latin
    canonical: ŭgŭl
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  ugul_na: &ugul_na
    canonical: ъгъл на
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  ugul_na_latin: &ugul_na_latin
    canonical: ŭgŭl na
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  na_ugula_na: &na_ugula_na
    canonical: на ъгъла на
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  na_ugula_na_latin: &na_ugula_na_latin
    canonical: na ŭgŭla na
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2

  # Default plus alternatives add up to 1.0.
  intersection:
    default: *i
    probability: 0.65
    alternatives:
      - alternative: *i_latin
        probability: 0.05
      - alternative: *ugul_na
        probability: 0.075
      - alternative: *ugul_na_latin
        probability: 0.075
      - alternative: *ugul
        probability: 0.05
      - alternative: *ugul_latin
        probability: 0.05
      - alternative: *na_ugula_na
        probability: 0.025
      - alternative: *na_ugula_na_latin
        probability: 0.025

  # "Between" phrases; each carries its own parentheses_probability.
  mezhdu: &mezhdu
    canonical: между
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    parentheses_probability: 0.5
  mezhdu_latin: &mezhdu_latin
    canonical: mezhdu
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    parentheses_probability: 0.5

  between:
    default: *mezhdu
    probability: 0.9
    alternatives:
      - alternative: *mezhdu_latin
        probability: 0.1
# Floor/level phrases. Every Cyrillic entry has a Latin transliteration
# stored under the same key with a `_latin` suffix and identical weights.
levels:
  etazh: &etazh
    # Bulgarian spelling with an initial "е". The letter "э" is not part of
    # the alphabet declared at the top of this file, and `prizemen_etazh`
    # below spells the same word "етаж".
    canonical: етаж
    abbreviated: ет
    sample: true
    # canonical + abbreviated + sample = 1.0
    canonical_probability: 0.4
    abbreviated_probability: 0.3
    sample_probability: 0.3
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.8
        roman_numeral_probability: 0.1
        spellout_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.5
        roman_numeral_probability: 0.3
        spellout_probability: 0.2
    # numeric + ordinal = 1.0
    numeric_probability: 0.4
    ordinal_probability: 0.6
  etazh_latin: &etazh_latin
    canonical: etazh
    abbreviated: et
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.3
    sample_probability: 0.3
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.8
        roman_numeral_probability: 0.1
        spellout_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.5
        roman_numeral_probability: 0.3
        spellout_probability: 0.2
    numeric_probability: 0.4
    ordinal_probability: 0.6

  # `kat` / `kat_latin` are defined but not referenced by the `alphanumeric`
  # choices at the end of this section.
  kat: &kat
    canonical: кат
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.8
        roman_numeral_probability: 0.1
        spellout_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.5
        roman_numeral_probability: 0.3
        spellout_probability: 0.2
    numeric_probability: 0.4
    ordinal_probability: 0.6
  kat_latin: &kat_latin
    canonical: kat
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.8
        roman_numeral_probability: 0.1
        spellout_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.5
        roman_numeral_probability: 0.3
        spellout_probability: 0.2
    numeric_probability: 0.4
    ordinal_probability: 0.6

  nivo: &nivo
    canonical: ниво
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.8
        roman_numeral_probability: 0.1
        spellout_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.5
        roman_numeral_probability: 0.3
        spellout_probability: 0.2
    numeric_probability: 0.4
    ordinal_probability: 0.6
  nivo_latin: &nivo_latin
    canonical: nivo
    sample: true
    canonical_probability: 0.7
    sample_probability: 0.3
    numeric:
      direction: left
      direction_probability: 0.9
      digits:
        ascii_probability: 0.8
        roman_numeral_probability: 0.1
        spellout_probability: 0.1
    ordinal:
      direction: right
      digits:
        ascii_probability: 0.5
        roman_numeral_probability: 0.3
        spellout_probability: 0.2
    numeric_probability: 0.4
    ordinal_probability: 0.6

  # Phrases used for floor "0" (see `aliases`); they carry no number.
  prizemen_etazh: &prizemen_etazh
    canonical: приземен етаж
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  prizemen_etazh_latin: &prizemen_etazh_latin
    canonical: prizemen etazh
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
  parter: &parter
    canonical: партер
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1
  parter_latin: &parter_latin
    canonical: parter
    sample: true
    canonical_probability: 0.9
    sample_probability: 0.1

  # Phrases used for negative floors (see `aliases`).
  # standalone + numeric + ordinal = 1.0, almost always standalone.
  suteren: &suteren
    canonical: сутерен
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
    ordinal:
      direction: right
    number_abs_value: true
    number_min_abs_value: 1
    standalone_probability: 0.985
    numeric_probability: 0.01
    ordinal_probability: 0.005
  suteren_latin: &suteren_latin
    canonical: suteren
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
      direction_probability: 0.9
    ordinal:
      direction: right
    number_abs_value: true
    number_min_abs_value: 1
    standalone_probability: 0.985
    numeric_probability: 0.01
    ordinal_probability: 0.005

  # Special floor numbers mapped to dedicated phrases. Keys are quoted so
  # they stay strings.
  aliases:
    "<-1":
      default: *suteren
      probability: 0.9
      alternatives:
        - alternative: *suteren_latin
          probability: 0.1
    "-1":
      default: *suteren
      probability: 0.9
      alternatives:
        - alternative: *suteren_latin
          probability: 0.1
    "0":
      default: *prizemen_etazh
      probability: 0.7
      alternatives:
        - alternative: *prizemen_etazh_latin
          probability: 0.05
        - alternative: *parter
          probability: 0.2
        - alternative: *parter_latin
          probability: 0.05

  numbering_starts_at: 0

  # Phrase choice (default + alternatives = 1.0) and the form of the floor
  # identifier (the five *_probability values below add up to 1.0).
  alphanumeric:
    default: *etazh
    probability: 0.8
    alternatives:
      - alternative: *etazh_latin
        probability: 0.1
      - alternative: *nivo
        probability: 0.09
      - alternative: *nivo_latin
        probability: 0.01
    numeric_probability: 0.79  # With this probability, pick an integer
    roman_numeral_probability: 0.2
    alpha_probability: 0.0098  # With this probability, pick a letter e.g. A
    numeric_plus_alpha_probability: 0.0001  # e.g. 2A
    alpha_plus_numeric_probability: 0.0001  # e.g. A2
# Apartment-block phrases: Cyrillic "блок"/"бл" by default (95%), Latin
# "blok"/"bl" as the only alternative (5%).
blocks:
  alphanumeric:
    default: &blok
      canonical: блок
      abbreviated: бл
      sample: true
      # canonical + abbreviated + sample = 1.0
      canonical_probability: 0.4
      abbreviated_probability: 0.4
      sample_probability: 0.2
      numeric:
        direction: left
    probability: 0.95
    alternatives:
      - alternative: &blok_latin
          canonical: blok
          abbreviated: bl
          sample: true
          canonical_probability: 0.4
          abbreviated_probability: 0.4
          sample_probability: 0.2
          numeric:
            direction: left
        probability: 0.05
# Prepositional phrases for category queries. In each group the default and
# its alternatives add up to 1.0, and Cyrillic phrases are followed by their
# Latin transliterations.
categories:
  near:
    default:
      canonical: в близост до
      sample: true
      canonical_probability: 0.8
      sample_probability: 0.2
    probability: 0.69
    alternatives:
      - alternative:
          canonical: v blizost do
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01
      - alternative:
          canonical: близо до
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.09
      - alternative:
          canonical: blizo do
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01
      - alternative:
          canonical: около
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.09
      - alternative:
          canonical: okolo
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01
      - alternative:
          canonical: в района на
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.09
      - alternative:
          canonical: v raĭona na
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01

  nearby:
    default:
      canonical: наблизо
      sample: true
      canonical_probability: 0.8
      sample_probability: 0.2
    probability: 0.64
    alternatives:
      - alternative:
          canonical: nablizo
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01
      - alternative:
          canonical: в близост до тук
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.19
      - alternative:
          canonical: v blizost do tuk
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01
      - alternative:
          canonical: тук
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.09
      - alternative:
          canonical: tuk
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01
      - alternative:
          canonical: по целия тук
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.04
      - alternative:
          canonical: po tseliya tuk
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01

  near_me:
    default:
      canonical: близо до мен
      sample: true
      canonical_probability: 0.8
      sample_probability: 0.2
    probability: 0.99
    alternatives:
      - alternative:
          canonical: blizo do men
          sample: true
          canonical_probability: 0.8
          sample_probability: 0.2
        probability: 0.01

  in:
    default:
      canonical: в
    probability: 0.99
    alternatives:
      - alternative:
          canonical: v
        probability: 0.01

  # Probabilities of each phrase (the four values add up to 1.0)
  near_probability: 0.35
  nearby_probability: 0.2
  near_me_probability: 0.1
  in_probability: 0.35
# Relative direction phrases (Cyrillic and Latin pairs), anchored for reuse
# by other sections. In every phrase the number goes on the right.
directions:
  dyasno: &dyasno
    canonical: дясно
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  dyasno_latin: &dyasno_latin
    canonical: dyasno
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  lyavo: &lyavo
    canonical: ляво
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
  lyavo_latin: &lyavo_latin
    canonical: lyavo
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right

  # Weights add up to 1.0; the two words are equally likely and each Latin
  # form takes 1%.
  alternatives:
    - alternative: *dyasno
      probability: 0.49
    - alternative: *dyasno_latin
      probability: 0.01
    - alternative: *lyavo
      probability: 0.49
    - alternative: *lyavo_latin
      probability: 0.01
# Cardinal direction phrases (Cyrillic and Latin pairs), anchored for reuse
# by other sections. All eight entries share one shape: canonical form only
# (0.8 canonical / 0.2 sample), the number on the right, and a one-letter
# affix that is the first letter of the canonical word, with numeric and
# numeric_affix equally likely.
cardinal_directions:
  istok: &istok
    canonical: изток
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: и
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5
  istok_latin: &istok_latin
    canonical: istok
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: i
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  zapad: &zapad
    canonical: запад
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: з
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5
  zapad_latin: &zapad_latin
    canonical: zapad
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: z
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  sever: &sever
    canonical: север
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: с
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5
  sever_latin: &sever_latin
    canonical: sever
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: s
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  yug: &yug
    canonical: юг
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      affix: ю
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5
  # Mirrors `yug`: no abbreviated form and the same 0.8 / 0.2 split as every
  # other entry in this section.
  yug_latin: &yug_latin
    canonical: yug
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: right
    numeric_affix:
      # Quoted: a bare y is a boolean under the YAML 1.1 type rules.
      affix: "y"
      direction: right
    numeric_probability: 0.5
    numeric_affix_probability: 0.5

  # Weights add up to 1.0: 0.24 per Cyrillic word, 0.01 per Latin form.
  alternatives:
    - alternative: *sever
      probability: 0.24
    - alternative: *sever_latin
      probability: 0.01
    - alternative: *istok
      probability: 0.24
    - alternative: *istok_latin
      probability: 0.01
    - alternative: *yug
      probability: 0.24
    - alternative: *yug_latin
      probability: 0.01
    - alternative: *zapad
      probability: 0.24
    - alternative: *zapad_latin
      probability: 0.01
# Entrance phrases: Cyrillic "вход"/"вх" and the Latin transliteration
# "vkhod"/"vkh".
entrances:
  vkhod: &vkhod
    canonical: вход
    abbreviated: вх
    sample: true
    # canonical + abbreviated + sample = 1.0
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    numeric:
      direction: left
  vkhod_latin: &vkhod_latin
    canonical: vkhod
    abbreviated: vkh
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.4
    sample_probability: 0.2
    numeric:
      direction: left

  # вход 1, вход A, etc.
  # The four *_probability values add up to 1.0; a single letter is by far
  # the most likely identifier.
  alphanumeric:
    default: *vkhod
    probability: 0.99
    alternatives:
      - alternative: *vkhod_latin
        probability: 0.01
    numeric_probability: 0.1  # e.g. вх 1
    alpha_probability: 0.85  # e.g. вх A
    numeric_plus_alpha_probability: 0.025  # e.g. 1A
    alpha_plus_numeric_probability: 0.025  # e.g. A1
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1
# Staircase phrases and the weights for generating staircase identifiers.
# NOTE(review): this section has lost its indentation, so the nesting is not
# visible here. In particular it is unclear whether `directional` belongs
# inside `alphanumeric` or next to it — restore from a known-good copy.
staircases:
# Cyrillic phrase, anchored as `stulbishte`.
stulbishte: &stulbishte
canonical: стълбище
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
# Latin transliteration with the same weights, anchored as `stulbishte_latin`.
stulbishte_latin: &stulbishte_latin
canonical: stŭlbishte
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
# Phrase choice: Cyrillic 0.99, Latin 0.01.
alphanumeric: &staircase_alphanumeric
default: *stulbishte
probability: 0.99
alternatives:
- alternative: *stulbishte_latin
probability: 0.01
# The four identifier-form probabilities below add up to 1.0.
numeric_probability: 0.75
alpha_probability: 0.2
numeric_plus_alpha_probability: 0.025
alpha_plus_numeric_probability: 0.025
alpha_plus_numeric:
whitespace_probability: 0.1
numeric_plus_alpha:
whitespace_probability: 0.1
directional:
direction: right
direction_probability: 0.85
# Modifier words reuse the anchors from `directions` and
# `cardinal_directions`; the twelve weights below add up to 1.0.
modifier:
alternatives:
- alternative: *dyasno
probability: 0.19
- alternative: *dyasno_latin
probability: 0.01
- alternative: *lyavo
probability: 0.19
- alternative: *lyavo_latin
probability: 0.01
- alternative: *sever
probability: 0.14
- alternative: *sever_latin
probability: 0.01
- alternative: *yug
probability: 0.14
- alternative: *yug_latin
probability: 0.01
- alternative: *istok
probability: 0.14
- alternative: *istok_latin
probability: 0.01
- alternative: *zapad
probability: 0.14
- alternative: *zapad_latin
probability: 0.01
# Post-office-box phrases and the weights for generating box identifiers.
# NOTE(review): this section has lost its indentation, so the nesting is not
# visible here. It is unclear whether `add_number_phrase*` sits under
# `numeric` or on the phrase itself, and whether `digits` sits under
# `alphanumeric` or next to it — restore from a known-good copy.
po_boxes:
# Cyrillic phrase; the abbreviation is the most likely form (0.5).
poshtenska_kutiya: &poshtenska_kutiya
canonical: пощенска кутия
abbreviated: пк
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.5
sample_probability: 0.2
numeric:
direction: left
add_number_phrase: true
add_number_phrase_probability: 0.2
# Latin transliteration with the same weights.
poshtenska_kutiya_latin: &poshtenska_kutiya_latin
canonical: poshtenska kutiya
abbreviated: pk
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.5
sample_probability: 0.2
numeric:
direction: left
add_number_phrase: true
add_number_phrase_probability: 0.2
# Phrase choice: Cyrillic 0.8, Latin 0.2.
alphanumeric:
default: *poshtenska_kutiya
probability: 0.8
alternatives:
- alternative: *poshtenska_kutiya_latin
probability: 0.2
# The four identifier-form probabilities below add up to 1.0.
numeric_probability: 0.9 # p.k 123
alpha_probability: 0.05 # p.k А
numeric_plus_alpha_probability: 0.04 # p.k 123А
alpha_plus_numeric_probability: 0.01 # p.k А123
alpha_plus_numeric:
whitespace_probability: 0.1
numeric_plus_alpha:
whitespace_probability: 0.1
# Length/probability pairs; the probabilities add up to 1.0 and length 4 is
# the most likely (0.5).
digits:
- length: 1
probability: 0.05
- length: 2
probability: 0.1
- length: 3
probability: 0.2
- length: 4
probability: 0.5
- length: 5
probability: 0.1
- length: 6
probability: 0.05
# Unit phrases (apartment, room, office) with Latin transliterations,
# followed by the default choice weights and per-zone overrides.
units:
  apartament: &apartament
    canonical: апартамент
    abbreviated: ап
    sample: true
    # canonical + abbreviated + sample = 1.0
    canonical_probability: 0.3
    abbreviated_probability: 0.6
    sample_probability: 0.1
    numeric:
      direction: left
  apartament_latin: &apartament_latin
    canonical: apartament
    abbreviated: ap
    sample: true
    canonical_probability: 0.3
    abbreviated_probability: 0.6
    sample_probability: 0.1
    numeric:
      direction: left

  staya: &staya
    canonical: стая
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left
  staya_latin: &staya_latin
    canonical: staya
    sample: true
    canonical_probability: 0.8
    sample_probability: 0.2
    numeric:
      direction: left

  ofis: &ofis
    canonical: офис
    abbreviated: оф
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.5
    sample_probability: 0.1
    numeric:
      direction: left
  ofis_latin: &ofis_latin
    canonical: ofis
    abbreviated: of
    sample: true
    canonical_probability: 0.4
    abbreviated_probability: 0.5
    sample_probability: 0.1
    numeric:
      direction: left

  # Default choice: phrase weights add up to 1.0, and so do the four
  # identifier-form probabilities.
  alphanumeric: &unit_alphanumeric
    default: *apartament
    probability: 0.65
    alternatives:
      - alternative: *apartament_latin
        probability: 0.05
      - alternative: *staya
        probability: 0.25
      - alternative: *staya_latin
        probability: 0.05
    numeric_probability: 0.9  # e.g. ап 1
    numeric_plus_alpha_probability: 0.03  # e.g. 1А
    alpha_plus_numeric_probability: 0.03  # e.g. А1
    alpha_probability: 0.04  # e.g. ап А
    alpha_plus_numeric:
      whitespace_probability: 0.1
    numeric_plus_alpha:
      whitespace_probability: 0.1
    # If there are 10 floors, create unit numbers like #301 or #1032
    use_floor_probability: 0.1

  # Zone-specific overrides of the default choice above.
  zones:
    commercial:
      default: *ofis
      probability: 0.75
      alternatives:
        - alternative: *ofis_latin
          probability: 0.05
        - alternative: *staya
          probability: 0.15
        - alternative: *staya_latin
          probability: 0.05
      numeric_probability: 0.95  # e.g. ofis 1
      numeric_plus_alpha_probability: 0.01  # e.g. ofis 1A
      alpha_plus_numeric_probability: 0.01  # e.g. of A1
      alpha_probability: 0.03  # e.g. of A
      alpha_plus_numeric:
        whitespace_probability: 0.1
      numeric_plus_alpha:
        whitespace_probability: 0.1
    university:
      default: *staya
      probability: 0.95
      alternatives:
        - alternative: *staya_latin
          probability: 0.05
      numeric_probability: 0.95  # e.g. staya 1
      numeric_plus_alpha_probability: 0.01  # e.g. staya 1A
      alpha_plus_numeric_probability: 0.01  # e.g. staya A1
      alpha_probability: 0.03  # e.g. staya A
      alpha_plus_numeric:
        whitespace_probability: 0.1
      numeric_plus_alpha:
        whitespace_probability: 0.1