diff --git a/resources/addresses/sr.yaml b/resources/addresses/sr.yaml new file mode 100644 index 00000000..abad9c22 --- /dev/null +++ b/resources/addresses/sr.yaml @@ -0,0 +1,899 @@ +# sr.yaml +# ------- +# Serbian language specification + +alphabet: абвгдђежзијклљмнњопрстћуфхцчџш +alphanumeric_probability: 0.7 + +components: + level: + null_probability: 0.95 + alphanumeric_probability: 0.04 + standalone_probability: 0.01 + + staircase: + null_probability: 0.99 + alphanumeric_probability: 0.01 + + entrance: + null_probability: 0.999 + alphanumeric_probability: 0.001 + + unit: + null_probability: 0.7 + alphanumeric_probability: 0.3 + + # Note: no combinations because of the house numbering scheme + + +numbers: + default: &broj + canonical: број + abbreviated: бр + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.6 + sample_probability: 0.1 + numeric: + direction: left + numeric_affix: + affix: "бр." + direction: left + numeric_probability: 0.4 + numeric_affix_probability: 0.6 + alternatives: + - alternative: &broj_latin + canonical: broj + abbreviated: br + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.6 + sample_probability: 0.1 + numeric: + direction: left + numeric_affix: + affix: "br." + direction: left + numeric_probability: 0.4 + numeric_affix_probability: 0.6 + + +and: + default: &i + canonical: и + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.9 + alternatives: + - alternative: &i_latin + canonical: i + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.1 + + +cross_streets: + i: *i + i_latin: *i_latin + at: &na + canonical: на + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + na_latin: &na_latin + canonical: na + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + corner: &ugao + canonical: угао + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + ugao_latin: &ugao_latin + canonical: ugao + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + na_uglu: &na_uglu + canonical: на углу + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + na_uglu_latin: &na_uglu_latin + canonical: na uglu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + intersection: + default: *i + probability: 0.65 + alternatives: + - alternative: *i_latin + probability: 0.05 + - alternative: *na + probability: 0.075 + - alternative: *na_latin + probability: 0.025 + - alternative: *ugao + probability: 0.1 + - alternative: *ugao_latin + probability: 0.05 + - alternative: *na_uglu + probability: 0.025 + - alternative: *na_uglu_latin + probability: 0.025 + izmedu: &izmedu + canonical: између + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probability: 0.5 + izmedu_latin: &izmedu_latin + canonical: između + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probability: 0.5 + between: + default: *izmedu + probability: 0.9 + alternatives: + - alternative: *izmedu_latin + probability: 0.1 + +levels: + sprat: &sprat + canonical: спрат + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + add_number_phrase: true + add_number_phrase_probability: 0.1 + ordinal: + direction: right + digits: + ascii_probability: 0.3 + roman_numeral_probability: 0.7 + add_number_phrase: true + add_number_phrase_probability: 0.1 + numeric_probability: 0.4 + ordinal_probability: 0.6 + sprat_latin: &sprat_latin + canonical: sprat + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + add_number_phrase: true + add_number_phrase_probability: 0.1 + ordinal: + direction: right + digits: + ascii_probability: 0.3 + roman_numeral_probability: 0.7 + add_number_phrase: true + add_number_phrase_probability: 0.1 + numeric_probability: 0.4 + ordinal_probability: 0.6 + kat: &kat + canonical: кат + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + add_number_phrase: true + add_number_phrase_probability: 0.1 + ordinal: + direction: right + digits: + ascii_probability: 0.3 + roman_numeral_probability: 0.7 + add_number_phrase: true + add_number_phrase_probability: 0.1 + numeric_probability: 0.4 + ordinal_probability: 0.6 + kat_latin: &kat_latin + canonical: kat + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + add_number_phrase: true + add_number_phrase_probability: 0.1 + ordinal: + direction: right + digits: + ascii_probability: 0.3 + roman_numeral_probability: 0.7 + add_number_phrase: true + add_number_phrase_probability: 0.1 + numeric_probability: 0.4 + ordinal_probability: 0.6 + etaza: &etaza + canonical: етажа + abbreviated: ет + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + add_number_phrase: true + add_number_phrase_probability: 0.1 + ordinal: + direction: right + digits: + ascii_probability: 0.3 + roman_numeral_probability: 0.7 + add_number_phrase: true + add_number_phrase_probability: 0.1 + numeric_probability: 0.4 + ordinal_probability: 0.6 + etaza_latin: &etaza_latin + canonical: etaža + abbreviated: et + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + add_number_phrase: true + add_number_phrase_probability: 0.1 + ordinal: + direction: right + digits: + ascii_probability: 0.3 + roman_numeral_probability: 0.7 + add_number_phrase: true + add_number_phrase_probability: 0.1 + numeric_probability: 0.4 + ordinal_probability: 0.6 + prizemlje: &prizemlje + canonical: приземље + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + prizemlje_latin: &prizemlje_latin + canonical: prizemlje + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + parter: &parter + canonical: партер + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + parter_latin: &parter_latin + canonical: parter + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + + podrum: &podrum + canonical: подрум + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + # e.g. подрум 1 + numeric: + direction: left + direction_probability: 0.8 + # e.g. 1. подрум + ordinal: + direction: right + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + standalone_probability: 0.99 + number_abs_value: true + number_min_abs_value: 1 + numeric_probability: 0.005 + ordinal_probability: 0.005 + podrum_latin: &podrum_latin + canonical: podrum + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + # e.g. подрум 1 + numeric: + direction: left + direction_probability: 0.8 + # e.g. 1. подрум + ordinal: + direction: right + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + standalone_probability: 0.99 + number_abs_value: true + number_min_abs_value: 1 + numeric_probability: 0.005 + ordinal_probability: 0.005 + + aliases: + "<-1": + default: *podrum + probability: 0.8 + alternatives: + - alternative: *podrum_latin + probability: 0.2 + "-1": + default: *podrum + probability: 0.8 + alternatives: + - alternative: *podrum_latin + probability: 0.2 + "0": + default: *prizemlje + probability: 0.45 + alternatives: + - alternative: *prizemlje_latin + probability: 0.05 + - alternative: *parter + probability: 0.35 + - alternative: *parter_latin + probability: 0.05 + - alternative: *sprat + probability: 0.04 + - alternative: *sprat_latin + probability: 0.01 + - alternative: *kat + probability: 0.04 + - alternative: *kat_latin + probability: 0.01 + + numbering_starts_at: 0 + + alphanumeric: + default: *sprat + probability: 0.65 + alternatives: + - alternative: *sprat_latin + probability: 0.1 + - alternative: *kat + probability: 0.15 + - alternative: *kat_latin + probability: 0.05 + - alternative: *etaza + probability: 0.04 + - alternative: *etaza_latin + probability: 0.01 + numeric_probability: 0.99 # With this probability, pick an integer + alpha_probability: 0.0098 # With this probability, pick a letter e.g. A + numeric_plus_alpha_probability: 0.0001 # e.g. 2A + alpha_plus_numeric_probability: 0.0001 # e.g. A2 + +directions: + right: &desno + canonical: десно + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + desno_latin: &desno_latin + canonical: desno + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + left: &levo + canonical: лево + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + levo_latin: &levo_latin + canonical: лево + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + alternatives: + - alternative: *desno + probability: 0.45 + - alternative: *desno_latin + probability: 0.05 + - alternative: *levo + probability: 0.45 + - alternative: *levo_latin + probability: 0.05 + +cardinal_directions: + east: &istok + canonical: исток + abbreviated: и + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: и + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + istok_latin: &istok_latin + canonical: istok + abbreviated: i + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: i + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + west: &zapad + canonical: запад + abbreviated: з + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: з + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + zapad_latin: &zapad_latin + canonical: zapad + abbreviated: z + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: z + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + north: &sever + canonical: север + abbreviated: с + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: с + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + sever_latin: &sever_latin + canonical: sever + abbreviated: s + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: s + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + south: &jug + canonical: југ + abbreviated: ј + sample: true + canonical_probability: 0.75 + abbreviated_probability: 0.1 + sample_probability: 0.15 + numeric: + direction: right + numeric_affix: + affix: ј + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + jug_latin: &jug_latin + canonical: jug + abbreviated: j + sample: true + canonical_probability: 0.75 + abbreviated_probability: 0.1 + sample_probability: 0.15 + numeric: + direction: right + numeric_affix: + affix: j + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + alternatives: + - alternative: *sever + probability: 0.23 + - alternative: *sever_latin + probability: 0.02 + - alternative: *istok + probability: 0.23 + - alternative: *istok_latin + probability: 0.02 + - alternative: *jug + probability: 0.23 + - alternative: *jug_latin + probability: 0.02 + - alternative: *zapad + probability: 0.23 + - alternative: *zapad_latin + probability: 0.02 + +entrances: + ulaz: &ulaz + canonical: улаз + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + ulaz_latin: &ulaz_latin + canonical: ulaz + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + # Ulaz 1, Ulaz A, etc. + alphanumeric: &entrance_alphanumeric + default: *ulaz + probability: 0.8 + alternatives: + - alternative: *ulaz_latin + probability: 0.2 + numeric_probability: 0.1 # e.g. Ulaz 1 + alpha_probability: 0.85 # e.g. Ulaz A + numeric_plus_alpha_probability: 0.025 # e.g. 1A + alpha_plus_numeric_probability: 0.025 # e.g. A1 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + + +staircases: + stepeniste: &stepeniste + canonical: степениште + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + stepeniste_latin: &stepeniste_latin + canonical: stepenište + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + + alphanumeric: &staircase_alphanumeric + default: *stepeniste + probability: 0.8 + alternatives: + - alternative: *stepeniste_latin + probability: 0.2 + numeric_probability: 0.75 + alpha_probability: 0.2 + numeric_plus_alpha_probability: 0.025 + alpha_plus_numeric_probability: 0.025 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + directional: + direction: right + direction_probability: 0.85 + modifier: + alternatives: + - alternative: *desno + probability: 0.2 + - alternative: *levo + probability: 0.2 + - alternative: *sever + probability: 0.14 + - alternative: *sever_latin + probability: 0.01 + - alternative: *jug + probability: 0.14 + - alternative: *jug_latin + probability: 0.01 + - alternative: *istok + probability: 0.14 + - alternative: *istok_latin + probability: 0.01 + - alternative: *zapad + probability: 0.14 + - alternative: *zapad_latin + probability: 0.01 + +po_boxes: + postanski_fah: &postanski_fah + canonical: поштански фах + abbreviated: пф + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 # poštanski fah br. 1234 + postanski_fah_latin: &postanski_fah_latin + canonical: poštanski fah + abbreviated: pf + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 # poštanski fah br. 1234 + postanski_pretinac: &postanski_pretinac + canonical: поштански претинац + sample: true + canonical_probability: 0.6 + sample_probability: 0.5 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + postanski_pretinac_latin: &postanski_pretinac_latin + canonical: poštanski pretinac + sample: true + canonical_probability: 0.6 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + postanski_pregradak: &postanski_pregradak + canonical: поштански преградак + sample: true + canonical_probability: 0.6 + sample_probability: 0.5 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + postanski_pregradak_latin: &postanski_pregradak_latin + canonical: poštanski pregradak + sample: true + canonical_probability: 0.6 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + + alphanumeric: + default: *postanski_fah + probability: 0.7 + alternatives: + - alternative: *postanski_fah_latin + probability: 0.05 + - alternative: *postanski_pretinac + probability: 0.1 + - alternative: *postanski_pretinac_latin + probability: 0.05 + - alternative: *postanski_pregradak + probability: 0.075 + - alternative: *postanski_pregradak_latin + probability: 0.025 + numeric_probability: 0.9 # pf 123 + alpha_probability: 0.05 # pf A + numeric_plus_alpha_probability: 0.04 # pf 123G + alpha_plus_numeric_probability: 0.01 # pf A123 + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + digits: + - length: 1 + probability: 0.05 + - length: 2 + probability: 0.1 + - length: 3 + probability: 0.2 + - length: 4 + probability: 0.5 + - length: 5 + probability: 0.1 + - length: 6 + probability: 0.05 + +units: + stan: &stan + canonical: стан + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + stan_latin: &stan_latin + canonical: stan + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + apartman: &apartman + canonical: апартман + abbreviated: ап + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.2 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + + apartman_latin: &apartman_latin + canonical: apartman + abbreviated: ap + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.2 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + + soba: &soba + canonical: соба + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + soba_latin: &soba_latin + canonical: soba + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + kancelarija: &kancelarija + canonical: канцеларија + sample: true + canonical_probability: 0.6 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + kancelarija_latin: &kancelarija_latin + canonical: kancelarija + sample: true + canonical_probability: 0.6 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + + alphanumeric: &unit_alphanumeric + default: *stan + probability: 0.5 + alternatives: + - alternative: *stan_latin + probability: 0.1 + - alternative: *apartman + probability: 0.2 + - alternative: *apartman_latin + probability: 0.05 + - alternative: *soba + probability: 0.1 + - alternative: *soba_latin + probability: 0.05 + numeric_probability: 0.9 # e.g. stan. 1 + numeric_plus_alpha_probability: 0.03 # e.g. 1A + alpha_plus_numeric_probability: 0.03 # e.g. A1 + alpha_probability: 0.04 # e.g. stan A + + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + # If there are 10 floors, create unit numbers like #301 or #1032 + use_floor_probability: 0.01 + + zones: + commercial: &commercial_unit_types + default: *soba + probability: 0.55 + alternatives: + - alternative: *soba_latin + probability: 0.05 + - alternative: *kancelarija + probability: 0.35 + - alternative: *kancelarija_latin + probability: 0.05 + numeric_probability: 0.95 # e.g. soba 1 + numeric_plus_alpha_probability: 0.01 # e.g. soba 1A + alpha_plus_numeric_probability: 0.01 # e.g. soba A1 + alpha_probability: 0.03 # e.g. soba A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + university: + default: *soba + probability: 0.9 + alternatives: + - alternative: *soba_latin + probability: 0.1 + numeric_probability: 0.95 # e.g. soba 1 + numeric_plus_alpha_probability: 0.01 # e.g. soba 1A + alpha_plus_numeric_probability: 0.01 # e.g. soba A1 + alpha_probability: 0.03 # e.g. soba A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1