diff --git a/resources/addresses/uk.yaml b/resources/addresses/uk.yaml new file mode 100644 index 00000000..ebfcd9df --- /dev/null +++ b/resources/addresses/uk.yaml @@ -0,0 +1,983 @@ +# uk.yaml +# ------- +# Ukrainian language specification + +alphabet: абвгґдеєжзиіїйклмнопрстуфхцчшщьюя +alphabet_probability: 0.7 + +components: + level: + null_probability: 0.95 + alphanumeric_probability: 0.04 + standalone_probability: 0.01 + + staircase: + null_probability: 0.99 + alphanumeric_probability: 0.01 + + entrance: + null_probability: 0.999 + alphanumeric_probability: 0.001 + + unit: + null_probability: 0.6 + alphanumeric_probability: 0.4 + + +numbers: + default: &nomer + canonical: номер + abbreviated: № + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + probability: 0.95 + alternatives: + - alternative: &nomer_latin + canonical: nomer + abbreviated: "no" + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + probability: 0.05 + + +house_number: + budnyok: &budnyok + canonical: будинок + abbreviated: буд + sample: true + canonical_probability: 0.6 + abbreviated_probability: 0.3 + sample_probability: 0.1 + numeric: + direction: left + budnyok_latin: &budnyok_latin + canonical: budynok + abbreviated: bud + sample: true + canonical_probability: 0.6 + abbreviated_probability: 0.3 + sample_probability: 0.1 + numeric: + direction: left + + dom: &dom + canonical: дом + abbreviated: д + sample: true + canonical_probability: 0.6 + abbreviated_probability: 0.3 + sample_probability: 0.1 + numeric: + direction: left + dom_latin: &dom_latin + canonical: dom + abbreviated: d + sample: true + canonical_probability: 0.6 + abbreviated_probability: 0.3 + sample_probability: 0.1 + numeric: + direction: left + + alphanumeric: + default: *budnyok + probability: 0.65 + alternatives: + - alternative: *budnyok_latin + probability: 0.05 + - 
alternative: *dom + probability: 0.25 + - alternative: *dom_latin + probability: 0.05 + + # Very common in Ukrainian to write bud/dom + alphanumeric_phrase_probability: 0.6 + + + +and: + default: &i + canonical: і + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.9 + alternatives: + - alternative: &i_latin + canonical: i + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.1 + + +cross_streets: + i: *i + i_latin: *i_latin + kut: &kut + canonical: кут + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + kut_latin: &kut_latin + canonical: kut + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + rozi: &rozi + canonical: розі + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + rozi_latin: &rozi_latin + canonical: rozi + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + na_rozi: &na_rozi + canonical: на розі + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + na_rozi_latin: &na_rozi_latin + canonical: na rozi + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + intersection: + default: *i + probability: 0.65 + alternatives: + - alternative: *i_latin + probability: 0.05 + - alternative: *rozi + probability: 0.075 + - alternative: *rozi_latin + probability: 0.075 + - alternative: *na_rozi + probability: 0.05 + - alternative: *na_rozi_latin + probability: 0.05 + - alternative: *kut + probability: 0.025 + - alternative: *kut_latin + probability: 0.025 + mizh: &mizh + canonical: між + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probability: 0.5 + mizh_latin: &mizh_latin + canonical: mizh + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probability: 0.5 + between: + default: *mizh + probability: 0.9 + alternatives: + - alternative: *mizh_latin + probability: 0.1 + + +levels: + poverkh: &poverkh + canonical: поверх + 
abbreviated: пов + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.3 + sample_probability: 0.3 + numeric: + direction: left + direction_probability: 0.9 + ordinal: + direction: right + numeric_probability: 0.4 + ordinal_probability: 0.6 + poverkh_latin: &poverkh_latin + canonical: poverkh + abbreviated: pov + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.3 + sample_probability: 0.3 + numeric: + direction: left + direction_probability: 0.9 + ordinal: + direction: right + numeric_probability: 0.4 + ordinal_probability: 0.6 + riven: &riven + canonical: рівень + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + numeric: + direction: left + direction_probability: 0.9 + ordinal: + direction: right + numeric_probability: 0.4 + ordinal_probability: 0.6 + riven_latin: &riven_latin + canonical: riven' + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + numeric: + direction: left + direction_probability: 0.9 + ordinal: + direction: right + numeric_probability: 0.4 + ordinal_probability: 0.6 + pershyy_poverkh: &pershyy_poverkh + canonical: перший поверх + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + pershyy_poverkh_latin: &pershyy_poverkh_latin + canonical: pershyy poverkh + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + nyzhniy_poverkh: &nyzhniy_poverkh + canonical: нижній поверх + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + nyzhniy_poverkh_latin: &nyzhniy_poverkh_latin + canonical: nyzhniy poverkh + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + + tsokolnyy_poverkh: &tsokolnyy_poverkh + canonical: цокольний поверх + abbreviated: цок пов + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + tsokolnyy_poverkh_latin: &tsokolnyy_poverkh_latin + canonical: tsokolʹnyy poverkh + abbreviated: tsok pov + sample: true + canonical_probability: 0.4 + 
abbreviated_probability: 0.4 + sample_probability: 0.2 + pidval: &pidval + canonical: підвал + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + numeric_affix: + affix: п + direction: left + ordinal: + direction: right + ordinal: + direction: right + number_abs_value: true + number_min_abs_value: 2 + # Basement 2 == Sub-basement 1 + number_subtract_abs_value: 1 + standalone_probability: 0.985 + numeric_probability: 0.005 + numeric_affix_probability: 0.005 + ordinal_probability: 0.005 + pidval_latin: &pidval_latin + canonical: pidval + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + numeric_affix: + affix: p + direction: left + ordinal: + direction: right + ordinal: + direction: right + number_abs_value: true + number_min_abs_value: 2 + # Basement 2 == Sub-basement 1 + number_subtract_abs_value: 1 + standalone_probability: 0.985 + numeric_probability: 0.005 + numeric_affix_probability: 0.005 + ordinal_probability: 0.005 + + aliases: + "<-1": + default: *pidval + probability: 0.9 + alternatives: + - alternative: *pidval_latin + probability: 0.1 + "-1": &ground_floor + default: *tsokolnyy_poverkh + probability: 0.89 + alternatives: + - alternative: *tsokolnyy_poverkh_latin + probability: 0.01 + - alternative: *poverkh + probability: 0.09 + - alternative: *poverkh_latin + probability: 0.01 + "0": + default: *pershyy_poverkh + probability: 0.6 + alternatives: + - alternative: *pershyy_poverkh_latin + probability: 0.05 + - alternative: *nyzhniy_poverkh + probability: 0.2 + - alternative: *nyzhniy_poverkh_latin + probability: 0.05 + - alternative: *tsokolnyy_poverkh + probability: 0.075 + - alternative: *tsokolnyy_poverkh_latin + probability: 0.025 + numbering_starts_at: 0 + + alphanumeric: + default: *poverkh + probability: 0.8 + alternatives: + - alternative: *poverkh_latin + probability: 0.1 + - alternative: 
*riven + probability: 0.09 + - alternative: *riven_latin + probability: 0.01 + numeric_probability: 0.99 # With this probability, pick an integer + alpha_probability: 0.0098 # With this probability, pick a letter e.g. A + numeric_plus_alpha_probability: 0.0001 # e.g. 2A + alpha_plus_numeric_probability: 0.0001 # e.g. A2 + + + +categories: + near: + default: + canonical: поруч + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.59 + alternatives: + - alternative: + canonical: poruch + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: поблизу + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.19 + - alternative: + canonical: poblyzu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: близько + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: blyzʹko + sample: true + canonical_probability: 0.6 + sample_probability: 0.4 + probability: 0.01 + - alternative: + canonical: у + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: u + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: біля + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: bilya + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: поруч з + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: poruch z + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + + nearby: + default: + canonical: поблизу + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + 
probability: 0.64 + alternatives: + - alternative: + canonical: poblyzu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: поруч тут + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.19 + - alternative: + canonical: poruch tut + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: тут + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.09 + - alternative: + canonical: tut + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: поруч + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: poruch + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + near_me: + default: + canonical: поруч зі мною + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.99 + alternatives: + - alternative: + canonical: poruch zi mnoyu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + in: + default: + canonical: в + probability: 0.99 + alternatives: + - alternative: + canonical: v + probability: 0.01 + + # Probabilities of each phrase + near_probability: 0.35 + nearby_probability: 0.2 + near_me_probability: 0.1 + in_probability: 0.35 + + +# uk.yaml +# ------- +# Ukrainian language specification + +directions: + pravo: &pravo + canonical: право + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + pravo_latin: &pravo_latin + canonical: pravo + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + livo: &livo + canonical: ліво + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + livo_latin: &livo_latin + canonical: 
livo + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + alternatives: + - alternative: *pravo + probability: 0.49 + - alternative: *pravo_latin + probability: 0.01 + - alternative: *livo + probability: 0.49 + - alternative: *livo_latin + probability: 0.01 + + + +cardinal_directions: + shkid: &shkid + canonical: схід + abbreviated: с + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: с + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + shkid_latin: &shkid_latin + canonical: skhid + abbreviated: s + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: s + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + zakhid: &zakhid + canonical: захід + abbreviated: з + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + + zakhid_latin: &zakhid_latin + canonical: zakhid + abbreviated: z + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + + pivnikh: &pivnikh + canonical: північ + abbreviated: півн + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + + pivnikh_latin: &pivnikh_latin + canonical: pivnich + abbreviated: pivn + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + + pivden: &pivden + canonical: південь + abbreviated: півд + sample: true + canonical_probability: 0.75 + abbreviated_probability: 0.1 + sample_probability: 0.15 + numeric: + direction: right + numeric_affix: + affix: Пд + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + pivden_latin: &pivden_latin + canonical: pivden' + abbreviated: pivd + sample: true + canonical_probability: 0.55 + abbreviated_probability: 0.1 + sample_probability: 0.35 + numeric: + direction: right + + 
alternatives: + - alternative: *pivnikh + probability: 0.24 + - alternative: *pivnikh_latin + probability: 0.01 + - alternative: *shkid + probability: 0.24 + - alternative: *shkid_latin + probability: 0.01 + - alternative: *pivden + probability: 0.24 + - alternative: *pivden_latin + probability: 0.01 + - alternative: *zakhid + probability: 0.24 + - alternative: *zakhid_latin + probability: 0.01 + +entrances: + vkhid: &vkhid + canonical: вхід + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + vkhid_latin: &vkhid_latin + canonical: vkhid + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + # вхід 1, вхід A, etc. + alphanumeric: + default: *vkhid + probability: 0.99 + alternatives: + - alternative: *vkhid_latin + probability: 0.01 + numeric_probability: 0.1 # e.g. Vkhid 1 + alpha_probability: 0.85 # e.g. Vkhid A + numeric_plus_alpha_probability: 0.025 # e.g. 1A + alpha_plus_numeric_probability: 0.025 # e.g. 
A1 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + +staircases: + skhody: &skhody + canonical: сходи + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + skhody_latin: &skhody_latin + canonical: skhody + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + alphanumeric: &staircase_alphanumeric + default: *skhody + probability: 0.99 + alternatives: + - alternative: *skhody_latin + probability: 0.01 + numeric_probability: 0.75 + alpha_probability: 0.2 + numeric_plus_alpha_probability: 0.025 + alpha_plus_numeric_probability: 0.025 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + directional: + direction: left + direction_probability: 0.85 + modifier: + alternatives: + - alternative: *pivnikh + - alternative: *shkid + - alternative: *pivden + - alternative: *zakhid + +po_boxes: + abonementnykh_skrynka: &abonementnykh_skrynka + canonical: абонементна скринька + abbreviated: а/с + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + abonementnykh_skrynka_latin: &abonementnykh_skrynka_latin + canonical: abonementna skrynʹka + abbreviated: a/s + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.1 + sample_probability: 0.5 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + + alphanumeric: + default: *abonementnykh_skrynka + probability: 0.99 + alternatives: + - alternative: *abonementnykh_skrynka_latin + probability: 0.01 + numeric_probability: 0.9 # 123 + alpha_probability: 0.05 # А + numeric_plus_alpha_probability: 0.04 # 123А + alpha_plus_numeric_probability: 0.01 # А123 + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + 
whitespace_probability: 0.1 + + digits: + - length: 1 + probability: 0.05 + - length: 2 + probability: 0.1 + - length: 3 + probability: 0.2 + - length: 4 + probability: 0.5 + - length: 5 + probability: 0.1 + - length: 6 + probability: 0.05 + +units: + kvartyra: &kvartyra + canonical: квартира + abbreviated: кв + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.6 + sample_probability: 0.1 + numeric: + direction: left + kvartyra_latin: &kvartyra_latin + canonical: kvartyra + abbreviated: kv + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.6 + sample_probability: 0.1 + numeric: + direction: left + + kabinet: &kabinet + canonical: кабінет + abbreviated: каб + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + kabinet_latin: &kabinet_latin + canonical: kabinet + abbreviated: kab + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + kimnata: &kimnata + canonical: кімната + abbreviated: км + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + kimnata_latin: &kimnata_latin + canonical: kimnata + abbreviated: km + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + + litera: &litera + canonical: літера + abbreviated: літ + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + litera_latin: &litera_latin + canonical: litera + abbreviated: lit + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + + ofis: &ofis + canonical: офіс + abbreviated: оф + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.5 + sample_probability: 0.1 + numeric: + direction: left 
+ ofis_latin: &ofis_latin + canonical: ofis + abbreviated: of + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.5 + sample_probability: 0.1 + numeric: + direction: left + + alphanumeric: &unit_alphanumeric + default: *kvartyra + probability: 0.89 + alternatives: + - alternative: *kvartyra_latin + probability: 0.01 + - alternative: *kimnata + probability: 0.09 + - alternative: *kimnata_latin + probability: 0.01 + + numeric_probability: 0.9 # e.g. кв 1 + numeric_plus_alpha_probability: 0.03 # e.g. 1А + alpha_plus_numeric_probability: 0.03 # e.g. А1 + alpha_probability: 0.04 # e.g. кв А + + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + # If there are 10 floors, create unit numbers like #301 or #1032 + use_floor_probability: 0.1 + + alpha: + default: *kvartyra + probability: 0.79 + alternatives: + - alternative: *kvartyra_latin + probability: 0.01 + - alternative: *kimnata + probability: 0.09 + - alternative: *kimnata_latin + probability: 0.01 + - alternative: *litera + probability: 0.09 + - alternative: *litera_latin + probability: 0.01 + + + zones: + commercial: + default: *kabinet + probability: 0.59 + alternatives: + - alternative: *kabinet_latin + probability: 0.01 + - alternative: *ofis + probability: 0.29 + - alternative: *ofis_latin + probability: 0.01 + - alternative: *kimnata + probability: 0.09 + - alternative: *kimnata_latin + probability: 0.01 + numeric_probability: 0.95 # e.g. kabinet 1 + numeric_plus_alpha_probability: 0.01 # e.g. kabinet 1A + alpha_plus_numeric_probability: 0.01 # e.g. kab A1 + alpha_probability: 0.03 # e.g. kab A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + university: + default: *kimnata + probability: 0.99 + alternatives: + - alternative: *kimnata_latin + probability: 0.01 + numeric_probability: 0.95 # e.g. kimnata 1 + numeric_plus_alpha_probability: 0.01 # e.g. 
kimnata 1A + alpha_plus_numeric_probability: 0.01 # e.g. km A1 + alpha_probability: 0.03 # e.g. km A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1