diff --git a/resources/addresses/hr.yaml b/resources/addresses/hr.yaml new file mode 100644 index 00000000..50fe2795 --- /dev/null +++ b/resources/addresses/hr.yaml @@ -0,0 +1,586 @@ +# hr.yaml +# ------- +# Croatian language specification + +components: + level: + null_probability: 0.9 + alphanumeric_probability: 0.1 + + staircase: + null_probability: 0.99 + alphanumeric_probability: 0.01 + + entrance: + null_probability: 0.999 + alphanumeric_probability: 0.001 + + unit: + null_probability: 0.7 + alphanumeric_probability: 0.3 + + combinations: + - + components: + - house_number + - staircase + - level + - unit + label: house_number + separators: + - separator: "/" + probability: 0.95 + - separator: "-" + probability: 0.05 + probability: 0.005 + - + components: + - house_number + - level + - unit + label: house_number + separators: + - separator: "/" + probability: 0.95 + - separator: "-" + probability: 0.05 + probability: 0.005 + - + components: + - house_number + - level + label: house_number + separators: + - separator: "/" + probability: 0.95 + - separator: "-" + probability: 0.05 + probability: 0.1 + # For unit types like 2/34 + - + components: + - house_number + - unit + label: house_number + separators: + - separator: "/" + probability: 0.95 + - separator: "-" + probability: 0.05 + probability: 0.005 + + +numbers: + no_number: + default: + canonical: bez broja + abbreviated: bb + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + + default: &broj + canonical: broj + abbreviated: br + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.6 + sample_probability: 0.1 + numeric: + direction: left + numeric_affix: + affix: "br." + whitespace_probability: 0.6 + direction: left + numeric_probability: 0.4 + numeric_affix_probability: 0.6 + + alphanumeric_phrase_probability: 0.05 + no_number_probability: 0.05 + + +and: + default: &i + canonical: i + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + + +cross_streets: + i: *i + at: &na + canonical: na + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + corner: &ugao + canonical: ugao + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + corner_of: &uglu + canonical: uglu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + na_uglu: &na_uglu + canonical: na uglu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + intersection: + default: *i + probability: 0.65 + alternatives: + - alternative: *na + probability: 0.1 + - alternative: *uglu + probability: 0.1 + - alternative: *na_uglu + probability: 0.1 + - alternative: *ugao + probability: 0.05 + + izmedu: &izmedu + canonical: između + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probability: 0.5 + between: + default: *izmedu + +levels: + kat: &kat + canonical: kat + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + add_number_phrase: true + add_number_phrase_probability: 0.1 + ordinal: + direction: right + digits: + ascii_probability: 0.3 + roman_numeral_probability: 0.7 + add_number_phrase: true + add_number_phrase_probability: 0.1 + numeric_probability: 0.4 + ordinal_probability: 0.6 + etaza: &etaza + canonical: etaža + abbreviated: et + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + add_number_phrase: true + add_number_phrase_probability: 0.1 + ordinal: + direction: right + digits: + ascii_probability: 0.3 + roman_numeral_probability: 0.7 + add_number_phrase: true + add_number_phrase_probability: 0.1 + numeric_probability: 0.4 + ordinal_probability: 0.6 + prizemlje: &prizemlje + canonical: prizemlje + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + parter: &parter + canonical: parter + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + mezanino: &polukat + canonical: polukat + half_floors: true + canonical_probability: 0.8 + sample_probability: 0.2 + sample: true + # e.g. polukat 2 + numeric: + direction: left + # e.g. 2. entresuelo + ordinal: + direction: right + numeric_probability: 0.1 + ordinal_probability: 0.2 + standalone_probability: 0.6 + podrum: &podrum + canonical: podrum + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + # e.g. подрум 1 + numeric: + direction: left + direction_probability: 0.8 + # e.g. 1. подрум + ordinal: + direction: right + digits: + ascii_probability: 0.7 + roman_numeral_probability: 0.3 + standalone_probability: 0.99 + number_abs_value: true + number_min_abs_value: 1 + numeric_probability: 0.005 + ordinal_probability: 0.005 + + aliases: + "<-1": + default: *podrum + "-1": + default: *podrum + # Special token for half-floors + half_floors: + default: *polukat + "0": + default: *prizemlje + probability: 0.5 + alternatives: + - alternative: *parter + probability: 0.4 + - alternative: *kat + probability: 0.1 + + numbering_starts_at: 0 + + alphanumeric: + default: *kat + probability: 0.95 + alternatives: + - alternative: *etaza + probability: 0.05 + numeric_probability: 0.69 # With this probability, pick an integer + roman_numeral_probability: 0.3 # Pick a Roman numeral for the actual value + alpha_probability: 0.0098 # With this probability, pick a letter e.g. A + numeric_plus_alpha_probability: 0.0001 # e.g. 2A + alpha_plus_numeric_probability: 0.0001 # e.g. A2 + + +categories: + near: + default: + canonical: u blizini + nearby: + default: + canonical: u blizini + probability: 0.6 + alternatives: + - alternative: + canonical: u blizini ovdje + probability: 0.3 + - alternative: + canonical: oko ovdje + probability: 0.1 + + near_me: + default: + canonical: u blizini mene + + # Don't worry about agreement + in: + default: + canonical: u + + # Probabilities of each phrase + near_probability: 0.35 + nearby_probability: 0.2 + near_me_probability: 0.1 + in_probability: 0.35 + +directions: + right: &desno + canonical: desno + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + left: &lijevo + canonical: lijevo + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + alternatives: + - alternative: *desno + probability: 0.5 + - alternative: *lijevo + probability: 0.5 + +cardinal_directions: + east: &istok + canonical: istok + abbreviated: i + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: i + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + west: &zapad + canonical: zapad + abbreviated: z + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: z + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + north: &sjever + canonical: sjever + abbreviated: s + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: s + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + south: &jug + canonical: jug + abbreviated: j + sample: true + canonical_probability: 0.75 + abbreviated_probability: 0.1 + sample_probability: 0.15 + numeric: + direction: right + numeric_affix: + affix: j + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + alternatives: + - alternative: *sjever + probability: 0.25 + - alternative: *istok + probability: 0.23 + - alternative: *jug + probability: 0.23 + - alternative: *zapad + probability: 0.23 + +entrances: + ulaz: &ulaz + canonical: ulaz + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + # Ulaz 1, Ulaz A, etc. + alphanumeric: &entrance_alphanumeric + default: *ulaz + numeric_probability: 0.1 # e.g. Ulaz 1 + alpha_probability: 0.85 # e.g. Ulaz A + numeric_plus_alpha_probability: 0.025 # e.g. 1A + alpha_plus_numeric_probability: 0.025 # e.g. A1 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + +staircases: + stubiste: &stubiste + canonical: stubište + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + + alphanumeric: &staircase_alphanumeric + default: *stubiste + numeric_probability: 0.75 + alpha_probability: 0.2 + numeric_plus_alpha_probability: 0.025 + alpha_plus_numeric_probability: 0.025 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + directional: + direction: right + direction_probability: 0.85 + modifier: + alternatives: + - alternative: *desno + probability: 0.2 + - alternative: *lijevo + probability: 0.2 + - alternative: *sjever + probability: 0.15 + - alternative: *jug + probability: 0.15 + - alternative: *istok + probability: 0.15 + - alternative: *zapad + probability: 0.15 + +po_boxes: + postanski_pretinac: &postanski_pretinac + canonical: poštanski pretinac + abbreviated: p.p + sample: true + canonical_probability: 0.2 + abbreviated_probability: 0.4 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + + alphanumeric: + default: *postanski_pretinac + numeric_probability: 0.9 # pp 123 + alpha_probability: 0.05 # p.p A + numeric_plus_alpha_probability: 0.04 # pp 123G + alpha_plus_numeric_probability: 0.01 # pp A123 + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + digits: + - length: 1 + probability: 0.05 + - length: 2 + probability: 0.1 + - length: 3 + probability: 0.2 + - length: 4 + probability: 0.5 + - length: 5 + probability: 0.1 + - length: 6 + probability: 0.05 + +units: + stan: &stan + canonical: stan + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + apartman: &apartman + canonical: apartman + abbreviated: ap + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.2 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + + soba: &soba + canonical: soba + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + ured: &ured + canonical: ured + sample: true + canonical_probability: 0.6 + sample_probability: 0.4 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.1 + + alphanumeric: &unit_alphanumeric + default: *stan + probability: 0.6 + alternatives: + - alternative: *apartman + probability: 0.3 + - alternative: *soba + probability: 0.1 + numeric_probability: 0.9 # e.g. stan. 1 + numeric_plus_alpha_probability: 0.03 # e.g. 1A + alpha_plus_numeric_probability: 0.03 # e.g. A1 + alpha_probability: 0.04 # e.g. stan A + + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + # If there are 10 floors, create unit numbers like #301 or #1032 + use_floor_probability: 0.05 + + zones: + commercial: &commercial_unit_types + default: *soba + probability: 0.6 + alternatives: + - alternative: *ured + probability: 0.4 + numeric_probability: 0.95 # e.g. soba 1 + numeric_plus_alpha_probability: 0.01 # e.g. soba 1A + alpha_plus_numeric_probability: 0.01 # e.g. soba A1 + alpha_probability: 0.03 # e.g. soba A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + university: + default: *soba + numeric_probability: 0.95 # e.g. soba 1 + numeric_plus_alpha_probability: 0.01 # e.g. soba 1A + alpha_plus_numeric_probability: 0.01 # e.g. soba A1 + alpha_probability: 0.03 # e.g. soba A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1