diff --git a/resources/addresses/pl.yaml b/resources/addresses/pl.yaml new file mode 100644 index 00000000..928fa515 --- /dev/null +++ b/resources/addresses/pl.yaml @@ -0,0 +1,492 @@ +# pl.yaml +# ------- +# Polish language specification. + +components: + level: + null_probability: 0.95 + alphanumeric_probability: 0.04 + standalone_probability: 0.01 + + staircase: + null_probability: 0.99 + alphanumeric_probability: 0.01 + + entrance: + null_probability: 0.999 + alphanumeric_probability: 0.001 + + unit: + null_probability: 0.75 + alphanumeric_probability: 0.25 + + combinations: + level_unit: + components: + - house_number + - unit + label: house_number + separators: + - separator: "/" + probability: 0.9 + - separator: "-" + probability: 0.05 + - separator: " - " + probability: 0.05 + probability: 0.01 + +numbers: + default: &numer + canonical: numer + abbreviated: nr + sample: true + # Probabilities + canonical_probability: 0.3 + abbreviated_probability: 0.5 + sample_probability: 0.2 + sample_exclude: + - "#" + numeric: + direction: left + numeric_affix: + affix: "#" + direction: left + + numeric_probability: 0.4 + numeric_affix_probability: 0.6 + + +house_number: + dom: &dom + canonical: dom + abbreviated: d + sample: true + canonical_probability: 0.8 + abbreviated_probability: 0.1 + sample_probability: 0.1 + numeric: + direction: left + alphanumeric: + default: *numer + probability: 0.6 + alternatives: + - alternative: *dom + probability: 0.4 + + alphanumeric_phrase_probability: 0.0001 + +and: + default: &i + canonical: i + abbreviated: "&" + canonical_probability: 0.2 + abbreviated_probability: 0.75 + sample: true + sample_probability: 0.05 + +cross_streets: + and: *i + corner_of: &rogu + canonical: rogu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + at_the_corner_of: &na_rogu + canonical: na rogu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + intersection: + default: *i + probability: 0.7 + alternatives: + - alternative: *rogu + probability: 0.15 + - alternative: *na_rogu + probability: 0.15 + + between: + canonical: pomiędzy + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probability: 0.5 + +levels: + floor: &pietro + canonical: piętro + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + numeric: + direction: left + direction_probability: 0.9 + ordinal: + direction: right + numeric_probability: 0.4 + ordinal_probability: 0.6 + parter: &parter + canonical: parter + sample: true + canonical_probability: 0.9 + sample_probability: 0.1 + suterena: &suterena + canonical: suterena + # e.g. suterena 1 + numeric: + direction: left + direction_probability: 0.8 + # e.g. s1 + numeric_affix: + affix: s + direction: left + # e.g. 1. suterena + ordinal: + direction: right + standalone_probability: 0.985 + number_abs_value: true + number_min_abs_value: 1 + numeric_probability: 0.005 + numeric_affix_probability: 0.005 + ordinal_probability: 0.005 + aliases: + "<-1": + default: *suterena + "-1": + default: *suterena + "0": + default: *parter + probability: 0.9 + alternatives: + - alternative: *pietro + probability: 0.1 + + numbering_starts_at: 0 + + alphanumeric: + default: *pietro + numeric_probability: 0.99 # With this probability, pick an integer + alpha_probability: 0.0098 # With this probability, pick a letter e.g. A + numeric_plus_alpha_probability: 0.0001 # e.g. 2A + alpha_plus_numeric_probability: 0.0001 # e.g. A2 + +categories: + near: + default: + canonical: w pobliżu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.7 + alternatives: + - alternative: + canonical: blisko + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.1 + - alternative: + canonical: koło + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + probability: 0.05 + - alternative: + canonical: niedaleko + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.05 + - alternative: + canonical: obok + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.05 + - alternative: + canonical: przy + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.05 + nearby: + default: + canonical: w pobliżu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.6 + alternatives: + - alternative: + canonical: w pobliżu tutaj + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.2 + - alternative: + canonical: wokół tutaj + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.1 + - alternative: + canonical: blisko + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.1 + near_me: + default: + canonical: w pobliżu mnie + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + + # Don't worry about agreement + in: + default: + canonical: w + probability: 0.7 + alternatives: + - alternative: + canonical: we + probability: 0.3 + + # Probabilities of each phrase + near_probability: 0.35 + nearby_probability: 0.2 + near_me_probability: 0.1 + in_probability: 0.35 + + +directions: + right: &prawo + canonical: prawo + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + left: &lewo + canonical: lewo + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + alternatives: + - alternative: *prawo + probability: 0.5 + - alternative: *lewo + probability: 0.5 + +cardinal_directions: + east: &wschod + canonical: wschód + abbreviated: w + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: w + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + west: &zachod + canonical: zachód + abbreviated: z + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: z + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + north: &polnoc + canonical: północ + abbreviated: pn + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: pn + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + south: &poludnie + canonical: południe + abbreviated: pd + sample: true + canonical_probability: 0.75 + abbreviated_probability: 0.1 + sample_probability: 0.15 + numeric: + direction: right + numeric_affix: + affix: pd + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + alternatives: + - alternative: *polnoc + probability: 0.25 + - alternative: *wschod + probability: 0.25 + - alternative: *poludnie + probability: 0.25 + - alternative: *zachod + probability: 0.25 + + +entrances: + wejscie: &wejscie + canonical: wejście + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + # Wejście 1, Wejście A, etc. + alphanumeric: &entrance_alphanumeric + default: *wejscie + numeric_probability: 0.1 # e.g. Wejście 1 + alpha_probability: 0.85 # e.g. Wejście A + numeric_plus_alpha_probability: 0.025 # e.g. 1A + alpha_plus_numeric_probability: 0.025 # e.g. A1 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + +staircases: + schody: &schody + canonical: schody + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + alphanumeric: &staircase_alphanumeric + default: *schody + numeric_probability: 0.75 + alpha_probability: 0.2 + numeric_plus_alpha_probability: 0.025 + alpha_plus_numeric_probability: 0.025 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + directional: + direction: left + direction_probability: 0.85 + modifier: + alternatives: + - alternative: *polnoc + - alternative: *poludnie + - alternative: *wschod + - alternative: *zachod + + +po_boxes: + skrytka_pocztowa: &skrytka_pocztowa + canonical: skrytka pocztowa + abbreviated: skr poczt + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.5 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 # Skr Poczt 1234 + alphanumeric: + default: *skrytka_pocztowa + numeric_probability: 0.9 # Skr Poczt 123 + alpha_probability: 0.05 # Skr Poczt A + numeric_plus_alpha_probability: 0.04 # Skr Poczt 123G + alpha_plus_numeric_probability: 0.01 # Skr Poczt A123 + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + digits: + - length: 1 + probability: 0.05 + - length: 2 + probability: 0.1 + - length: 3 + probability: 0.2 + - length: 4 + probability: 0.5 + - length: 5 + probability: 0.1 + - length: 6 + probability: 0.05 + +units: + mieszkanie: &mieszkanie + canonical: mieszkanie + abbreviated: m + sample: true + canonical_probability: 0.05 + abbreviated_probability: 0.9 + sample_probability: 0.05 + numeric: + direction: left + pokoj: &pokoj + canonical: pokój + abbreviated: pok + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.5 + sample_probability: 0.1 + numeric: + direction: left + + alphanumeric: &unit_alphanumeric + default: *mieszkanie + probability: 0.9 + alternatives: + - alternative: *pokoj + probability: 0.1 + numeric_probability: 0.9 # e.g. m. 1 + numeric_plus_alpha_probability: 0.03 # e.g. 1A + alpha_plus_numeric_probability: 0.03 # e.g. A1 + alpha_probability: 0.04 # e.g. m. A + + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + # If there are 10 floors, create unit numbers like #301 or #1032 + use_floor_probability: 0.01 + + zones: + commercial: &commercial_unit_types + default: *pokoj + numeric_probability: 0.95 # e.g. pokój 1 + numeric_plus_alpha_probability: 0.01 # e.g. pokój 1A + alpha_plus_numeric_probability: 0.01 # e.g. pokój A1 + alpha_probability: 0.03 # e.g. pokój A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + university: *commercial_unit_types diff --git a/resources/dictionaries/pl/ambiguous_expansions.txt b/resources/dictionaries/pl/ambiguous_expansions.txt index 4acdf586..72a9695d 100644 --- a/resources/dictionaries/pl/ambiguous_expansions.txt +++ b/resources/dictionaries/pl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +d g k m diff --git a/resources/dictionaries/pl/cross_streets.txt b/resources/dictionaries/pl/cross_streets.txt new file mode 100644 index 00000000..030bd3e3 --- /dev/null +++ b/resources/dictionaries/pl/cross_streets.txt @@ -0,0 +1,5 @@ +i +na rogu +pomiędzy|pomiedzy +rogu +w \ No newline at end of file diff --git a/resources/dictionaries/pl/directionals.txt b/resources/dictionaries/pl/directionals.txt index 36922d69..acae9bd0 100644 --- a/resources/dictionaries/pl/directionals.txt +++ b/resources/dictionaries/pl/directionals.txt @@ -1,8 +1,8 @@ północ|polnoc|płn|pln północny wschód|polnocny wschod|pn.-wsch.|pn-wsch|pn wsch|płn.-wsch.|płn-wsch|płn wsch|pln.-wsch.|pln-wsch|pln wsch północny zachód|polnocny zachod|pn.-zach.|pn-zach|pn zach|płn.-zach.|płn-zach|płn zach|pln.-zach.|pln-zach|pln zach -wschód|wschod +wschód|wschod|w|wsch południe|poludnie|pd|płd|pld południowy wschód|poludniowy wschod|pd.-wsch.|pd-wsch|pd wsch|płd.-wsch.|płd-wsch|płd wsch|pld.-wsch.|pld-wsch|pld wsch południowy zachód|poludniowy zachod|pd.-zach.|pd-zach|pd zach|płd.-zach.|płd-zach|płd zach|pld.-zach.|pld-zach|pld zach -zachód|zachod \ No newline at end of file +zachód|zachod|z|zach \ No newline at end of file diff --git a/resources/dictionaries/pl/entrances.txt b/resources/dictionaries/pl/entrances.txt new file mode 100644 index 00000000..80b54807 --- /dev/null +++ b/resources/dictionaries/pl/entrances.txt @@ -0,0 +1 @@ +wejście|wejscie \ No newline at end of file diff --git a/resources/dictionaries/pl/house_numbers.txt b/resources/dictionaries/pl/house_numbers.txt new file mode 100644 index 00000000..b3a0c117 --- /dev/null +++ b/resources/dictionaries/pl/house_numbers.txt @@ -0,0 +1 @@ +dom|d \ No newline at end of file diff --git a/resources/dictionaries/pl/level_types_basement.txt b/resources/dictionaries/pl/level_types_basement.txt new file mode 100644 index 00000000..21a97510 --- /dev/null +++ b/resources/dictionaries/pl/level_types_basement.txt @@ -0,0 +1 @@ +suterena|s \ No newline at end of file diff --git a/resources/dictionaries/pl/level_types_standalone.txt b/resources/dictionaries/pl/level_types_standalone.txt new file mode 100644 index 00000000..6942d135 --- /dev/null +++ b/resources/dictionaries/pl/level_types_standalone.txt @@ -0,0 +1 @@ +parter \ No newline at end of file diff --git a/resources/dictionaries/pl/near.txt b/resources/dictionaries/pl/near.txt new file mode 100644 index 00000000..302fd0cd --- /dev/null +++ b/resources/dictionaries/pl/near.txt @@ -0,0 +1,11 @@ +blisko +koło|kolo|k +niedaleko +obok +przy +w +w pobliżu|w poblizu +w pobliżu mnie|w poblizu mnie +w pobliżu tutaj|w poblizu tutaj +wokół tutaj|wokol tutaj +we \ No newline at end of file diff --git a/resources/dictionaries/pl/number.txt b/resources/dictionaries/pl/number.txt new file mode 100644 index 00000000..0c43ae06 --- /dev/null +++ b/resources/dictionaries/pl/number.txt @@ -0,0 +1 @@ +numer|nr|#|№ \ No newline at end of file diff --git a/resources/dictionaries/pl/staircases.txt b/resources/dictionaries/pl/staircases.txt new file mode 100644 index 00000000..bd4a978d --- /dev/null +++ b/resources/dictionaries/pl/staircases.txt @@ -0,0 +1 @@ +schody \ No newline at end of file diff --git a/resources/dictionaries/pl/stopwords.txt b/resources/dictionaries/pl/stopwords.txt index e2d67453..7d40e787 100644 --- a/resources/dictionaries/pl/stopwords.txt +++ b/resources/dictionaries/pl/stopwords.txt @@ -1,3 +1,3 @@ i na -koło|k \ No newline at end of file +w \ No newline at end of file diff --git a/resources/dictionaries/pl/unit_directions.txt b/resources/dictionaries/pl/unit_directions.txt new file mode 100644 index 00000000..03826d6d --- /dev/null +++ b/resources/dictionaries/pl/unit_directions.txt @@ -0,0 +1,2 @@ +lewo +prawo \ No newline at end of file