From 11c656478378c410fe0b90f7468ba2f86c3002ac Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 26 Jun 2016 01:24:00 -0400 Subject: [PATCH] [addresses] Russian address config --- resources/addresses/ru.yaml | 957 ++++++++++++++++++++++++++++ scripts/geodata/addresses/config.py | 2 +- 2 files changed, 958 insertions(+), 1 deletion(-) create mode 100644 resources/addresses/ru.yaml diff --git a/resources/addresses/ru.yaml b/resources/addresses/ru.yaml new file mode 100644 index 00000000..32315d49 --- /dev/null +++ b/resources/addresses/ru.yaml @@ -0,0 +1,957 @@ +# ru.yaml +# ------- +# Russian language specification + +alphabet: абвгдежзийклмнопрстуфхцчшщъыьэюя +alphabet_probability: 0.7 + +components: + level: + null_probability: 0.95 + alphanumeric_probability: 0.04 + standalone_probability: 0.01 + + staircase: + null_probability: 0.99 + alphanumeric_probability: 0.01 + + entrance: + null_probability: 0.999 + alphanumeric_probability: 0.001 + + unit: + null_probability: 0.6 + alphanumeric_probability: 0.4 + + +numbers: + default: &nomer + canonical: номер + abbreviated: № + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + probability: 0.95 + alternatives: + - alternative: &nomer_latin + canonical: nomer + abbreviated: "no" + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + numeric: + direction: left + probability: 0.05 + + +house_number: + dom: &dom + canonical: дом + abbreviated: д + sample: true + canonical_probability: 0.8 + abbreviated_probability: 0.1 + sample_probability: 0.1 + numeric: + direction: left + dom_latin: &dom_latin + canonical: dom + abbreviated: d + sample: true + canonical_probability: 0.8 + abbreviated_probability: 0.1 + sample_probability: 0.1 + numeric: + direction: left + alphanumeric: + default: *dom + probability: 0.95 + alternatives: + - alternative: *dom_latin + probability: 0.05 + + # Very common in Russian to 
write dom/d + alphanumeric_phrase_probability: 0.6 + +and: + default: &i + canonical: и + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.9 + alternatives: + - alternative: &i_latin + canonical: i + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.1 + + + +cross_streets: + and: *i + and: *i_latin + corner: &ugol + canonical: угол + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + ugol_latin: &ugol_latin + canonical: ugol + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + uglu: &uglu + canonical: углу + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + uglu_latin: &uglu_latin + canonical: uglu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + na_uglu: &na_uglu + canonical: на углу + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + na_uglu_latin: &na_uglu_latin + canonical: na uglu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + intersection: + default: *i + probability: 0.65 + alternatives: + - alternative: *i_latin + probability: 0.05 + - alternative: *ugol + probability: 0.075 + - alternative: *ugol_latin + probability: 0.075 + - alternative: *uglu + probability: 0.05 + - alternative: *uglu_latin + probability: 0.05 + - alternative: *na_uglu + probability: 0.025 + - alternative: *na_uglu_latin + probability: 0.025 + mezhdu: &mezhdu + canonical: между + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probability: 0.5 + mezhdu_latin: &mezhdu_latin + canonical: mezhdu + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + parentheses_probability: 0.5 + between: + default: *mezhdu + probability: 0.9 + alternatives: + - alternative: *mezhdu_latin + probability: 0.1 + +levels: + etazh: &etazh + canonical: этаж + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + numeric: + direction: left + 
direction_probability: 0.9 + ordinal: + direction: right + numeric_probability: 0.4 + ordinal_probability: 0.6 + etazh_latin: &etazh_latin + canonical: etazh + sample: true + canonical_probability: 0.7 + sample_probability: 0.3 + numeric: + direction: left + direction_probability: 0.9 + ordinal: + direction: right + numeric_probability: 0.4 + ordinal_probability: 0.6 + tsokolnyy_etazh: &tsokolnyy_etazh + canonical: цокольный этаж + abbreviated: цок эт + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + tsokolnyy_etazh_latin: &tsokolnyy_etazh_latin + canonical: tsokol'nyy etazh + abbreviated: tsok et + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.4 + sample_probability: 0.2 + podval: &podval + canonical: подвал + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + numeric_affix: + affix: п + direction: left + ordinal: + direction: right + ordinal: + direction: right + number_abs_value: true + number_min_abs_value: 2 + # Basement 2 == Sub-basement 1 + number_subtract_abs_value: 1 + standalone_probability: 0.985 + numeric_probability: 0.005 + numeric_affix_probability: 0.005 + ordinal_probability: 0.005 + podval_latin: &podval_latin + canonical: podval + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + direction_probability: 0.9 + numeric_affix: + affix: p + direction: left + ordinal: + direction: right + ordinal: + direction: right + number_abs_value: true + number_min_abs_value: 2 + # Basement 2 == Sub-basement 1 + number_subtract_abs_value: 1 + standalone_probability: 0.985 + numeric_probability: 0.005 + numeric_affix_probability: 0.005 + ordinal_probability: 0.005 + + aliases: + "<-1": + default: *podval + probability: 0.9 + alternatives: + - alternative: *podval_latin + probability: 0.1 + "-1": &ground_floor + default: *tsokolnyy_etazh + probability: 0.89 + 
alternatives: + - alternative: *tsokolnyy_etazh_latin + probability: 0.01 + - alternative: *etazh + probability: 0.09 + - alternative: *etazh_latin + probability: 0.01 + "0": *ground_floor + + numbering_starts_at: 0 + + alphanumeric: + default: *etazh + probability: 0.9 + alternatives: + - alternative: *etazh_latin + probability: 0.1 + numeric_probability: 0.99 # With this probability, pick an integer + alpha_probability: 0.0098 # With this probability, pick a letter e.g. A + numeric_plus_alpha_probability: 0.0001 # e.g. 2A + alpha_plus_numeric_probability: 0.0001 # e.g. A2 + +categories: + near: + default: + canonical: вблизи + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.74 + alternatives: + - alternative: + canonical: vblizi + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: близ + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: bliz + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: около + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: okolo + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: у + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: u + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: возле + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: vozle + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: рядом с + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - 
alternative: + canonical: ryadom s + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + + nearby: + default: + canonical: поблизости + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.64 + alternatives: + - alternative: + canonical: poblizosti + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: рядом здесь + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.19 + - alternative: + canonical: ryadom zdes' + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: здесь + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.09 + - alternative: + canonical: zdes' + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + - alternative: + canonical: рядом + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.04 + - alternative: + canonical: ryadom + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + near_me: + default: + canonical: рядом со мной + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.99 + alternatives: + - alternative: + canonical: ryadom so mnoy + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + probability: 0.01 + in: + default: + canonical: в + probability: 0.99 + alternatives: + - alternative: + canonical: v + probability: 0.01 + + # Probabilities of each phrase + near_probability: 0.35 + nearby_probability: 0.2 + near_me_probability: 0.1 + in_probability: 0.35 + +directions: + pravo: &pravo + canonical: право + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + pravo_latin: &pravo_latin + canonical: pravo + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + 
numeric: + direction: right + levo: &levo + canonical: лево + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + levo_latin: &levo_latin + canonical: levo + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: right + alternatives: + - alternative: *pravo + probability: 0.49 + - alternative: *pravo_latin + probability: 0.01 + - alternative: *levo + probability: 0.49 + - alternative: *levo_latin + probability: 0.01 + + + +cardinal_directions: + vostok: &vostok + canonical: восток + abbreviated: в + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: в + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + vostok_latin: &vostok_latin + canonical: vostok + abbreviated: v + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: v + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + zapad: &zapad + canonical: запад + abbreviated: з + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: з + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + zapad_latin: &zapad_latin + canonical: zapad + abbreviated: z + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: z + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + sever: &sever + canonical: север + abbreviated: с + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + numeric_affix: + affix: с + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + sever_latin: &sever_latin + canonical: sever + abbreviated: s + canonical_probability: 0.95 + abbreviated_probability: 0.05 + numeric: + direction: right + 
numeric_affix: + affix: s + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + yug: &yug + canonical: Юг + abbreviated: Ю + sample: true + canonical_probability: 0.75 + abbreviated_probability: 0.1 + sample_probability: 0.15 + numeric: + direction: right + numeric_affix: + affix: Ю + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + yug_latin: &yug_latin + canonical: yug + abbreviated: y + sample: true + canonical_probability: 0.75 + abbreviated_probability: 0.1 + sample_probability: 0.15 + numeric: + direction: right + numeric_affix: + affix: y + direction: right + numeric_probability: 0.5 + numeric_affix_probability: 0.5 + + alternatives: + - alternative: *sever + probability: 0.24 + - alternative: *sever_latin + probability: 0.01 + - alternative: *vostok + probability: 0.24 + - alternative: *vostok_latin + probability: 0.01 + - alternative: *yug + probability: 0.24 + - alternative: *yug_latin + probability: 0.01 + - alternative: *zapad + probability: 0.24 + - alternative: *zapad_latin + probability: 0.01 + +entrances: + vkhod: &vkhod + canonical: вход + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + vkhod_latin: &vkhod_latin + canonical: vkhod + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + # вход 1, вход A, etc. + alphanumeric: + default: *vkhod + probability: 0.99 + alternatives: + - alternative: *vkhod_latin + probability: 0.01 + numeric_probability: 0.1 # e.g. вход 1 + alpha_probability: 0.85 # e.g. вход A + numeric_plus_alpha_probability: 0.025 # e.g. 1A + alpha_plus_numeric_probability: 0.025 # e.g. 
A1 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + +staircases: + lestnitsa: &lestnitsa + canonical: лестница + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + lestnitsa_latin: &lestnitsa_latin + canonical: lestnitsa + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + + alphanumeric: &staircase_alphanumeric + default: *lestnitsa + probability: 0.99 + alternatives: + - alternative: *lestnitsa_latin + probability: 0.01 + numeric_probability: 0.75 + alpha_probability: 0.2 + numeric_plus_alpha_probability: 0.025 + alpha_plus_numeric_probability: 0.025 + + alpha_plus_numeric: + whitespace_probability: 0.1 + + numeric_plus_alpha: + whitespace_probability: 0.1 + + directional: + direction: left + direction_probability: 0.85 + modifier: + alternatives: + - alternative: *sever + - alternative: *vostok + - alternative: *yug + - alternative: *zapad + +po_boxes: + abonementnyy_pochtovyy_yashchik: &abonementnyy_pochtovyy_yashchik + canonical: абонементный почтовый ящик + abbreviated: а/я + sample: true + canonical_probability: 0.2 + abbreviated_probability: 0.7 + sample_probability: 0.1 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + abonementnyy_pochtovyy_yashchik_latin: &abonementnyy_pochtovyy_yashchik_latin + canonical: abonementnyy pochtovyy yashchik + abbreviated: a/ya + sample: true + canonical_probability: 0.2 + abbreviated_probability: 0.7 + sample_probability: 0.1 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + pochtovyy_yashchik: &pochtovyy_yashchik + canonical: почтовый ящик + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + pochtovyy_yashchik_latin: &pochtovyy_yashchik_latin + canonical: 
pochtovyy yashchik + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + + pochtovyy_abonentskiy_yashchik: &pochtovyy_abonentskiy_yashchik + canonical: почтовый абонентский ящик + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + + pochtovyy_abonentskiy_yashchik_latin: &pochtovyy_abonentskiy_yashchik_latin + canonical: pochtovyy abonentskiy yashchik + sample: true + canonical_probability: 0.8 + sample_probability: 0.2 + numeric: + direction: left + add_number_phrase: true + add_number_phrase_probability: 0.2 + + alphanumeric: + default: *abonementnyy_pochtovyy_yashchik + probability: 0.79 + alternatives: + - alternative: *abonementnyy_pochtovyy_yashchik_latin + probability: 0.01 + - alternative: *pochtovyy_yashchik + probability: 0.14 + - alternative: *pochtovyy_yashchik_latin + probability: 0.01 + - alternative: *pochtovyy_abonentskiy_yashchik + probability: 0.04 + - alternative: *pochtovyy_abonentskiy_yashchik_latin + probability: 0.01 + numeric_probability: 0.9 # а/я 123 + alpha_probability: 0.05 # а/я А + numeric_plus_alpha_probability: 0.04 # а/я 123А + alpha_plus_numeric_probability: 0.01 # а/я А123 + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + digits: + - length: 1 + probability: 0.05 + - length: 2 + probability: 0.1 + - length: 3 + probability: 0.2 + - length: 4 + probability: 0.5 + - length: 5 + probability: 0.1 + - length: 6 + probability: 0.05 + +units: + kvartira: &kvartira + canonical: квартира + abbreviated: кв + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.6 + sample_probability: 0.1 + numeric: + direction: left + kvartira_latin: &kvartira_latin + canonical: kvartira + abbreviated: kv + sample: true + canonical_probability: 0.3 + 
abbreviated_probability: 0.6 + sample_probability: 0.1 + numeric: + direction: left + + kabinet: &kabinet + canonical: кабинет + abbreviated: каб + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + kabinet_latin: &kabinet_latin + canonical: kabinet + abbreviated: kab + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + + litera: &litera + canonical: литера + abbreviated: лит + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + litera_latin: &litera_latin + canonical: litera + abbreviated: lit + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + + ofis: &ofis + canonical: офис + abbreviated: оф + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.5 + sample_probability: 0.1 + numeric: + direction: left + ofis_latin: &ofis_latin + canonical: ofis + abbreviated: of + sample: true + canonical_probability: 0.4 + abbreviated_probability: 0.5 + sample_probability: 0.1 + numeric: + direction: left + + pomeshhenie: &pomeshhenie + canonical: помещение + abbreviated: пом + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + pomeshhenie_latin: &pomeshhenie_latin + canonical: pomeshhenie + abbreviated: pom + sample: true + canonical_probability: 0.3 + abbreviated_probability: 0.4 + sample_probability: 0.3 + numeric: + direction: left + + alphanumeric: &unit_alphanumeric + default: *kvartira + probability: 0.89 + alternatives: + - alternative: *kvartira_latin + probability: 0.01 + - alternative: *pomeshhenie + probability: 0.09 + - alternative: *pomeshhenie_latin + probability: 0.01 + + numeric_probability: 0.9 # e.g. кв 1 + numeric_plus_alpha_probability: 0.03 # e.g. 
1А + alpha_plus_numeric_probability: 0.03 # e.g. А1 + alpha_probability: 0.04 # e.g. кв А + + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + # If there are 10 floors, create unit numbers like #301 or #1032 + use_floor_probability: 0.1 + + alpha: + default: *kvartira + probability: 0.79 + alternatives: + - alternative: *kvartira_latin + probability: 0.01 + - alternative: *pomeshhenie + probability: 0.09 + - alternative: *pomeshhenie_latin + probability: 0.01 + - alternative: *litera + probability: 0.09 + - alternative: *litera_latin + probability: 0.01 + + + zones: + commercial: + default: *kabinet + probability: 0.59 + alternatives: + - alternative: *kabinet_latin + probability: 0.01 + - alternative: *ofis + probability: 0.29 + - alternative: *ofis_latin + probability: 0.01 + - alternative: *pomeshhenie + probability: 0.09 + - alternative: *pomeshhenie_latin + probability: 0.01 + numeric_probability: 0.95 # e.g. kabinet 1 + numeric_plus_alpha_probability: 0.01 # e.g. kabinet 1A + alpha_plus_numeric_probability: 0.01 # e.g. kab A1 + alpha_probability: 0.03 # e.g. kab A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 + + university: + default: *pomeshhenie + probability: 0.99 + alternatives: + - alternative: *pomeshhenie_latin + probability: 0.01 + numeric_probability: 0.95 # e.g. kabinet 1 + numeric_plus_alpha_probability: 0.01 # e.g. kabinet 1A + alpha_plus_numeric_probability: 0.01 # e.g. kab A1 + alpha_probability: 0.03 # e.g. 
kab A + alpha_plus_numeric: + whitespace_probability: 0.1 + numeric_plus_alpha: + whitespace_probability: 0.1 diff --git a/scripts/geodata/addresses/config.py b/scripts/geodata/addresses/config.py index 104ed473..37afe41b 100644 --- a/scripts/geodata/addresses/config.py +++ b/scripts/geodata/addresses/config.py @@ -26,7 +26,7 @@ class AddressConfig(object): self.cache = {} for filename in os.listdir(config_dir): - if filename not in ('en.yaml', 'es.yaml', 'ca.yaml', 'fr.yaml', 'de.yaml', 'nl.yaml', 'da.yaml', 'nb.yaml', 'sv.yaml', 'pt.yaml', 'pl.yaml'): + if filename not in ('en.yaml', 'es.yaml', 'ca.yaml', 'fr.yaml', 'de.yaml', 'nl.yaml', 'da.yaml', 'nb.yaml', 'sv.yaml', 'pt.yaml', 'pl.yaml', 'ru.yaml'): continue config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))