[addresses] Russian address config

This commit is contained in:
Al
2016-06-26 01:24:00 -04:00
parent aac2042ba9
commit ee1326b15c
2 changed files with 958 additions and 1 deletions

957
resources/addresses/ru.yaml Normal file
View File

@@ -0,0 +1,957 @@
# ru.yaml
# -------
# Russian language specification
alphabet: абвгдежзийклмнопрстуфхцчшщъыьэюя
alphabet_probability: 0.7
components:
level:
null_probability: 0.95
alphanumeric_probability: 0.04
standalone_probability: 0.01
staircase:
null_probability: 0.99
alphanumeric_probability: 0.01
entrance:
null_probability: 0.999
alphanumeric_probability: 0.001
unit:
null_probability: 0.6
alphanumeric_probability: 0.4
numbers:
default: &nomer
canonical: номер
abbreviated: "№"
sample: true
canonical_probability: 0.4
abbreviated_probability: 0.4
sample_probability: 0.2
numeric:
direction: left
probability: 0.95
alternatives:
- alternative: &nomer_latin
canonical: nomer
abbreviated: "no"
sample: true
canonical_probability: 0.4
abbreviated_probability: 0.4
sample_probability: 0.2
numeric:
direction: left
probability: 0.05
house_number:
dom: &dom
canonical: дом
abbreviated: д
sample: true
canonical_probability: 0.8
abbreviated_probability: 0.1
sample_probability: 0.1
numeric:
direction: left
dom_latin: &dom_latin
canonical: dom
abbreviated: d
sample: true
canonical_probability: 0.8
abbreviated_probability: 0.1
sample_probability: 0.1
numeric:
direction: left
alphanumeric:
default: *dom
probability: 0.95
alternatives:
- alternative: *dom_latin
probability: 0.05
# In Russian addresses it is very common to write дом/д (dom/d) before the house number
alphanumeric_phrase_probability: 0.6
and:
default: &i
canonical: и
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.9
alternatives:
- alternative: &i_latin
canonical: i
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.1
cross_streets:
and: *i
and: *i_latin
corner: &ugol
canonical: угол
sample: true
canonical_probability: 0.8
sample_probability: 0.2
ugol_latin: &ugol_latin
canonical: ugol
sample: true
canonical_probability: 0.8
sample_probability: 0.2
uglu: &uglu
canonical: углу
sample: true
canonical_probability: 0.8
sample_probability: 0.2
uglu_latin: &uglu_latin
canonical: uglu
sample: true
canonical_probability: 0.8
sample_probability: 0.2
na_uglu: &na_uglu
canonical: на углу
sample: true
canonical_probability: 0.8
sample_probability: 0.2
na_uglu_latin: &na_uglu_latin
canonical: na uglu
sample: true
canonical_probability: 0.8
sample_probability: 0.2
intersection:
default: *i
probability: 0.65
alternatives:
- alternative: *i_latin
probability: 0.05
- alternative: *ugol
probability: 0.075
- alternative: *ugol_latin
probability: 0.075
- alternative: *uglu
probability: 0.05
- alternative: *uglu_latin
probability: 0.05
- alternative: *na_uglu
probability: 0.025
- alternative: *na_uglu_latin
probability: 0.025
mezhdu: &mezhdu
canonical: между
sample: true
canonical_probability: 0.8
sample_probability: 0.2
parentheses_probability: 0.5
mezhdu_latin: &mezhdu_latin
canonical: mezhdu
sample: true
canonical_probability: 0.8
sample_probability: 0.2
parentheses_probability: 0.5
between:
default: *mezhdu
probability: 0.9
alternatives:
- alternative: *mezhdu_latin
probability: 0.1
levels:
etazh: &etazh
canonical: этаж
sample: true
canonical_probability: 0.7
sample_probability: 0.3
numeric:
direction: left
direction_probability: 0.9
ordinal:
direction: right
numeric_probability: 0.4
ordinal_probability: 0.6
etazh_latin: &etazh_latin
canonical: etazh
sample: true
canonical_probability: 0.7
sample_probability: 0.3
numeric:
direction: left
direction_probability: 0.9
ordinal:
direction: right
numeric_probability: 0.4
ordinal_probability: 0.6
tsokolnyy_etazh: &tsokolnyy_etazh
canonical: цокольный этаж
abbreviated: цок эт
sample: true
canonical_probability: 0.4
abbreviated_probability: 0.4
sample_probability: 0.2
tsokolnyy_etazh_latin: &tsokolnyy_etazh_latin
canonical: tsokol'nyy etazh
abbreviated: tsok et
sample: true
canonical_probability: 0.4
abbreviated_probability: 0.4
sample_probability: 0.2
podval: &podval
canonical: подвал
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
direction_probability: 0.9
numeric_affix:
affix: п
direction: left
ordinal:
direction: right
ordinal:
direction: right
number_abs_value: true
number_min_abs_value: 2
# Basement 2 == Sub-basement 1
number_subtract_abs_value: 1
standalone_probability: 0.985
numeric_probability: 0.005
numeric_affix_probability: 0.005
ordinal_probability: 0.005
podval_latin: &podval_latin
canonical: podval
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
direction_probability: 0.9
numeric_affix:
affix: p
direction: left
ordinal:
direction: right
ordinal:
direction: right
number_abs_value: true
number_min_abs_value: 2
# Basement 2 == Sub-basement 1
number_subtract_abs_value: 1
standalone_probability: 0.985
numeric_probability: 0.005
numeric_affix_probability: 0.005
ordinal_probability: 0.005
aliases:
"<-1":
default: *podval
probability: 0.9
alternatives:
- alternative: *podval_latin
probability: 0.1
"-1": &ground_floor
default: *tsokolnyy_etazh
probability: 0.89
alternatives:
- alternative: *tsokolnyy_etazh_latin
probability: 0.01
- alternative: *etazh
probability: 0.09
- alternative: *etazh_latin
probability: 0.01
"0": *ground_floor
numbering_starts_at: 0
alphanumeric:
default: *etazh
probability: 0.9
alternatives:
- alternative: *etazh_latin
probability: 0.1
numeric_probability: 0.99 # With this probability, pick an integer
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
alpha_plus_numeric_probability: 0.0001 # e.g. A2
categories:
near:
default:
canonical: вблизи
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.74
alternatives:
- alternative:
canonical: vblizi
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: близ
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.04
- alternative:
canonical: bliz
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: около
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.04
- alternative:
canonical: okolo
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: у
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.04
- alternative:
canonical: u
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: возле
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.04
- alternative:
canonical: vozle
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: рядом с
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.04
- alternative:
canonical: ryadom s
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
nearby:
default:
canonical: поблизости
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.64
alternatives:
- alternative:
canonical: poblizosti
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: рядом здесь
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.19
- alternative:
canonical: ryadom zdes'
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: здесь
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.09
- alternative:
canonical: zdes'
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: рядом
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.04
- alternative:
canonical: ryadom
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
near_me:
default:
canonical: рядом со мной
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.99
alternatives:
- alternative:
canonical: ryadom so mnoy
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
in:
default:
canonical: в
probability: 0.99
alternatives:
- alternative:
canonical: v
probability: 0.01
# Probabilities of each phrase
near_probability: 0.35
nearby_probability: 0.2
near_me_probability: 0.1
in_probability: 0.35
directions:
pravo: &pravo
canonical: право
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: right
pravo_latin: &pravo_latin
canonical: pravo
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: right
levo: &levo
canonical: лево
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: right
levo_latin: &levo_latin
canonical: levo
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: right
alternatives:
- alternative: *pravo
probability: 0.49
- alternative: *pravo_latin
probability: 0.01
- alternative: *levo
probability: 0.49
- alternative: *levo_latin
probability: 0.01
cardinal_directions:
vostok: &vostok
canonical: восток
abbreviated: в
canonical_probability: 0.95
abbreviated_probability: 0.05
numeric:
direction: right
numeric_affix:
affix: в
direction: right
numeric_probability: 0.5
numeric_affix_probability: 0.5
vostok_latin: &vostok_latin
canonical: vostok
abbreviated: v
canonical_probability: 0.95
abbreviated_probability: 0.05
numeric:
direction: right
numeric_affix:
affix: v
direction: right
numeric_probability: 0.5
numeric_affix_probability: 0.5
zapad: &zapad
canonical: запад
abbreviated: з
canonical_probability: 0.95
abbreviated_probability: 0.05
numeric:
direction: right
numeric_affix:
affix: з
direction: right
numeric_probability: 0.5
numeric_affix_probability: 0.5
zapad_latin: &zapad_latin
canonical: zapad
abbreviated: z
canonical_probability: 0.95
abbreviated_probability: 0.05
numeric:
direction: right
numeric_affix:
affix: z
direction: right
numeric_probability: 0.5
numeric_affix_probability: 0.5
sever: &sever
canonical: север
abbreviated: с
canonical_probability: 0.95
abbreviated_probability: 0.05
numeric:
direction: right
numeric_affix:
affix: с
direction: right
numeric_probability: 0.5
numeric_affix_probability: 0.5
sever_latin: &sever_latin
canonical: sever
abbreviated: s
canonical_probability: 0.95
abbreviated_probability: 0.05
numeric:
direction: right
numeric_affix:
affix: s
direction: right
numeric_probability: 0.5
numeric_affix_probability: 0.5
yug: &yug
canonical: Юг
abbreviated: Ю
sample: true
canonical_probability: 0.75
abbreviated_probability: 0.1
sample_probability: 0.15
numeric:
direction: right
numeric_affix:
affix: Ю
direction: right
numeric_probability: 0.5
numeric_affix_probability: 0.5
yug_latin: &yug_latin
canonical: yug
abbreviated: y
sample: true
canonical_probability: 0.75
abbreviated_probability: 0.1
sample_probability: 0.15
numeric:
direction: right
numeric_affix:
affix: y
direction: right
numeric_probability: 0.5
numeric_affix_probability: 0.5
alternatives:
- alternative: *sever
probability: 0.24
- alternative: *sever_latin
probability: 0.01
- alternative: *vostok
probability: 0.24
- alternative: *vostok_latin
probability: 0.01
- alternative: *yug
probability: 0.24
- alternative: *yug_latin
probability: 0.01
- alternative: *zapad
probability: 0.24
- alternative: *zapad_latin
probability: 0.01
entrances:
vkhod: &vkhod
canonical: вход
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
vkhod_latin: &vkhod_latin
canonical: vkhod
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
# вход 1, вход A, etc.
alphanumeric:
default: *vkhod
probability: 0.99
alternatives:
- alternative: *vkhod_latin
probability: 0.01
numeric_probability: 0.1 # e.g. вход 1
alpha_probability: 0.85 # e.g. вход A
numeric_plus_alpha_probability: 0.025 # e.g. 1A
alpha_plus_numeric_probability: 0.025 # e.g. A1
alpha_plus_numeric:
whitespace_probability: 0.1
numeric_plus_alpha:
whitespace_probability: 0.1
staircases:
lestnitsa: &lestnitsa
canonical: лестница
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
lestnitsa_latin: &lestnitsa_latin
canonical: lestnitsa
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
alphanumeric: &staircase_alphanumeric
default: *lestnitsa
probability: 0.99
alternatives:
- alternative: *lestnitsa_latin
probability: 0.01
numeric_probability: 0.75
alpha_probability: 0.2
numeric_plus_alpha_probability: 0.025
alpha_plus_numeric_probability: 0.025
alpha_plus_numeric:
whitespace_probability: 0.1
numeric_plus_alpha:
whitespace_probability: 0.1
directional:
direction: left
direction_probability: 0.85
modifier:
alternatives:
- alternative: *sever
- alternative: *vostok
- alternative: *yug
- alternative: *zapad
po_boxes:
abonementnyy_pochtovyy_yashchik: &abonementnyy_pochtovyy_yashchik
canonical: абонементный почтовый ящик
abbreviated: а/я
sample: true
canonical_probability: 0.2
abbreviated_probability: 0.7
sample_probability: 0.1
numeric:
direction: left
add_number_phrase: true
add_number_phrase_probability: 0.2
abonementnyy_pochtovyy_yashchik_latin: &abonementnyy_pochtovyy_yashchik_latin
canonical: abonementnyy pochtovyy yashchik
abbreviated: a/ya
sample: true
canonical_probability: 0.2
abbreviated_probability: 0.7
sample_probability: 0.1
numeric:
direction: left
add_number_phrase: true
add_number_phrase_probability: 0.2
pochtovyy_yashchik: &pochtovyy_yashchik
canonical: почтовый ящик
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
add_number_phrase: true
add_number_phrase_probability: 0.2
pochtovyy_yashchik_latin: &pochtovyy_yashchik_latin
canonical: pochtovyy yashchik
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
add_number_phrase: true
add_number_phrase_probability: 0.2
pochtovyy_abonentskiy_yashchik: &pochtovyy_abonentskiy_yashchik
canonical: почтовый абонентский ящик
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
add_number_phrase: true
add_number_phrase_probability: 0.2
pochtovyy_abonentskiy_yashchik_latin: &pochtovyy_abonentskiy_yashchik_latin
canonical: pochtovyy abonentskiy yashchik
sample: true
canonical_probability: 0.8
sample_probability: 0.2
numeric:
direction: left
add_number_phrase: true
add_number_phrase_probability: 0.2
alphanumeric:
default: *abonementnyy_pochtovyy_yashchik
probability: 0.79
alternatives:
- alternative: *abonementnyy_pochtovyy_yashchik_latin
probability: 0.01
- alternative: *pochtovyy_yashchik
probability: 0.14
- alternative: *pochtovyy_yashchik_latin
probability: 0.01
- alternative: *pochtovyy_abonentskiy_yashchik
probability: 0.04
- alternative: *pochtovyy_abonentskiy_yashchik_latin
probability: 0.01
numeric_probability: 0.9 # а/я 123
alpha_probability: 0.05 # а/я А
numeric_plus_alpha_probability: 0.04 # а/я 123А
alpha_plus_numeric_probability: 0.01 # а/я А123
alpha_plus_numeric:
whitespace_probability: 0.1
numeric_plus_alpha:
whitespace_probability: 0.1
digits:
- length: 1
probability: 0.05
- length: 2
probability: 0.1
- length: 3
probability: 0.2
- length: 4
probability: 0.5
- length: 5
probability: 0.1
- length: 6
probability: 0.05
units:
kvartira: &kvartira
canonical: квартира
abbreviated: кв
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.6
sample_probability: 0.1
numeric:
direction: left
kvartira_latin: &kvartira_latin
canonical: kvartira
abbreviated: kv
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.6
sample_probability: 0.1
numeric:
direction: left
kabinet: &kabinet
canonical: кабинет
abbreviated: каб
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.4
sample_probability: 0.3
numeric:
direction: left
kabinet_latin: &kabinet_latin
canonical: kabinet
abbreviated: kab
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.4
sample_probability: 0.3
numeric:
direction: left
litera: &litera
canonical: литера
abbreviated: лит
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.4
sample_probability: 0.3
numeric:
direction: left
litera_latin: &litera_latin
canonical: litera
abbreviated: lit
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.4
sample_probability: 0.3
numeric:
direction: left
ofis: &ofis
canonical: офис
abbreviated: оф
sample: true
canonical_probability: 0.4
abbreviated_probability: 0.5
sample_probability: 0.1
numeric:
direction: left
ofis_latin: &ofis_latin
canonical: ofis
abbreviated: of
sample: true
canonical_probability: 0.4
abbreviated_probability: 0.5
sample_probability: 0.1
numeric:
direction: left
pomeshhenie: &pomeshhenie
canonical: помещение
abbreviated: пом
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.4
sample_probability: 0.3
numeric:
direction: left
pomeshhenie_latin: &pomeshhenie_latin
canonical: pomeshhenie
abbreviated: pom
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.4
sample_probability: 0.3
numeric:
direction: left
alphanumeric: &unit_alphanumeric
default: *kvartira
probability: 0.89
alternatives:
- alternative: *kvartira_latin
probability: 0.01
- alternative: *pomeshhenie
probability: 0.09
- alternative: *pomeshhenie_latin
probability: 0.01
numeric_probability: 0.9 # e.g. кв 1
numeric_plus_alpha_probability: 0.03 # e.g. 1А
alpha_plus_numeric_probability: 0.03 # e.g. А1
alpha_probability: 0.04 # e.g. кв А
alpha_plus_numeric:
whitespace_probability: 0.1
numeric_plus_alpha:
whitespace_probability: 0.1
# If there are 10 floors, create unit numbers like #301 or #1032
use_floor_probability: 0.1
alpha:
default: *kvartira
probability: 0.79
alternatives:
- alternative: *kvartira_latin
probability: 0.01
- alternative: *pomeshhenie
probability: 0.09
- alternative: *pomeshhenie_latin
probability: 0.01
- alternative: *litera
probability: 0.09
- alternative: *litera_latin
probability: 0.01
zones:
commercial:
default: *kabinet
probability: 0.59
alternatives:
- alternative: *kabinet_latin
probability: 0.01
- alternative: *ofis
probability: 0.29
- alternative: *ofis_latin
probability: 0.01
- alternative: *pomeshhenie
probability: 0.09
- alternative: *pomeshhenie_latin
probability: 0.01
numeric_probability: 0.95 # e.g. kabinet 1
numeric_plus_alpha_probability: 0.01 # e.g. kabinet 1A
alpha_plus_numeric_probability: 0.01 # e.g. kab A1
alpha_probability: 0.03 # e.g. kab A
alpha_plus_numeric:
whitespace_probability: 0.1
numeric_plus_alpha:
whitespace_probability: 0.1
university:
default: *pomeshhenie
probability: 0.99
alternatives:
- alternative: *pomeshhenie_latin
probability: 0.01
numeric_probability: 0.95 # e.g. pomeshhenie 1
numeric_plus_alpha_probability: 0.01 # e.g. pomeshhenie 1A
alpha_plus_numeric_probability: 0.01 # e.g. pom A1
alpha_probability: 0.03 # e.g. pom A
alpha_plus_numeric:
whitespace_probability: 0.1
numeric_plus_alpha:
whitespace_probability: 0.1

View File

@@ -26,7 +26,7 @@ class AddressConfig(object):
self.cache = {}
for filename in os.listdir(config_dir):
if filename not in ('en.yaml', 'es.yaml', 'ca.yaml', 'fr.yaml', 'de.yaml', 'nl.yaml', 'da.yaml', 'nb.yaml', 'sv.yaml', 'pt.yaml', 'pl.yaml'):
if filename not in ('en.yaml', 'es.yaml', 'ca.yaml', 'fr.yaml', 'de.yaml', 'nl.yaml', 'da.yaml', 'nb.yaml', 'sv.yaml', 'pt.yaml', 'pl.yaml', 'ru.yaml'):
continue
config = yaml.load(open(os.path.join(ADDRESS_CONFIG_DIR, filename)))