[addresses] Adding digit spellout and the list form of field combinations to existing configs

This commit is contained in:
Al
2016-07-04 13:46:19 -04:00
parent 64f167f045
commit af11db1488
20 changed files with 353 additions and 124 deletions

View File

@@ -100,11 +100,15 @@ levels:
numeric:
direction: left
direction_probability: 0.9
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
add_number_phrase: true
add_number_phrase_probability: 0.1
ordinal:
direction: right
digits:
ascii_probability: 0.3
roman_numeral_probability: 0.7
add_number_phrase: true
add_number_phrase_probability: 0.1
@@ -120,9 +124,13 @@ levels:
numeric:
direction: left
direction_probability: 0.9
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
ordinal:
direction: right
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
numeric_probability: 0.4
ordinal_probability: 0.6
@@ -134,9 +142,13 @@ levels:
numeric:
direction: left
direction_probability: 0.9
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
ordinal:
direction: right
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
numeric_probability: 0.4
ordinal_probability: 0.6
@@ -163,6 +175,8 @@ levels:
# e.g. 1. podzemní podlaží
ordinal:
direction: right
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
standalone_probability: 0.985
number_abs_value: true

View File

@@ -21,7 +21,7 @@ components:
alphanumeric_probability: 0.25
combinations:
level_unit:
-
components:
- level
- unit
@@ -32,7 +32,7 @@ components:
- separator: " - "
probability: 0.1
probability: 0.005
entrance_unit:
-
components:
- entrance
- unit
@@ -121,6 +121,9 @@ levels:
direction_probability: 0.9
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.4
ordinal_probability: 0.6
etage: &etage
@@ -132,6 +135,9 @@ levels:
direction: right
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.4
ordinal_probability: 0.6
stuen: &stuen

View File

@@ -26,7 +26,7 @@ components:
combinations:
# e.g. 2/34, more common way to specify a unit number in German
# if unit exists in the first place
house_number_unit:
-
components:
- house_number
- unit
@@ -117,6 +117,9 @@ levels:
direction: right
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.3
numeric_affix_probability: 0.5
ordinal_probability: 0.2
@@ -129,6 +132,9 @@ levels:
direction: right
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.4
ordinal_probability: 0.6
stock: &stock
@@ -140,6 +146,9 @@ levels:
direction: right
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.1
ordinal_probability: 0.9
erdgeschoss: &erdgeschoss
@@ -613,29 +622,8 @@ countries:
# Combined apartment numbers are very common
combinations:
# e.g. Neubaugasse 55/5
house_number_unit:
probability: 0.7
separators:
- separator: /
probability: 0.98
- separator: "-"
probability: 0.02
# e.g. Neubaugasse 55/1/5
house_number_staircase_unit:
components:
- house_number
- staircase
- unit
label: house_number
separators:
- separator: /
probability: 0.98
- separator: "-"
probability: 0.02
probability: 0.8
# e.g. Neubaugasse 55/A/1/5
house_number_entrance_staircase_unit:
-
components:
- house_number
- entrance
@@ -648,6 +636,31 @@ countries:
- separator: "-"
probability: 0.02
probability: 0.9
# e.g. Neubaugasse 55/1/5
-
components:
- house_number
- staircase
- unit
label: house_number
separators:
- separator: /
probability: 0.98
- separator: "-"
probability: 0.02
probability: 0.8
# e.g. Neubaugasse 55/5
-
components:
- house_number
- unit
label: house_number
probability: 0.7
separators:
- separator: /
probability: 0.98
- separator: "-"
probability: 0.02
units:
top: &top

View File

@@ -65,7 +65,7 @@ components:
combinations:
# For unit types like 2/34 (more common in Canada and Australia)
house_number_unit:
-
components:
- house_number
- unit
@@ -163,6 +163,9 @@ levels:
# e.g. 1st Floor
ordinal:
direction: right # canonical or abbreviated form goes to the ordinal's right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
# Probabilities
numeric_probability: 0.75 # Use the simple number e.g. Floor 1 (or Floor No. 1)
numeric_affix_probability: 0.05 # Use the 2/F (less common)
@@ -1356,9 +1359,7 @@ countries:
ca:
components:
combinations:
house_number_unit:
probability: 0.0
unit_house_number:
-
components:
- unit
- house_number

View File

@@ -108,6 +108,9 @@ levels:
direction: right
direction_probability: 0.95 # Let it vary occasionally e.g. Piso 2o
standalone_probability: 0.2 # Let e.g. 5º be the entire floor string
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.6
numeric_affix_probability: 0.05
ordinal_probability: 0.35
@@ -954,6 +957,9 @@ countries:
ordinal:
direction: right
gender: f
digits:
ascii_probability: 0.8
spellout_probability: 0.2
canonical_probability: 0.6
abbreviated_probability: 0.2
sample_probability: 0.2

View File

@@ -21,7 +21,7 @@ components:
alphanumeric_probability: 0.25
combinations:
house_number_unit:
-
components:
- house_number
- unit

View File

@@ -21,7 +21,7 @@ components:
alphanumeric_probability: 0.25
combinations:
staircase_unit:
-
components:
- staircase
- unit
@@ -107,6 +107,9 @@ levels:
direction_probability: 0.9
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.4
ordinal_probability: 0.6

View File

@@ -21,7 +21,7 @@ components:
alphanumeric_probability: 0.2
combinations:
house_number_unit:
-
components:
- house_number
- unit
@@ -93,6 +93,9 @@ levels:
add_number_phrase_probability: 0.05
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.75
ordinal_probability: 0.25
niveau: &niveau
@@ -106,6 +109,9 @@ levels:
add_number_phrase_probability: 0.05
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.75
ordinal_probability: 0.25
bel_etage: &bel_etage
@@ -889,9 +895,7 @@ countries:
null_probability: 0.6
alphanumeric_probability: 0.4
combinations:
house_number_unit:
probability: 0.0
unit_house_number:
-
components:
- unit
- house_number

View File

@@ -13,17 +13,19 @@ components:
alphanumeric_probability: 0.25
combinations:
level_unit:
-
components:
- level
- unit
label: unit
separators:
- separator: "/"
probability: 0.95
probability: 0.55
- separator: " "
probability: 0.4
- separator: "-"
probability: 0.05
probability: 0.5
probability: 0.8
numbers:
@@ -100,6 +102,9 @@ levels:
direction_probability: 0.9
ordinal:
direction: right
digits:
ascii_probability: 0.2
roman_numeral_probability: 0.8
numeric_probability: 0.1
ordinal_probability: 0.9
foldszint: &foldszint
@@ -220,8 +225,8 @@ levels:
alphanumeric:
default: *emelet
roman_numeral_probability: 0.8 # With this probability, pick a Roman numeral
numeric_probability: 0.19 # With this probability, pick an integer
numeric_probability: 0.59 # With this probability, pick an integer
roman_numeral_probability: 0.4 # Pick a Roman numeral for the actual value
alpha_probability: 0.0098 # With this probability, pick a letter e.g. A
numeric_plus_alpha_probability: 0.0001 # e.g. 2A
alpha_plus_numeric_probability: 0.0001 # e.g. A2

View File

@@ -22,7 +22,7 @@ components:
alphanumeric_probability: 0.2
combinations:
house_number_unit:
-
components:
- house_number
- unit
@@ -87,9 +87,14 @@ levels:
direction_probability: 0.95
add_number_phrase: true
add_number_phrase_probability: 0.05
digits:
ascii_probability: 0.9
roman_numeral_probability: 0.1
ordinal:
direction: right
digits:
ascii_probability: 0.5
spellout_probability: 0.2
roman_numeral_probability: 0.3
numeric_probability: 0.55
ordinal_probability: 0.45
@@ -104,6 +109,8 @@ levels:
add_number_phrase_probability: 0.05
ordinal:
direction: right
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
numeric_probability: 0.75
ordinal_probability: 0.25

View File

@@ -24,7 +24,7 @@ components:
combinations:
# Unit is just appended onto the house number
house_number_unit:
-
components:
- house_number
- unit

View File

@@ -23,7 +23,7 @@ components:
combinations:
# Unit is just appended onto the house number
house_number_unit:
-
components:
- house_number
- unit

View File

@@ -22,7 +22,7 @@ components:
combinations:
# Bolignummer
level_unit:
-
components:
- level
- unit
@@ -110,6 +110,9 @@ levels:
direction_probability: 0.9
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.4
ordinal_probability: 0.6
hovedetasje: &hovedetasje

View File

@@ -22,7 +22,7 @@ components:
alphanumeric_probability: 0.2
combinations:
house_number_unit:
-
components:
- house_number
- unit
@@ -72,10 +72,17 @@ levels:
verdieping: &verdieping
canonical: verdieping
sample: true
canonical_probability: 0.8
sample_probability: 0.2
canonical_probability: 0.9
sample_probability: 0.1
numeric:
direction: left
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.7
ordinal_probability: 0.3
etage: &etage
canonical: etage
abbreviated: et
@@ -85,12 +92,19 @@ levels:
sample_probability: 0.2
numeric:
direction: left
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.7
ordinal_probability: 0.3
begane_grond: &begane_grond
canonical: begane grond
abbreviated: bg
sample: true
canonical_probability: 0.5
sample_probability: 0.2
abbreviated_probability: 0.2
sample_probability: 0.3
benedenverdieping: &benedenverdieping
canonical: benedenverdieping
@@ -114,11 +128,11 @@ levels:
sample_probability: 0.2
aliases:
"0":
default: *benedenverdieping
probability: 0.5
default: *begane_grond
probability: 0.6
alternatives:
- alternative: *begane_grond
probability: 0.45
- alternative: *benedenverdieping
probability: 0.35
- alternative: *parterre
probability: 0.04
- alternative: *het_gelijkvloers
@@ -497,6 +511,13 @@ countries:
sample_probability: 0.2
numeric:
direction: left
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.7
ordinal_probability: 0.3
aliases:
"0":

View File

@@ -21,7 +21,7 @@ components:
alphanumeric_probability: 0.25
combinations:
house_number_unit:
-
components:
- house_number
- unit
@@ -128,9 +128,13 @@ levels:
numeric:
direction: left
direction_probability: 0.9
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
ordinal:
direction: right
digits:
ascii_probability: 0.3
roman_numeral_probability: 0.7
numeric_probability: 0.4
ordinal_probability: 0.6
@@ -152,6 +156,8 @@ levels:
# e.g. 1. suterena
ordinal:
direction: right
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
standalone_probability: 0.985
number_abs_value: true

View File

@@ -25,35 +25,7 @@ components:
combinations:
# For unit types like 2/34 (more common in Canada and Australia)
house_number_unit:
components:
- house_number
- unit
label: house_number
separators:
- separator: "-"
probability: 0.9
- separator: " - "
probability: 0.05
- separator: /
probability: 0.05
probability: 0.005
house_number_floor:
components:
- house_number
- unit
label: house_number
separators:
- separator: "-"
probability: 0.9
- separator: " - "
probability: 0.05
- separator: /
probability: 0.05
probability: 0.005
house_number_staircase_unit:
-
components:
- house_number
- staircase
@@ -67,6 +39,33 @@ components:
- separator: /
probability: 0.05
probability: 0.005
# For unit types like 2/34 (more common in Canada and Australia)
-
components:
- house_number
- unit
label: house_number
separators:
- separator: "-"
probability: 0.9
- separator: " - "
probability: 0.05
- separator: /
probability: 0.05
probability: 0.005
-
components:
- house_number
- level
label: house_number
separators:
- separator: "-"
probability: 0.9
- separator: " - "
probability: 0.05
- separator: /
probability: 0.05
probability: 0.005
numbers:
@@ -134,6 +133,9 @@ levels:
standalone_probability: 0.2 # Let e.g. 5º be the entire floor string
# If ordinal is selected, chance of e.g. just using 2o without Andar
null_phrase_probability: 0.6
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.2
ordinal_probability: 0.8
nivel: &nivel
@@ -151,6 +153,9 @@ levels:
direction_probability: 0.95
standalone_probability: 0.2
null_phrase_probability: 0.6
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.2
ordinal_probability: 0.8
@@ -176,6 +181,9 @@ levels:
standalone_probability: 0.2 # Let e.g. 5º be the entire floor string
# If ordinal is selected, chance of e.g. just using 2o without Piso
null_phrase_probability: 0.6
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.2
numeric_affix_probability: 0.05
ordinal_probability: 0.75
@@ -204,8 +212,7 @@ levels:
terreo: &terreo
canonical: terréo
sample: true
canonical_probability: 0.3
abbreviated_probability: 0.4
canonical_probability: 0.7
sample_probability: 0.3
baixos: &baixos
canonical: baixos
@@ -241,6 +248,9 @@ levels:
canonical: sub cave
abbreviated: scv
sample: true
canonical_probability: 0.4
abbreviated_probability: 0.3
sample_probability: 0.3
# e.g. sub cave 1
numeric:
direction: left
@@ -750,6 +760,10 @@ units:
canonical: casa
numeric:
direction: left
moradia: &moradia
canonical: moradia
numeric:
direction: left
room: &sala
canonical: sala
numeric:
@@ -769,10 +783,12 @@ units:
alternatives:
- alternative: *sala
probability: 0.1
- alternative: *casa
probability: 0.05
- alternative: *porta
probability: 0.05
- alternative: *casa
probability: 0.04
- alternative: *moradia
probability: 0.01
# Separate random probability for adding directions like 2o Izq, 2 Dcha, etc.
add_direction: true
@@ -796,7 +812,9 @@ units:
- alternative: *sala
probability: 0.1
- alternative: *casa
probability: 0.03
probability: 0.02
- alternative: *moradia
probability: 0.01
- alternative: *porta
probability: 0.05
- alternative: *letra
@@ -903,7 +921,7 @@ countries:
levels:
numbering_starts_at: 1
aliases:
"0":
"0": &ground_floor_brasil
default: *andar_terreo
probability: 0.4
alternatives:
@@ -920,7 +938,7 @@ countries:
probability: 0.05
- alternative: *piso
probability: 0.01
"1": *ground_floor_brasil
postcodes:
alphanumeric:
@@ -929,8 +947,8 @@ countries:
abbreviated: cep
sample: true
canonical_probability: 0.001
abbreviated_probability: 0.995
sample_probability: 0.004
abbreviated_probability: 0.949
sample_probability: 0.05
numeric:
# Postcodes in Brazil are sometimes prefixed by CEP
@@ -945,7 +963,7 @@ countries:
numeric_affix_probability: 0.12
strict_numeric: true
po_boxes:
po_boxes: &po_boxes_caixa_postal
alphanumeric:
default:
canonical: caixa postal
@@ -982,3 +1000,55 @@ countries:
probability: 0.05
- alternative: *letra
probability: 0.05
# Angola
ao:
postcodes: &postcodes_codigo_postal
alphanumeric:
default:
canonical: código postal
abbreviated: cp
sample: true
canonical_probability: 0.001
abbreviated_probability: 0.949
sample_probability: 0.05
numeric:
direction: left
numeric_affix:
affix: cp
direction: left
# null_probability means the chance of doing nothing e.g. just the postal code
null_probability: 0.7
numeric_probability: 0.18
numeric_affix_probability: 0.12
strict_numeric: true
po_boxes: *po_boxes_caixa_postal
# Mozambique
mz:
postcodes: *postcodes_codigo_postal
po_boxes: *po_boxes_caixa_postal
# Cape Verde
cv:
po_boxes: *po_boxes_caixa_postal
# East Timor
tl:
po_boxes: *po_boxes_caixa_postal
# São Tomé and Príncipe
st:
po_boxes: *po_boxes_caixa_postal
# Guinea-Bissau
gw:
po_boxes: *po_boxes_caixa_postal
# Macau
mo:
po_boxes: *po_boxes_caixa_postal

View File

@@ -114,6 +114,8 @@ levels:
direction: left
add_number_phrase: true # Occasionally add variation of "number", e.g. et. nr 2
add_number_phrase_probability: 0.05
digits:
ascii_probability: 0.8
roman_numeral_probability: 0.2
# Ground floor
parter: &parter

View File

@@ -53,8 +53,8 @@ house_number:
canonical: дом
abbreviated: д
sample: true
canonical_probability: 0.8
abbreviated_probability: 0.1
canonical_probability: 0.6
abbreviated_probability: 0.3
sample_probability: 0.1
numeric:
direction: left
@@ -62,8 +62,8 @@ house_number:
canonical: dom
abbreviated: d
sample: true
canonical_probability: 0.8
abbreviated_probability: 0.1
canonical_probability: 0.6
abbreviated_probability: 0.3
sample_probability: 0.1
numeric:
direction: left
@@ -95,8 +95,8 @@ and:
cross_streets:
and: *i
and: *i_latin
i: *i
i_latin: *i_latin
corner: &ugol
canonical: угол
sample: true
@@ -177,6 +177,9 @@ levels:
direction_probability: 0.9
ordinal:
direction: right
digits:
ascii_probability: 0.8
spellout_probability: 0.2
numeric_probability: 0.4
ordinal_probability: 0.6
etazh_latin: &etazh_latin
@@ -217,6 +220,26 @@ levels:
direction: right
numeric_probability: 0.4
ordinal_probability: 0.6
pervyy_etazh: &pervyy_etazh
canonical: первый этаж
sample: true
canonical_probability: 0.9
sample_probability: 0.1
pervyy_etazh_latin: &pervyy_etazh_latin
canonical: pervyy etazh
sample: true
canonical_probability: 0.9
sample_probability: 0.1
nizhniy_etazh: &nizhniy_etazh
canonical: нижний этаж
sample: true
canonical_probability: 0.9
sample_probability: 0.1
nizhniy_etazh_latin: &nizhniy_etazh_latin
canonical: nizhniy etazh
sample: true
canonical_probability: 0.9
sample_probability: 0.1
tsokolnyy_etazh: &tsokolnyy_etazh
canonical: цокольный этаж
abbreviated: цок эт
@@ -295,7 +318,20 @@ levels:
probability: 0.09
- alternative: *etazh_latin
probability: 0.01
"0": *ground_floor
"0":
default: *pervyy_etazh
probability: 0.6
alternatives:
- alternative: *pervyy_etazh_latin
probability: 0.05
- alternative: *nizhniy_etazh
probability: 0.2
- alternative: *nizhniy_etazh_latin
probability: 0.05
- alternative: *tsokolnyy_etazh
probability: 0.075
- alternative: *tsokolnyy_etazh_latin
probability: 0.025
numbering_starts_at: 0
@@ -321,7 +357,7 @@ categories:
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.74
probability: 0.69
alternatives:
- alternative:
canonical: vblizi
@@ -347,6 +383,18 @@ categories:
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.04
- alternative:
canonical: под
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.04
- alternative:
canonical: pod
sample: true
canonical_probability: 0.8
sample_probability: 0.2
probability: 0.01
- alternative:
canonical: okolo
sample: true

View File

@@ -101,11 +101,15 @@ levels:
numeric:
direction: left
direction_probability: 0.9
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
add_number_phrase: true
add_number_phrase_probability: 0.1
ordinal:
direction: right
digits:
ascii_probability: 0.3
roman_numeral_probability: 0.7
add_number_phrase: true
add_number_phrase_probability: 0.1
@@ -119,11 +123,15 @@ levels:
numeric:
direction: left
direction_probability: 0.9
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
add_number_phrase: true
add_number_phrase_probability: 0.1
ordinal:
direction: right
digits:
ascii_probability: 0.3
roman_numeral_probability: 0.7
add_number_phrase: true
add_number_phrase_probability: 0.1
@@ -140,9 +148,13 @@ levels:
numeric:
direction: left
direction_probability: 0.9
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
ordinal:
direction: right
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
numeric_probability: 0.4
ordinal_probability: 0.6
@@ -154,9 +166,13 @@ levels:
numeric:
direction: left
direction_probability: 0.9
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
ordinal:
direction: right
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
numeric_probability: 0.4
ordinal_probability: 0.6
@@ -183,6 +199,8 @@ levels:
# e.g. 1. podzemné podlažie
ordinal:
direction: right
digits:
ascii_probability: 0.7
roman_numeral_probability: 0.3
standalone_probability: 0.985
number_abs_value: true

View File

@@ -117,6 +117,7 @@ po_boxes:
affix: 邮政信箱
direction: left
digits:
ascii_probability: 0.3
unicode_full_width_probability: 0.5
spellout_probability: 0.2
use_number_phrase: true
@@ -129,6 +130,7 @@ po_boxes:
affix: 郵政信箱
direction: left
digits:
ascii_probability: 0.3
unicode_full_width_probability: 0.5
spellout_probability: 0.2
use_number_phrase: true