Files
libpostal-addrss/resources/addresses/hr.yaml
2025-09-06 22:03:29 -04:00

587 lines
16 KiB
YAML

# hr.yaml
# -------
# Croatian language specification
# Per-component probabilities for the optional sub-building parts of an
# address, plus the ways several parts may be collapsed into one token that is
# labelled as the house number.
components:
    # For each component, null_probability + alphanumeric_probability = 1.0.
    level:
        null_probability: 0.9
        alphanumeric_probability: 0.1
    staircase:
        null_probability: 0.99
        alphanumeric_probability: 0.01
    entrance:
        null_probability: 0.999
        alphanumeric_probability: 0.001
    unit:
        null_probability: 0.7
        alphanumeric_probability: 0.3
    # Each entry joins the listed components with one of the separators
    # (separator probabilities sum to 1.0) and labels the result house_number.
    combinations:
        # house number / staircase / level / unit
        -
            components:
                - house_number
                - staircase
                - level
                - unit
            label: house_number
            separators:
                - separator: "/"
                  probability: 0.95
                - separator: "-"
                  probability: 0.05
            probability: 0.005
        # house number / level / unit
        -
            components:
                - house_number
                - level
                - unit
            label: house_number
            separators:
                - separator: "/"
                  probability: 0.95
                - separator: "-"
                  probability: 0.05
            probability: 0.005
        # house number / level
        # NOTE(review): 0.1 is 20x the sibling combinations - confirm intended.
        -
            components:
                - house_number
                - level
            label: house_number
            separators:
                - separator: "/"
                  probability: 0.95
                - separator: "-"
                  probability: 0.05
            probability: 0.1
        # house number / unit, for unit types like 2/34
        -
            components:
                - house_number
                - unit
            label: house_number
            separators:
                - separator: "/"
                  probability: 0.95
                - separator: "-"
                  probability: 0.05
            probability: 0.005
# Phrases that introduce a number ("broj", "br.") and the phrase used when an
# address has no number at all ("bez broja", "bb").
numbers:
    no_number:
        default:
            # "bez broja" = "without number"
            canonical: bez broja
            abbreviated: bb
            sample: true
            # canonical + abbreviated + sample = 1.0
            canonical_probability: 0.3
            abbreviated_probability: 0.4
            sample_probability: 0.3
    default: &broj
        canonical: broj
        abbreviated: br
        sample: true
        # canonical + abbreviated + sample = 1.0
        canonical_probability: 0.3
        abbreviated_probability: 0.6
        sample_probability: 0.1
        # Phrase goes to the left of the number, e.g. broj 5
        numeric:
            direction: left
        # Affix form, e.g. br.5 or br. 5
        numeric_affix:
            affix: "br."
            whitespace_probability: 0.6
            direction: left
        # numeric + numeric_affix = 1.0
        numeric_probability: 0.4
        numeric_affix_probability: 0.6
    # NOTE(review): assumed to be section-level settings rather than
    # properties of the default phrase - confirm against the consumer.
    alphanumeric_phrase_probability: 0.05
    no_number_probability: 0.05
# Conjunction phrase; the &i anchor is reused by cross_streets.
and:
    default: &i
        # "i" = "and"
        canonical: i
        sample: true
        # canonical + sample = 1.0
        canonical_probability: 0.8
        sample_probability: 0.2
# Phrases for describing street intersections ("A i B", "A na uglu B") and
# spans between two streets ("između A i B").
cross_streets:
    # Alias of the phrase anchored as &i in the "and" section.
    i: *i
    # "na" = "at / on"
    at: &na
        canonical: na
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
    # "ugao" = "corner"
    corner: &ugao
        canonical: ugao
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
    # "uglu" = "corner" (locative form)
    corner_of: &uglu
        canonical: uglu
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
    # "na uglu" = "on the corner"
    na_uglu: &na_uglu
        canonical: na uglu
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
    # Choice of intersection phrase; default + alternatives sum to 1.0
    # (0.65 + 0.1 + 0.1 + 0.1 + 0.05).
    intersection:
        default: *i
        probability: 0.65
        alternatives:
            - alternative: *na
              probability: 0.1
            - alternative: *uglu
              probability: 0.1
            - alternative: *na_uglu
              probability: 0.1
            - alternative: *ugao
              probability: 0.05
    # "između" = "between"
    izmedu: &izmedu
        canonical: između
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
        # NOTE(review): assumed to belong to this phrase rather than to the
        # whole section - confirm against the consumer.
        parentheses_probability: 0.5
    between:
        default: *izmedu
# Floor / level phrases. Individual phrases are anchored so that "aliases"
# and "alphanumeric" below can refer to them.
levels:
    # "kat" = "floor"
    kat: &kat
        canonical: kat
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
        # e.g. kat 2
        numeric:
            direction: left
            direction_probability: 0.9
            digits:
                ascii_probability: 0.7
                roman_numeral_probability: 0.3
            add_number_phrase: true
            add_number_phrase_probability: 0.1
        # e.g. 2. kat
        ordinal:
            direction: right
            digits:
                ascii_probability: 0.3
                roman_numeral_probability: 0.7
            add_number_phrase: true
            add_number_phrase_probability: 0.1
        # numeric + ordinal = 1.0
        numeric_probability: 0.4
        ordinal_probability: 0.6
    # "etaža" = "storey"
    etaza: &etaza
        canonical: etaža
        abbreviated: et
        sample: true
        # canonical + abbreviated + sample = 1.0
        canonical_probability: 0.4
        abbreviated_probability: 0.4
        sample_probability: 0.2
        # e.g. etaža 2
        numeric:
            direction: left
            direction_probability: 0.9
            digits:
                ascii_probability: 0.7
                roman_numeral_probability: 0.3
            add_number_phrase: true
            add_number_phrase_probability: 0.1
        # e.g. 2. etaža
        ordinal:
            direction: right
            digits:
                ascii_probability: 0.3
                roman_numeral_probability: 0.7
            add_number_phrase: true
            add_number_phrase_probability: 0.1
        # numeric + ordinal = 1.0
        numeric_probability: 0.4
        ordinal_probability: 0.6
    # "prizemlje" = "ground floor"
    prizemlje: &prizemlje
        canonical: prizemlje
        sample: true
        canonical_probability: 0.9
        sample_probability: 0.1
    # "parter" = "ground floor"
    parter: &parter
        canonical: parter
        sample: true
        canonical_probability: 0.9
        sample_probability: 0.1
    # "polukat" = "mezzanine / half floor"
    mezanino: &polukat
        canonical: polukat
        half_floors: true
        canonical_probability: 0.8
        sample_probability: 0.2
        sample: true
        # e.g. polukat 2
        numeric:
            direction: left
        # e.g. 2. polukat
        ordinal:
            direction: right
        # numeric + ordinal + standalone must form a distribution (sum 1.0).
        # numeric_probability was 0.1, which left the total at 0.9.
        numeric_probability: 0.2
        ordinal_probability: 0.2
        standalone_probability: 0.6
    # "podrum" = "basement"
    podrum: &podrum
        canonical: podrum
        sample: true
        canonical_probability: 0.7
        sample_probability: 0.3
        # e.g. podrum 1
        numeric:
            direction: left
            direction_probability: 0.8
        # e.g. 1. podrum
        ordinal:
            direction: right
            digits:
                ascii_probability: 0.7
                roman_numeral_probability: 0.3
        # standalone + numeric + ordinal = 1.0
        standalone_probability: 0.99
        number_abs_value: true
        number_min_abs_value: 1
        numeric_probability: 0.005
        ordinal_probability: 0.005
    # Phrases used for specific floor numbers instead of the generic phrase.
    aliases:
        "<-1":
            default: *podrum
        "-1":
            default: *podrum
        # Special token for half-floors
        half_floors:
            default: *polukat
        # Floor 0: default + alternatives sum to 1.0 (0.5 + 0.4 + 0.1).
        "0":
            default: *prizemlje
            probability: 0.5
            alternatives:
                - alternative: *parter
                  probability: 0.4
                - alternative: *kat
                  probability: 0.1
    numbering_starts_at: 0
    alphanumeric:
        # default + alternatives sum to 1.0 (0.95 + 0.05).
        default: *kat
        probability: 0.95
        alternatives:
            - alternative: *etaza
              probability: 0.05
        # The five type probabilities below sum to 1.0.
        numeric_probability: 0.69  # With this probability, pick an integer
        roman_numeral_probability: 0.3  # Pick a Roman numeral for the actual value
        alpha_probability: 0.0098  # With this probability, pick a letter e.g. A
        numeric_plus_alpha_probability: 0.0001  # e.g. 2A
        alpha_plus_numeric_probability: 0.0001  # e.g. A2
# Prepositional phrases for category queries (e.g. "<category> u blizini").
categories:
    # "u blizini" = "near"
    near:
        default:
            canonical: u blizini
    # default + alternatives sum to 1.0 (0.6 + 0.3 + 0.1).
    nearby:
        default:
            canonical: u blizini
        probability: 0.6
        alternatives:
            - alternative:
                  canonical: u blizini ovdje
              probability: 0.3
            - alternative:
                  canonical: oko ovdje
              probability: 0.1
    # "u blizini mene" = "near me"
    near_me:
        default:
            canonical: u blizini mene
    # Don't worry about agreement
    in:
        default:
            canonical: u
    # Probabilities of each phrase type; these sum to 1.0.
    near_probability: 0.35
    nearby_probability: 0.2
    near_me_probability: 0.1
    in_probability: 0.35
# Relative directions; the &desno and &lijevo anchors are reused by the
# staircases section.
directions:
    # "desno" = "right"
    right: &desno
        canonical: desno
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
        numeric:
            direction: right
    # "lijevo" = "left"
    left: &lijevo
        canonical: lijevo
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
        numeric:
            direction: right
    # Equal weights; they sum to 1.0.
    alternatives:
        - alternative: *desno
          probability: 0.5
        - alternative: *lijevo
          probability: 0.5
# Compass directions; the anchors defined here are reused by the staircases
# section. Each direction has a full word and a one-letter abbreviation that
# doubles as the numeric affix.
cardinal_directions:
    # "istok" = "east"
    east: &istok
        canonical: istok
        abbreviated: i
        canonical_probability: 0.95
        abbreviated_probability: 0.05
        numeric:
            direction: right
        numeric_affix:
            affix: i
            direction: right
        # numeric + numeric_affix = 1.0
        numeric_probability: 0.5
        numeric_affix_probability: 0.5
    # "zapad" = "west"
    west: &zapad
        canonical: zapad
        abbreviated: z
        canonical_probability: 0.95
        abbreviated_probability: 0.05
        numeric:
            direction: right
        numeric_affix:
            affix: z
            direction: right
        numeric_probability: 0.5
        numeric_affix_probability: 0.5
    # "sjever" = "north"
    north: &sjever
        canonical: sjever
        abbreviated: s
        canonical_probability: 0.95
        abbreviated_probability: 0.05
        numeric:
            direction: right
        numeric_affix:
            affix: s
            direction: right
        numeric_probability: 0.5
        numeric_affix_probability: 0.5
    # "jug" = "south"
    south: &jug
        canonical: jug
        abbreviated: j
        sample: true
        # canonical + abbreviated + sample = 1.0
        canonical_probability: 0.75
        abbreviated_probability: 0.1
        sample_probability: 0.15
        numeric:
            direction: right
        numeric_affix:
            affix: j
            direction: right
        numeric_probability: 0.5
        numeric_affix_probability: 0.5
    # The four directions are equally likely. The weights must sum to 1.0;
    # they were 0.25 + 0.23 + 0.23 + 0.23 = 0.94 before.
    alternatives:
        - alternative: *sjever
          probability: 0.25
        - alternative: *istok
          probability: 0.25
        - alternative: *jug
          probability: 0.25
        - alternative: *zapad
          probability: 0.25
# Building entrance phrases.
entrances:
    # "ulaz" = "entrance"
    ulaz: &ulaz
        canonical: ulaz
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
        numeric:
            direction: left
    # Ulaz 1, Ulaz A, etc. The four type probabilities sum to 1.0.
    alphanumeric: &entrance_alphanumeric
        default: *ulaz
        numeric_probability: 0.1  # e.g. Ulaz 1
        alpha_probability: 0.85  # e.g. Ulaz A
        numeric_plus_alpha_probability: 0.025  # e.g. 1A
        alpha_plus_numeric_probability: 0.025  # e.g. A1
        alpha_plus_numeric:
            whitespace_probability: 0.1
        numeric_plus_alpha:
            whitespace_probability: 0.1
# Staircase phrases. Relies on the direction anchors (&desno, &lijevo,
# &sjever, &jug, &istok, &zapad) defined in the directions and
# cardinal_directions sections.
staircases:
    # "stubište" = "staircase"
    stubiste: &stubiste
        canonical: stubište
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
        numeric:
            direction: left
    # The four type probabilities sum to 1.0.
    alphanumeric: &staircase_alphanumeric
        default: *stubiste
        numeric_probability: 0.75
        alpha_probability: 0.2
        numeric_plus_alpha_probability: 0.025
        alpha_plus_numeric_probability: 0.025
        alpha_plus_numeric:
            whitespace_probability: 0.1
        numeric_plus_alpha:
            whitespace_probability: 0.1
        # NOTE(review): assumed to be nested under alphanumeric rather than
        # directly under staircases - confirm against the consumer.
        directional:
            direction: right
            direction_probability: 0.85
            modifier:
                # Modifier weights sum to 1.0 (2 x 0.2 + 4 x 0.15).
                alternatives:
                    - alternative: *desno
                      probability: 0.2
                    - alternative: *lijevo
                      probability: 0.2
                    - alternative: *sjever
                      probability: 0.15
                    - alternative: *jug
                      probability: 0.15
                    - alternative: *istok
                      probability: 0.15
                    - alternative: *zapad
                      probability: 0.15
# Post office box phrases and the length distribution of generated box
# numbers.
po_boxes:
    # "poštanski pretinac" = "post office box"
    postanski_pretinac: &postanski_pretinac
        canonical: poštanski pretinac
        abbreviated: p.p
        sample: true
        # canonical + abbreviated + sample = 1.0
        canonical_probability: 0.2
        abbreviated_probability: 0.4
        sample_probability: 0.4
        numeric:
            direction: left
            add_number_phrase: true
            add_number_phrase_probability: 0.2
    # The four type probabilities sum to 1.0.
    alphanumeric:
        default: *postanski_pretinac
        numeric_probability: 0.9  # pp 123
        alpha_probability: 0.05  # p.p A
        numeric_plus_alpha_probability: 0.04  # pp 123G
        alpha_plus_numeric_probability: 0.01  # pp A123
        alpha_plus_numeric:
            whitespace_probability: 0.1
        numeric_plus_alpha:
            whitespace_probability: 0.1
    # Number of digits in a box number; the probabilities sum to 1.0.
    digits:
        - length: 1
          probability: 0.05
        - length: 2
          probability: 0.1
        - length: 3
          probability: 0.2
        - length: 4
          probability: 0.5
        - length: 5
          probability: 0.1
        - length: 6
          probability: 0.05
# Unit (apartment / room / office) phrases, the default choice between them,
# and per-zone overrides.
units:
    # "stan" = "flat"
    stan: &stan
        canonical: stan
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
        numeric:
            direction: left
            add_number_phrase: true
            add_number_phrase_probability: 0.1
    # "apartman" = "apartment"
    apartman: &apartman
        canonical: apartman
        abbreviated: ap
        sample: true
        # canonical + abbreviated + sample = 1.0
        canonical_probability: 0.4
        abbreviated_probability: 0.2
        sample_probability: 0.4
        numeric:
            direction: left
            add_number_phrase: true
            add_number_phrase_probability: 0.1
    # "soba" = "room"
    soba: &soba
        canonical: soba
        sample: true
        canonical_probability: 0.8
        sample_probability: 0.2
        numeric:
            direction: left
            add_number_phrase: true
            add_number_phrase_probability: 0.1
    # "ured" = "office"
    ured: &ured
        canonical: ured
        sample: true
        canonical_probability: 0.6
        sample_probability: 0.4
        numeric:
            direction: left
            add_number_phrase: true
            add_number_phrase_probability: 0.1
    alphanumeric: &unit_alphanumeric
        # default + alternatives sum to 1.0 (0.6 + 0.3 + 0.1).
        default: *stan
        probability: 0.6
        alternatives:
            - alternative: *apartman
              probability: 0.3
            - alternative: *soba
              probability: 0.1
        # The four type probabilities sum to 1.0.
        numeric_probability: 0.9  # e.g. stan 1
        numeric_plus_alpha_probability: 0.03  # e.g. 1A
        alpha_plus_numeric_probability: 0.03  # e.g. A1
        alpha_probability: 0.04  # e.g. stan A
        alpha_plus_numeric:
            whitespace_probability: 0.1
        numeric_plus_alpha:
            whitespace_probability: 0.1
        # If there are 10 floors, create unit numbers like #301 or #1032
        # NOTE(review): assumed to be nested under alphanumeric - confirm
        # against the consumer.
        use_floor_probability: 0.05
    # Zone-specific unit types.
    zones:
        commercial: &commercial_unit_types
            # default + alternatives sum to 1.0 (0.6 + 0.4).
            default: *soba
            probability: 0.6
            alternatives:
                - alternative: *ured
                  probability: 0.4
            # The four type probabilities sum to 1.0.
            numeric_probability: 0.95  # e.g. soba 1
            numeric_plus_alpha_probability: 0.01  # e.g. soba 1A
            alpha_plus_numeric_probability: 0.01  # e.g. soba A1
            alpha_probability: 0.03  # e.g. soba A
            alpha_plus_numeric:
                whitespace_probability: 0.1
            numeric_plus_alpha:
                whitespace_probability: 0.1
        university:
            default: *soba
            # The four type probabilities sum to 1.0.
            numeric_probability: 0.95  # e.g. soba 1
            numeric_plus_alpha_probability: 0.01  # e.g. soba 1A
            alpha_plus_numeric_probability: 0.01  # e.g. soba A1
            alpha_probability: 0.03  # e.g. soba A
            alpha_plus_numeric:
                whitespace_probability: 0.1
            numeric_plus_alpha:
                whitespace_probability: 0.1