Files
2025-09-06 22:03:29 -04:00

146 lines
3.8 KiB
YAML

# Settings that apply to names.
names:
  # NOTE(review): presumably the chance that an affix on a name is
  # replaced; confirm against the consuming code.
  replace_affix_probability: 0.6
# Language-selection settings.
languages:
  # Sample a language from the distribution of languages found on the
  # Internet
  non_local_language_probability: 0.05
  # Replace user-tagged admin components with the non-local language
  # version
  replace_non_local_probability: 0.4
# Dependencies for including each component in an "address".
# Every component maps to a `dependencies` list naming other components;
# an empty list means the component lists no dependencies.
# Two-way dependencies are not an issue.
component_dependencies:
  road:
    dependencies: []
  po_box:
    dependencies:
      - road
      - suburb
      - city_district
      - city
      - postcode
  house_number:
    dependencies:
      - road
  entrance:
    dependencies:
      - house_number
  staircase:
    dependencies:
      - house_number
  level:
    dependencies:
      - house_number
  unit:
    dependencies:
      - house_number
  metro_station:
    dependencies:
      - house
      - road
      - house_number
  postcode:
    dependencies: []
# Country exceptions: per-country overrides of a component's dependency
# list, keyed by country code and then by component name.
# NOTE(review): confirm whether this mapping belongs at this level or
# nested under `component_dependencies`.
exceptions:
  jp:
    house_number:
      dependencies:
        - road
        - suburb
        - city_district
# Each component is dropped out separately and a new address
# is added to the training set. These are only the address-level
# components. Places/boundaries are taken care of elsewhere.
# Each entry carries its own `probability` for that component.
dropout:
  attention:
    probability: 0.8
  care_of:
    probability: 0.8
  house:
    probability: 0.6
  house_number:
    probability: 0.5
  road:
    probability: 0.4
  entrance:
    probability: 0.8
  staircase:
    probability: 0.8
  level:
    probability: 0.6
  unit:
    probability: 0.5
  postcode:
    probability: 0.6
  po_box:
    probability: 0.1
# Note: these probabilities are all independent (they don't need to sum to 1)
# NOTE(review): the key that owns these three settings is not evident
# from this view of the file; confirm their parent before relying on them.
drop_address_probability: 0.8 # drop house number, road, etc.
drop_places_probability: 0.1 # drop place names
drop_postcode_probability: 0.3 # drop postal code
category:
  # Same thing for category queries: independent drop probabilities
  # (they do not need to sum to 1).
  drop_address_probability: 0.8  # drop house number, road, etc.
  drop_places_probability: 0.1  # drop place names
  drop_postcode_probability: 0.3  # drop postal code
# Hyphen handling for place names.
places:
  # NOTE(review): presumably applies to multi-word place names; confirm.
  hyphenate_multiword_probability: 0.01
  remove_hyphen_probability: 0.5
# Settings for boundary names.
boundaries:
  abbreviate_toponym_probability: 0.35
  # Usually in Germany, may have e.g. name:prefix=Stadtbezirk
  add_prefix_probability: 0.5
# Settings for neighborhood names.
neighborhood:
  # Usually in Germany, may have e.g. name:prefix=Ortsteil
  add_prefix_probability: 0.5
  # NOTE(review): presumably the chance of taking the first matching
  # neighborhood when several match; confirm against the consumer.
  use_first_match_probability: 0.8
# Settings for city names taken from Quattroshapes/GeoNames.
# NOTE(review): exact semantics of each key are defined by the consumer.
city:
  quattroshapes_geonames_backup_city_probability: 0.2
  quattroshapes_geonames_abbreviated_probability: 0.1
# Settings for state districts.
state_district:
  # NOTE(review): what is joined is not stated here; confirm in the
  # consuming code.
  join_probability: 0.5
# Settings for state names. The two values below sum to 1.0.
state:
  # Probability of using full name e.g. New York vs. NY
  full_name_probability: 0.2
  abbreviated_probability: 0.8
# Currently for Russian and Ukrainian, convert some names to the
# genitive case. One probability is configured per component.
slavic_names:
  state:
    genitive_probability: 0.4
  state_district:
    genitive_probability: 0.4
# Settings for the country component.
country:
  # If no country is specified, pull the country name from CLDR
  # (authoritative country names translated into different languages)
  cldr_country_probability: 0.5
  # When a country is specified and is simply an ISO code (e.g. US, DE),
  # replace with one of the CLDR names
  replace_with_cldr_country_probability: 0.9
  # When the user-specified country is an ISO code, remove it from the
  # components with this probability (fall back on geocoded components)
  remove_iso_code_probability: 0.1
# Probabilities for each form of a CLDR country name. The four values
# listed here sum to 1.0.
# NOTE(review): confirm whether this mapping belongs at this level or
# nested under `country`.
cldr:
  localized_name_probability: 0.92
  iso_alpha_2_code_probability: 0.02
  iso_alpha_3_code_probability: 0.01
  iso_3166_name_probability: 0.05