135 lines
3.4 KiB
YAML
135 lines
3.4 KiB
YAML
names:
|
|
replace_affix_probability: 0.6
|
|
|
|
languages:
|
|
# sample a language from the distribution of languages found on the Internet
|
|
non_local_language_probability: 0.05
|
|
# Replace user-tagged admin components with the non-local language version
|
|
replace_non_local_probability: 0.4
|
|
|
|
# Dependencies for including each component in an "address"
|
|
# Two-way dependencies are not an issue
|
|
component_dependencies:
|
|
road:
|
|
dependencies: []
|
|
|
|
po_box:
|
|
dependencies:
|
|
- road
|
|
- suburb
|
|
- city_district
|
|
- city
|
|
- postcode
|
|
|
|
house_number:
|
|
dependencies:
|
|
- road
|
|
|
|
entrance:
|
|
dependencies:
|
|
- house_number
|
|
|
|
staircase:
|
|
dependencies:
|
|
- house_number
|
|
|
|
level:
|
|
dependencies:
|
|
- house_number
|
|
|
|
unit:
|
|
dependencies:
|
|
- house_number
|
|
|
|
metro_station:
|
|
dependencies:
|
|
- house
|
|
- road
|
|
- house_number
|
|
|
|
postcode:
|
|
dependencies: []
|
|
|
|
# Country exceptions
|
|
exceptions:
|
|
jp:
|
|
house_number:
|
|
dependencies:
|
|
- road
|
|
- suburb
|
|
- city_district
|
|
|
|
|
|
|
|
# Each component is dropped out separately and a new address
|
|
# is added to the training set. These are only the address-level
|
|
# components. Places/boundaries are taken care of elsewhere.
|
|
dropout:
|
|
attention:
|
|
probability: 0.8
|
|
care_of:
|
|
probability: 0.8
|
|
house:
|
|
probability: 0.6
|
|
house_number:
|
|
probability: 0.5
|
|
road:
|
|
probability: 0.4
|
|
entrance:
|
|
probability: 0.8
|
|
staircase:
|
|
probability: 0.8
|
|
level:
|
|
probability: 0.6
|
|
unit:
|
|
probability: 0.5
|
|
postcode:
|
|
probability: 0.6
|
|
po_box:
|
|
probability: 0.1
|
|
# Note: these probabilities are all independent (they don't need to sum to 1)
|
|
drop_address_probability: 0.8 # drop house number, road, etc.
|
|
drop_places_probability: 0.1 # drop place names
|
|
drop_postcode_probability: 0.3 # drop postal code
|
|
|
|
category:
|
|
# Same thing for category queries
|
|
drop_address_probability: 0.8 # drop house number, road, etc.
|
|
drop_places_probability: 0.1 # drop place names
|
|
drop_postcode_probability: 0.3 # drop postal code
|
|
|
|
places:
|
|
hyphenate_multiword_probability: 0.01
|
|
remove_hyphen_probability: 0.5
|
|
|
|
boundaries:
|
|
abbreviate_toponym_probability: 0.35
|
|
|
|
neighborhood:
|
|
# Usually in Germany, may have e.g. name:prefix=Ortsteil
|
|
add_prefix_probability: 0.5
|
|
|
|
city:
|
|
quattroshapes_geonames_backup_city_probability: 0.2
|
|
quattroshapes_geonames_abbreviated_probability: 0.1
|
|
|
|
state_district:
|
|
join_probability: 0.5
|
|
|
|
state:
|
|
# Probability of using full name e.g. New York vs. NY
|
|
full_name_probability: 0.2
|
|
abbreviated_probability: 0.8
|
|
|
|
country:
|
|
# If no country is specified, pull the country name from CLDR (authoritative country names translated into different languages)
|
|
cldr_country_probability: 0.5
|
|
# When a country is specified and is simply an ISO code (e.g. US, DE), replace with one of the CLDR names
|
|
replace_with_cldr_country_probability: 0.9
|
|
# When the user-specified country is an ISO code, remove it from the components with this probability (fall back on geocoded components)
|
|
remove_iso_code_probability: 0.1
|
|
cldr:
|
|
localized_name_probability: 0.97
|
|
iso_alpha_2_code_probability: 0.02
|
|
iso_alpha_3_code_probability: 0.01
|