Initial fork commit
This commit is contained in:
4802
resources/parser/data_sets/openaddresses.yaml
Normal file
4802
resources/parser/data_sets/openaddresses.yaml
Normal file
File diff suppressed because it is too large
Load Diff
39
resources/parser/data_sets/osm.yaml
Normal file
39
resources/parser/data_sets/osm.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
languages:
|
||||
pick_namespaced_language_probability: 0.6
|
||||
|
||||
categories:
|
||||
plural_probability: 0.6
|
||||
place_only_probability: 0.9
|
||||
|
||||
conscription_numbers:
|
||||
phrase_probability: 0.4
|
||||
no_phrase_probability: 0.1
|
||||
|
||||
austro_hungarian_street_numbers:
|
||||
no_phrase_probability: 0.1
|
||||
|
||||
chains:
|
||||
sample_probability: 0.5
|
||||
place_only_probability: 0.9
|
||||
|
||||
places:
|
||||
drop_postcode_probability: 0.6
|
||||
cldr_country_probability: 0.5
|
||||
|
||||
streets:
|
||||
abbreviate_probability: 0.3
|
||||
separate_probability: 0.2
|
||||
|
||||
venues:
|
||||
address_probability: 0.4
|
||||
abbreviate_probability: 0.3
|
||||
separate_probability: 0.0
|
||||
sub_building_probability: 0.05
|
||||
|
||||
countries:
|
||||
jp:
|
||||
# Always do this as there are plenty of examples of block numbers without house number
|
||||
combine_block_house_number_probability: 1.0
|
||||
block_phrase_probability: 0.4
|
||||
romaji_probability: 0.4
|
||||
add_metro_probability: 0.6
|
||||
145
resources/parser/default.yaml
Normal file
145
resources/parser/default.yaml
Normal file
@@ -0,0 +1,145 @@
|
||||
names:
|
||||
replace_affix_probability: 0.6
|
||||
|
||||
languages:
|
||||
# sample a language from the distribution of languages found on the Internet
|
||||
non_local_language_probability: 0.05
|
||||
# Replace user-tagged admin components with the non-local language version
|
||||
replace_non_local_probability: 0.4
|
||||
|
||||
# Dependencies for including each component in an "address"
|
||||
# Two-way dependencies are not an issue
|
||||
component_dependencies:
|
||||
road:
|
||||
dependencies: []
|
||||
|
||||
po_box:
|
||||
dependencies:
|
||||
- road
|
||||
- suburb
|
||||
- city_district
|
||||
- city
|
||||
- postcode
|
||||
|
||||
house_number:
|
||||
dependencies:
|
||||
- road
|
||||
|
||||
entrance:
|
||||
dependencies:
|
||||
- house_number
|
||||
|
||||
staircase:
|
||||
dependencies:
|
||||
- house_number
|
||||
|
||||
level:
|
||||
dependencies:
|
||||
- house_number
|
||||
|
||||
unit:
|
||||
dependencies:
|
||||
- house_number
|
||||
|
||||
metro_station:
|
||||
dependencies:
|
||||
- house
|
||||
- road
|
||||
- house_number
|
||||
|
||||
postcode:
|
||||
dependencies: []
|
||||
|
||||
# Country exceptions
|
||||
exceptions:
|
||||
jp:
|
||||
house_number:
|
||||
dependencies:
|
||||
- road
|
||||
- suburb
|
||||
- city_district
|
||||
|
||||
|
||||
|
||||
# Each component is dropped out separately and a new address
|
||||
# is added to the training set. These are only the address-level
|
||||
# components. Places/boundaries are taken care of elsewhere.
|
||||
dropout:
|
||||
attention:
|
||||
probability: 0.8
|
||||
care_of:
|
||||
probability: 0.8
|
||||
house:
|
||||
probability: 0.6
|
||||
house_number:
|
||||
probability: 0.5
|
||||
road:
|
||||
probability: 0.4
|
||||
entrance:
|
||||
probability: 0.8
|
||||
staircase:
|
||||
probability: 0.8
|
||||
level:
|
||||
probability: 0.6
|
||||
unit:
|
||||
probability: 0.5
|
||||
postcode:
|
||||
probability: 0.6
|
||||
po_box:
|
||||
probability: 0.1
|
||||
# Note: these probabilities all independent (don't need to sum to 1)
|
||||
drop_address_probability: 0.8 # drop house number, road, etc.
|
||||
drop_places_probability: 0.1 # drop place names
|
||||
drop_postcode_probability: 0.3 # drop postal code
|
||||
|
||||
category:
|
||||
# Same thing for category queries
|
||||
drop_address_probability: 0.8 # drop house number, road, etc.
|
||||
drop_places_probability: 0.1 # drop place names
|
||||
drop_postcode_probability: 0.3 # drop postal code
|
||||
|
||||
places:
|
||||
hyphenate_multiword_probability: 0.01
|
||||
remove_hyphen_probability: 0.5
|
||||
|
||||
boundaries:
|
||||
abbreviate_toponym_probability: 0.35
|
||||
# Usually in Germany, may have e.g. name:prefix=Stadtbezirk
|
||||
add_prefix_probability: 0.5
|
||||
|
||||
neighborhood:
|
||||
# Usually in Germany, may have e.g. name:prefix=Ortsteil
|
||||
add_prefix_probability: 0.5
|
||||
use_first_match_probability: 0.8
|
||||
|
||||
city:
|
||||
quattroshapes_geonames_backup_city_probability: 0.2
|
||||
quattroshapes_geonames_abbreviated_probability: 0.1
|
||||
|
||||
state_district:
|
||||
join_probability: 0.5
|
||||
|
||||
state:
|
||||
# Probability of using full name e.g. New York vs. NY
|
||||
full_name_probability: 0.2
|
||||
abbreviated_probability: 0.8
|
||||
|
||||
# Currently for Russian and Ukrainian, convert some names to the genitive case
|
||||
slavic_names:
|
||||
state:
|
||||
genitive_probability: 0.4
|
||||
state_district:
|
||||
genitive_probability: 0.4
|
||||
|
||||
country:
|
||||
# If no country is specified, pull the country name from CLDR (authoratative country names translated into different languages)
|
||||
cldr_country_probability: 0.5
|
||||
# When a country is specified and is simply an ISO code (e.g. US, DE), replace with one of the CLDR names
|
||||
replace_with_cldr_country_probability: 0.9
|
||||
# When the user-specified country is an ISO code, remove it from the components with this probability (fall back on geocoded components)
|
||||
remove_iso_code_probability: 0.1
|
||||
cldr:
|
||||
localized_name_probability: 0.92
|
||||
iso_alpha_2_code_probability: 0.02
|
||||
iso_alpha_3_code_probability: 0.01
|
||||
iso_3166_name_probability: 0.05
|
||||
Reference in New Issue
Block a user