# libpostal-addrss/resources/parser/default.yaml

names:
    # Probability value.
    # NOTE(review): presumably the chance that a name's affix is replaced;
    # confirm the exact meaning against the code that reads this key.
    replace_affix_probability: 0.6

# Language selection settings. The two values below are separate settings
# (they do not sum to 1).
languages:
    # Sample a language from the distribution of languages found on the
    # Internet
    non_local_language_probability: 0.05
    # Replace user-tagged admin components with the non-local language version
    replace_non_local_probability: 0.4

# Dependencies for including each component in an "address": every entry
# lists, under `dependencies`, the components it depends on (an empty list
# means none). Two-way dependencies are not an issue.
# NOTE(review): whether all or just one of the listed components must be
# present is decided by the consumer -- confirm there.
component_dependencies:
    road:
        dependencies: []

    po_box:
        dependencies: [road, suburb, city_district, city, postcode]

    house_number:
        dependencies: [road]

    entrance:
        dependencies: [house_number]

    staircase:
        dependencies: [house_number]

    level:
        dependencies: [house_number]

    unit:
        dependencies: [house_number]

    metro_station:
        dependencies: [house, road, house_number]

    postcode:
        dependencies: []

    # Country exceptions: under `jp`, house_number lists a different set
    # of dependencies than the default house_number entry above.
    exceptions:
        jp:
            house_number:
                dependencies: [road, suburb, city_district]


# Each component is dropped out separately and a new address
# is added to the training set. These are only the address-level
# components. Places/boundaries are taken care of elsewhere.
#
# Every entry maps a component name to a single `probability` value.
dropout:
    attention:
        probability: 0.8
    care_of:
        probability: 0.8
    house:
        probability: 0.6
    house_number:
        probability: 0.5
    road:
        probability: 0.4
    entrance:
        probability: 0.8
    staircase:
        probability: 0.8
    level:
        probability: 0.6
    unit:
        probability: 0.5
    postcode:
        probability: 0.6
# PO box settings. This key starts at column 0, so it is a top-level section
# of its own and not an entry of the `dropout` mapping that precedes it.
po_box:
    # NOTE(review): presumably the chance of producing a PO box address at
    # all; confirm against the code that reads this key.
    probability: 0.1
    # Note: these probabilities are all independent (they don't need to sum to 1)
    drop_address_probability: 0.8 # drop house number, road, etc.
    drop_places_probability: 0.1 # drop place names
    drop_postcode_probability: 0.3 # drop postal code

category:
    # Same thing for category queries: the three probabilities below are
    # independent of one another (they don't need to sum to 1)
    drop_address_probability: 0.8 # drop house number, road, etc.
    drop_places_probability: 0.1 # drop place names
    drop_postcode_probability: 0.3 # drop postal code

places:
    # NOTE(review): presumably the chance of joining the words of a
    # multi-word place name with hyphens; confirm against the consumer.
    hyphenate_multiword_probability: 0.01
    # NOTE(review): presumably the chance of removing the hyphen(s) from a
    # hyphenated place name; confirm against the consumer.
    remove_hyphen_probability: 0.5

boundaries:
    # NOTE(review): presumably the chance of abbreviating a toponym in a
    # boundary name; confirm against the consumer.
    abbreviate_toponym_probability: 0.35
    # Usually in Germany, a boundary may have e.g. name:prefix=Stadtbezirk
    add_prefix_probability: 0.5

neighborhood:
    # Usually in Germany, a neighborhood may have e.g. name:prefix=Ortsteil
    add_prefix_probability: 0.5
    # NOTE(review): presumably the chance of taking the first matching
    # neighborhood when several match; confirm against the consumer.
    use_first_match_probability: 0.8

city:
    # NOTE(review): both keys name Quattroshapes/GeoNames as the data source;
    # presumably the chance of using it as a backup city name, and of using
    # an abbreviated form. Confirm against the consumer.
    quattroshapes_geonames_backup_city_probability: 0.2
    quattroshapes_geonames_abbreviated_probability: 0.1

state_district:
    # NOTE(review): what is joined (and with what) is not visible in this
    # file; confirm against the code that reads this key.
    join_probability: 0.5

state:
    # Probability of using the full name vs. the abbreviation,
    # e.g. New York vs. NY. The two values below sum to 1.0.
    full_name_probability: 0.2
    abbreviated_probability: 0.8

# Currently for Russian and Ukrainian, convert some names to the genitive case.
# One `genitive_probability` is given per component type listed below.
slavic_names:
    state:
        genitive_probability: 0.4
    state_district:
        genitive_probability: 0.4

country:
    # If no country is specified, pull the country name from CLDR
    # (authoritative country names translated into different languages)
    cldr_country_probability: 0.5
    # When a country is specified and is simply an ISO code (e.g. US, DE),
    # replace with one of the CLDR names
    replace_with_cldr_country_probability: 0.9
    # When the user-specified country is an ISO code, remove it from the
    # components with this probability (fall back on geocoded components)
    remove_iso_code_probability: 0.1
    # One probability per form of the country; the four values below sum
    # to 1.0.
    cldr:
        localized_name_probability: 0.92
        iso_alpha_2_code_probability: 0.02
        iso_alpha_3_code_probability: 0.01
        iso_3166_name_probability: 0.05