Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions
--- a/resources/parser/data_sets/openaddresses.yaml
+++ b/resources/parser/data_sets/openaddresses.yaml
--- a/resources/parser/data_sets/osm.yaml
+++ b/resources/parser/data_sets/osm.yaml
@@ -0,0 +1,39 @@
+languages:
+    pick_namespaced_language_probability: 0.6
+
+categories:
+    plural_probability: 0.6
+    place_only_probability: 0.9
+
+conscription_numbers:
+    phrase_probability: 0.4
+    no_phrase_probability: 0.1
+
+austro_hungarian_street_numbers:
+    no_phrase_probability: 0.1
+
+chains:
+    sample_probability: 0.5
+    place_only_probability: 0.9
+
+places:
+    drop_postcode_probability: 0.6
+    cldr_country_probability: 0.5
+
+streets:
+    abbreviate_probability: 0.3
+    separate_probability: 0.2
+
+venues:
+    address_probability: 0.4
+    abbreviate_probability: 0.3
+    separate_probability: 0.0
+    sub_building_probability: 0.05
+
+countries:
+    jp:
+        # Always do this as there are plenty of examples of block numbers without house number
+        combine_block_house_number_probability: 1.0
+        block_phrase_probability: 0.4
+        romaji_probability: 0.4
+        add_metro_probability: 0.6
--- a/resources/parser/default.yaml
+++ b/resources/parser/default.yaml
@@ -0,0 +1,145 @@
+names:
+    replace_affix_probability: 0.6
+
+languages:
+    # sample a language from the distribution of languages found on the Internet
+    non_local_language_probability: 0.05
+    # Replace user-tagged admin components with the non-local language version
+    replace_non_local_probability: 0.4
+
+# Dependencies for including each component in an "address"
+# Two-way dependencies are not an issue
+component_dependencies:
+    road:
+        dependencies: []
+
+    po_box:
+        dependencies:
+            - road
+            - suburb
+            - city_district
+            - city
+            - postcode
+
+    house_number:
+        dependencies:
+            - road
+
+    entrance:
+        dependencies:
+            - house_number
+
+    staircase:
+        dependencies:
+            - house_number
+
+    level:
+        dependencies:
+            - house_number
+
+    unit:
+        dependencies:
+            - house_number
+
+    metro_station:
+        dependencies:
+            - house
+            - road
+            - house_number
+
+    postcode:
+        dependencies: []
+
+    # Country exceptions
+    exceptions:
+        jp:
+            house_number:
+                dependencies:
+                    - road
+                    - suburb
+                    - city_district
+
+
+
+# Each component is dropped out separately and a new address
+# is added to the training set. These are only the address-level
+# components. Places/boundaries are taken care of elsewhere.
+dropout:
+    attention:
+        probability: 0.8
+    care_of:
+        probability: 0.8
+    house:
+        probability: 0.6
+    house_number:
+        probability: 0.5
+    road:
+        probability: 0.4
+    entrance:
+        probability: 0.8
+    staircase:
+        probability: 0.8
+    level:
+        probability: 0.6
+    unit:
+        probability: 0.5
+    postcode:
+        probability: 0.6
+po_box:
+    probability: 0.1 
+    # Note: these probabilities all independent (don't need to sum to 1)
+    drop_address_probability: 0.8 # drop house number, road, etc.
+    drop_places_probability: 0.1 # drop place names
+    drop_postcode_probability: 0.3 # drop postal code
+
+category:
+    # Same thing for category queries
+    drop_address_probability: 0.8 # drop house number, road, etc.
+    drop_places_probability: 0.1 # drop place names
+    drop_postcode_probability: 0.3 # drop postal code
+
+places:
+    hyphenate_multiword_probability: 0.01
+    remove_hyphen_probability: 0.5
+
+boundaries:
+    abbreviate_toponym_probability: 0.35
+    # Usually in Germany, may have e.g. name:prefix=Stadtbezirk
+    add_prefix_probability: 0.5
+
+neighborhood:
+    # Usually in Germany, may have e.g. name:prefix=Ortsteil
+    add_prefix_probability: 0.5
+    use_first_match_probability: 0.8
+
+city:
+    quattroshapes_geonames_backup_city_probability: 0.2
+    quattroshapes_geonames_abbreviated_probability: 0.1
+
+state_district:
+    join_probability: 0.5
+
+state:
+    # Probability of using full name e.g. New York vs. NY
+    full_name_probability: 0.2
+    abbreviated_probability: 0.8
+
+# Currently for Russian and Ukrainian, convert some names to the genitive case
+slavic_names:
+    state:
+        genitive_probability: 0.4
+    state_district:
+        genitive_probability: 0.4
+
+country:
+    # If no country is specified, pull the country name from CLDR (authoratative country names translated into different languages)
+    cldr_country_probability: 0.5
+    # When a country is specified and is simply an ISO code (e.g. US, DE), replace with one of the CLDR names
+    replace_with_cldr_country_probability: 0.9
+    # When the user-specified country is an ISO code, remove it from the components with this probability (fall back on geocoded components)
+    remove_iso_code_probability: 0.1
+    cldr:
+        localized_name_probability: 0.92
+        iso_alpha_2_code_probability: 0.02
+        iso_alpha_3_code_probability: 0.01
+        iso_3166_name_probability: 0.05