Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions
--- a/scripts/geodata/categories/init.py
+++ b/scripts/geodata/categories/init.py
--- a/scripts/geodata/categories/config.py
+++ b/scripts/geodata/categories/config.py
@@ -0,0 +1,72 @@
+import csv
+import os
+import six
+import random
+import sys
+
+from collections import defaultdict
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
+
+from geodata.encoding import safe_decode
+
+CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                              'resources', 'categories')
+
+
+class CategoryConfig(object):
+    def __init__(self, base_dir=CATEGORIES_DIR):
+        self.language_categories_singular = {}
+        self.language_categories_plural = {}
+
+        self.language_property_names = defaultdict(set)
+
+        if not os.path.exists(base_dir):
+            raise RuntimeError('{} does not exist'.format(base_dir))
+
+        for filename in os.listdir(base_dir):
+            if not filename.endswith('.tsv'):
+                continue
+
+            lang = filename.rsplit('.tsv')[0]
+            base_lang = lang.split('_')[0]
+
+            singular_rules = self.language_categories_singular.get(base_lang, defaultdict(list))
+            plural_rules = self.language_categories_plural.get(base_lang, defaultdict(list))
+
+            reader = csv.reader(open(os.path.join(CATEGORIES_DIR, filename)), delimiter='\t')
+            reader.next()  # headers
+
+            for key, value, is_plural, phrase in reader:
+                self.language_property_names[lang].add(key)
+                is_plural = bool(int(is_plural))
+                if is_plural:
+                    plural_rules[(key, value)].append(phrase)
+                else:
+                    singular_rules[(key, value)].append(phrase)
+
+            self.language_categories_singular[base_lang] = singular_rules
+            self.language_categories_plural[base_lang] = plural_rules
+
+        self.language_categories_singular = {key: dict(value) for key, value
+                                             in six.iteritems(self.language_categories_singular)}
+
+        self.language_categories_plural = {key: dict(value) for key, value
+                                           in six.iteritems(self.language_categories_plural)}
+
+    def has_keys(self, language, keys):
+        prop_names = self.language_property_names.get(language, set())
+        return [k for k in keys if k in prop_names]
+
+    def get_phrase(self, language, key, value, is_plural=False):
+        config = self.language_categories_singular if not is_plural else self.language_categories_plural
+        if language not in config:
+            return None
+        language_config = config[language]
+        choices = language_config.get((key, value))
+        if not choices:
+            return None
+        return random.choice(choices)
+
+category_config = CategoryConfig()
--- a/scripts/geodata/categories/preposition.py
+++ b/scripts/geodata/categories/preposition.py
@@ -0,0 +1,31 @@
+from geodata.addresses.config import address_config
+from geodata.categories.config import category_config
+from geodata.math.sampling import weighted_choice, cdf
+
+
+class CategoryPreposition(object):
+    NEAR = 'near'
+    NEARBY = 'nearby'
+    NEAR_ME = 'near_me'
+    IN = 'in'
+    NULL = 'null'
+
+    @classmethod
+    def random(cls, language, country=None):
+        category_props = address_config.get_property('categories', language, country=country)
+        if category_props is None:
+            return None
+
+        values = []
+        probs = []
+
+        for prep_phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
+            k = '{}_probability'.format(prep_phrase_type)
+            prob = category_props.get(k, None)
+            if prob is not None:
+                values.append(prep_phrase_type)
+                probs.append(prob)
+
+        probs = cdf(probs)
+
+        return weighted_choice(values, probs)
--- a/scripts/geodata/categories/query.py
+++ b/scripts/geodata/categories/query.py
@@ -0,0 +1,38 @@
+from collections import namedtuple
+
+from geodata.addresses.config import address_config
+from geodata.categories.config import category_config
+from geodata.categories.preposition import CategoryPreposition
+from geodata.encoding import safe_decode
+from geodata.math.sampling import weighted_choice
+
+# Result of Category.phrase: the category phrase, the chosen preposition
+# phrase (or None), and flags saying whether a place name / an address
+# should be added to the query.
+CategoryQuery = namedtuple('CategoryQuery', 'category, prep, add_place_name, add_address')
+
+# Returned by Category.phrase when no category phrase could be found.
+NULL_CATEGORY_QUERY = CategoryQuery(None, None, False, False)
+
+
+class Category(object):
+    @classmethod
+    def phrase(cls, language, key, value, is_plural=False, country=None):
+        category_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
+        if not category_phrase:
+            return NULL_CATEGORY_QUERY
+
+        category_phrase = safe_decode(category_phrase)
+
+        prep_phrase_type = CategoryPreposition.random(language, country=country)
+
+        if prep_phrase_type in (None, CategoryPreposition.NULL):
+            return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
+
+        values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_phrase_type), language, country=country)
+        if not values:
+            return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)
+
+        prep_phrase, prep_phrase_props = weighted_choice(values, probs)
+        prep_phrase = safe_decode(prep_phrase)
+
+        add_address = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
+        add_place_name = prep_phrase_type not in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
+
+        return CategoryQuery(category_phrase, prep=prep_phrase, add_place_name=add_place_name, add_address=add_address)
--- a/scripts/geodata/categories/scrape_nominatim_special_phrases.py
+++ b/scripts/geodata/categories/scrape_nominatim_special_phrases.py
@@ -0,0 +1,125 @@
+'''
+scrape_nominatim_special_phrases.py
+-----------------------------------
+
+Simple script to scrape https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases
+for category-related phrases sometimes found in geocoder input.
+
+Populates a per-language TSV with the columns (OSM key, OSM value, is_plural, phrase):
+
+OSM keys/values are like:
+
+amenity=restaurant
+tourism=museum
+shop=books
+
+Using these phrases, it is possible to construct queries like "restaurants in Brooklyn"
+'''
+
+import csv
+import os
+import re
+import requests
+import six
+import sys
+import time
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
+
+from geodata.encoding import safe_decode, safe_encode
+
+DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                      'resources', 'categories')
+
+
+# Use Special:Export to get wiki markup
+WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
+NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
+NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')
+
+phrase_table_re = re.compile('\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
+wiki_link_re = re.compile('(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')
+
+IGNORE_LANGUAGES = {
+    # Interlingua
+    'ia'
+}
+
+
+IGNORE_PLURAL_LANGUAGES = {
+    # For Japanese, seems to just put an s on the end, which doesn't seem right
+    # Need input from a native speaker on that one
+    'ja',
+}
+
+# Wait this many seconds between page fetches
+POLITENESS_DELAY = 5.0
+
+
+def scrape_nominatim_category_page(url, ignore_plurals=False):
+    result = requests.get(url)
+
+    if not result or not result.content:
+        return
+
+    for phrase, key, value, operator, plural in phrase_table_re.findall(result.content):
+        if operator and operator != '-':
+            continue
+
+        is_plural = plural == 'Y'
+        if is_plural and ignore_plurals:
+            continue
+
+        yield safe_decode(phrase).lower(), key, value, is_plural
+
+
+def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
+    print('Fetching main page')
+    result = requests.get(url)
+    languages = {}
+    if not result or not result.content:
+        return languages
+
+    time.sleep(POLITENESS_DELAY)
+
+    for entity, anchor_text in wiki_link_re.findall(result.content):
+        if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
+            continue
+
+        lang = entity.rstrip('/').rsplit('/')[-1].lower()
+        if lang in IGNORE_LANGUAGES:
+            continue
+
+        link = WIKI_BASE_URL + entity.replace(' ', '_')
+
+        ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES
+
+        print('Doing {}'.format(lang))
+        phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
+        time.sleep(POLITENESS_DELAY)
+
+        if not phrases:
+            continue
+
+        languages[lang] = phrases
+
+    return languages
+
+
+def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
+    languages = scrape_all_nominatim_category_pages(url=url)
+    for lang, phrases in six.iteritems(languages):
+        filename = os.path.join(output_dir, '{}.tsv'.format(lang.lower()))
+        with open(filename, 'w') as f:
+            writer = csv.writer(f, delimiter='\t')
+            writer.writerow(('key', 'value', 'is_plural', 'phrase'))
+
+            for phrase, key, value, is_plural in phrases:
+                writer.writerow((safe_encode(key), safe_encode(value),
+                                str(int(is_plural)), safe_encode(phrase)))
+
+    print('Done')
+
+if __name__ == '__main__':
+    main()