Initial fork commit

This commit is contained in:
2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions

View File

View File

@@ -0,0 +1,72 @@
import csv
import os
import six
import random
import sys
from collections import defaultdict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode
# Default location of the per-language category phrase TSVs, resolved
# relative to this file: ../../../resources/categories
CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                              'resources', 'categories')
class CategoryConfig(object):
    '''Loads per-language category phrase rules from TSV files.

    Each <lang>.tsv under base_dir contains rows of
    (key, value, is_plural, phrase), e.g. ('amenity', 'restaurant', '0',
    'restaurant'). Rules are grouped by base language (the filename up to
    the first underscore) into singular and plural lookup tables mapping
    (key, value) -> [phrase, ...].

    Raises RuntimeError if base_dir does not exist.
    '''

    def __init__(self, base_dir=CATEGORIES_DIR):
        # base language -> {(key, value): [phrase, ...]}
        self.language_categories_singular = {}
        self.language_categories_plural = {}
        # full language code (with region suffix) -> set of OSM keys seen
        self.language_property_names = defaultdict(set)

        if not os.path.exists(base_dir):
            raise RuntimeError('{} does not exist'.format(base_dir))

        for filename in os.listdir(base_dir):
            if not filename.endswith('.tsv'):
                continue
            lang = filename.rsplit('.tsv')[0]
            base_lang = lang.split('_')[0]

            # Accumulate into any rules already collected for this base
            # language (regional variants like pt_br share pt's tables)
            singular_rules = self.language_categories_singular.get(base_lang, defaultdict(list))
            plural_rules = self.language_categories_plural.get(base_lang, defaultdict(list))

            # Bug fix: previously opened os.path.join(CATEGORIES_DIR, filename),
            # silently ignoring a caller-supplied base_dir
            reader = csv.reader(open(os.path.join(base_dir, filename)), delimiter='\t')
            # Skip the header row; next() (not reader.next()) works on
            # both Python 2 and 3
            next(reader)

            for key, value, is_plural, phrase in reader:
                self.language_property_names[lang].add(key)
                is_plural = bool(int(is_plural))
                if is_plural:
                    plural_rules[(key, value)].append(phrase)
                else:
                    singular_rules[(key, value)].append(phrase)

            self.language_categories_singular[base_lang] = singular_rules
            self.language_categories_plural[base_lang] = plural_rules

        # Freeze the inner defaultdicts into plain dicts so later .get()
        # lookups of unknown (key, value) pairs don't grow the tables.
        # (.items() behaves the same as six.iteritems here on Py2 and Py3.)
        self.language_categories_singular = {key: dict(value) for key, value
                                             in self.language_categories_singular.items()}
        self.language_categories_plural = {key: dict(value) for key, value
                                           in self.language_categories_plural.items()}

    def has_keys(self, language, keys):
        '''Return the subset of keys that appear in language's phrase rules.'''
        prop_names = self.language_property_names.get(language, set())
        return [k for k in keys if k in prop_names]

    def get_phrase(self, language, key, value, is_plural=False):
        '''Return a random configured phrase for (key, value) in language.

        Uses the plural table when is_plural is True. Returns None when the
        language or the (key, value) pair has no configured phrases.
        '''
        config = self.language_categories_singular if not is_plural else self.language_categories_plural
        if language not in config:
            return None
        language_config = config[language]
        choices = language_config.get((key, value))
        if not choices:
            return None
        return random.choice(choices)
# Module-level singleton loaded at import time from the default
# resources/categories directory; raises RuntimeError if it is missing.
category_config = CategoryConfig()

View File

@@ -0,0 +1,31 @@
from geodata.addresses.config import address_config
from geodata.categories.config import category_config
from geodata.math.sampling import weighted_choice, cdf
class CategoryPreposition(object):
    '''Preposition phrase types used when generating category queries
    (e.g. "restaurants *near* X"), sampled from per-language config.'''

    NEAR = 'near'
    NEARBY = 'nearby'
    NEAR_ME = 'near_me'
    IN = 'in'
    NULL = 'null'

    @classmethod
    def random(cls, language, country=None):
        '''Sample one preposition phrase type for language/country.

        Reads "<type>_probability" entries from the "categories" address
        config. Returns None when the language has no categories config.
        '''
        props = address_config.get_property('categories', language, country=country)
        if props is None:
            return None

        choices = []
        weights = []
        for phrase_type in (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL):
            weight = props.get('{}_probability'.format(phrase_type), None)
            if weight is None:
                continue
            choices.append(phrase_type)
            weights.append(weight)

        # weighted_choice expects a cumulative distribution
        return weighted_choice(choices, cdf(weights))

View File

@@ -0,0 +1,38 @@
from collections import namedtuple
from geodata.addresses.config import address_config
from geodata.categories.config import category_config
from geodata.categories.preposition import CategoryPreposition
from geodata.encoding import safe_decode
from geodata.math.sampling import weighted_choice
# A generated category query and flags controlling what gets appended to it.
CategoryQuery = namedtuple('CategoryQuery', ['category', 'prep', 'add_place_name', 'add_address'])
# Sentinel returned when no phrase is configured for a (key, value) pair.
NULL_CATEGORY_QUERY = CategoryQuery(category=None, prep=None, add_place_name=False, add_address=False)
class Category(object):
    '''Builds randomized category query fragments ("restaurants in ...")
    from the configured category phrases and prepositions.'''

    @classmethod
    def phrase(cls, language, key, value, is_plural=False, country=None):
        '''Return a CategoryQuery for the OSM (key, value) pair in language.

        Returns NULL_CATEGORY_QUERY when no phrase is configured. The
        add_place_name/add_address flags tell the caller whether a place
        name and/or address should follow the preposition.
        '''
        phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
        if not phrase:
            return NULL_CATEGORY_QUERY
        phrase = safe_decode(phrase)

        prep_type = CategoryPreposition.random(language, country=country)
        if prep_type in (None, CategoryPreposition.NULL):
            return CategoryQuery(phrase, prep=None, add_place_name=True, add_address=True)

        values, probs = address_config.alternative_probabilities('categories.{}'.format(prep_type), language, country=country)
        if not values:
            return CategoryQuery(phrase, prep=None, add_place_name=True, add_address=True)

        prep, _ = weighted_choice(values, probs)
        prep = safe_decode(prep)

        # "nearby"/"near me" queries stand alone; "in" still takes a place name
        no_address = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME, CategoryPreposition.IN)
        no_place_name = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        return CategoryQuery(phrase,
                             prep=prep,
                             add_place_name=prep_type not in no_place_name,
                             add_address=prep_type not in no_address)

View File

@@ -0,0 +1,125 @@
'''
scrape_nominatim_special_phrases.py
-----------------------------------
Simple script to scrape https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases
for category-related phrases sometimes found in geocoder input.
Populates a per-language TSV with columns (OSM key, OSM value, plural, phrase):
OSM keys/values are like:
amenity=restaurant
tourism=museum
shop=books
Using these phrases, it is possible to construct queries like "restaurants in Brooklyn"
'''
import csv
import os
import re
import requests
import six
import sys
import time
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode, safe_encode
DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'categories')
# Use Special:Export to get raw wiki markup instead of rendered HTML
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')

# Matches one row of the special-phrases wiki table:
#   | phrase || key || value || operator || plural (Y/N/-)
# Fix: patterns were non-raw strings full of regex escapes (\|, \-, \[),
# which triggers DeprecationWarning for invalid escape sequences on
# Python 3.6+; raw strings are the correct form.
phrase_table_re = re.compile(r'\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)

# Matches [[target]] or [[target|anchor text]] wiki links
wiki_link_re = re.compile(r'(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')

IGNORE_LANGUAGES = {
    # Interlingua
    'ia'
}

IGNORE_PLURAL_LANGUAGES = {
    # For Japanese, seems to just put an s on the end, which doesn't seem right
    # Need input from a native speaker on that one
    'ja',
}

# Wait this many seconds between page fetches
POLITENESS_DELAY = 5.0
def scrape_nominatim_category_page(url, ignore_plurals=False):
    '''Yield (phrase, key, value, is_plural) tuples scraped from one
    per-language special-phrases wiki page.

    Rows with a non-trivial operator column are skipped, as are plural
    rows when ignore_plurals is True. Yields nothing when the fetch
    fails or returns an empty body.
    '''
    response = requests.get(url)
    if not response or not response.content:
        return
    for row in phrase_table_re.findall(response.content):
        phrase, key, value, operator, plural = row
        # Only keep rows whose operator is empty or the "-" placeholder
        if operator and operator != '-':
            continue
        plural_flag = plural == 'Y'
        if ignore_plurals and plural_flag:
            continue
        yield safe_decode(phrase).lower(), key, value, plural_flag
def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
    '''Scrape the special-phrases index page, follow each per-language
    link, and return {language_code: [(phrase, key, value, is_plural), ...]}.

    Languages in IGNORE_LANGUAGES are skipped entirely; languages with no
    usable phrases are omitted. Sleeps POLITENESS_DELAY between fetches.
    '''
    print('Fetching main page')
    languages = {}
    index_page = requests.get(url)
    if not index_page or not index_page.content:
        return languages
    time.sleep(POLITENESS_DELAY)

    for target, _ in wiki_link_re.findall(index_page.content):
        if not target.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
            continue
        # The last path component of the wiki page title is the language code
        lang = target.rstrip('/').rsplit('/')[-1].lower()
        if lang in IGNORE_LANGUAGES:
            continue
        page_url = WIKI_BASE_URL + target.replace(' ', '_')
        print('Doing {}'.format(lang))
        phrases = list(scrape_nominatim_category_page(page_url, ignore_plurals=lang in IGNORE_PLURAL_LANGUAGES))
        time.sleep(POLITENESS_DELAY)
        if phrases:
            languages[lang] = phrases
    return languages
def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
    '''Scrape all per-language special-phrase pages and write one
    <lang>.tsv per language into output_dir.

    Each file has a header row followed by (key, value, is_plural, phrase)
    columns, with is_plural encoded as "0"/"1".
    '''
    languages = scrape_all_nominatim_category_pages(url=url)
    # Robustness fix: previously open() would fail with IOError/OSError
    # if the output directory didn't exist yet
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for lang, phrases in six.iteritems(languages):
        filename = os.path.join(output_dir, '{}.tsv'.format(lang.lower()))
        with open(filename, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(('key', 'value', 'is_plural', 'phrase'))
            for phrase, key, value, is_plural in phrases:
                writer.writerow((safe_encode(key), safe_encode(value),
                                 str(int(is_plural)), safe_encode(phrase)))
    print('Done')


if __name__ == '__main__':
    main()