Initial fork commit
This commit is contained in:
0
scripts/geodata/categories/__init__.py
Normal file
0
scripts/geodata/categories/__init__.py
Normal file
72
scripts/geodata/categories/config.py
Normal file
72
scripts/geodata/categories/config.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import csv
|
||||
import os
|
||||
import six
|
||||
import random
|
||||
import sys
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_decode
|
||||
|
||||
CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'categories')
|
||||
|
||||
|
||||
class CategoryConfig(object):
    '''
    In-memory lookup of category phrases loaded from per-language TSV files.

    Each ``<lang>.tsv`` file has a header row followed by rows of
    (key, value, is_plural, phrase). Phrases are grouped by the base
    language (the part of the file name before the first underscore) and
    by the (key, value) pair, separately for singular and plural forms.
    '''

    def __init__(self, base_dir=CATEGORIES_DIR):
        '''
        Load every ``*.tsv`` file found directly inside ``base_dir``.

        :param base_dir: directory containing the per-language TSV files
        :raises RuntimeError: if ``base_dir`` does not exist
        :raises ValueError: if a non-empty row does not have exactly four
            columns or its is_plural column is not an integer
        '''
        self.language_categories_singular = {}
        self.language_categories_plural = {}

        # Keyed by the full file-name language (not the base language)
        self.language_property_names = defaultdict(set)

        if not os.path.exists(base_dir):
            raise RuntimeError('{} does not exist'.format(base_dir))

        # Sorted so that the order of phrases within a (key, value) list does
        # not depend on the platform's directory listing order
        for filename in sorted(os.listdir(base_dir)):
            if not filename.endswith('.tsv'):
                continue

            lang = os.path.splitext(filename)[0]
            base_lang = lang.split('_')[0]

            # Several files can share a base language, so accumulate into
            # whatever has already been loaded for it
            singular_rules = self.language_categories_singular.setdefault(base_lang, defaultdict(list))
            plural_rules = self.language_categories_plural.setdefault(base_lang, defaultdict(list))

            # Read from the directory that was listed (base_dir), and make
            # sure the file handle is closed once the rows are consumed
            with open(os.path.join(base_dir, filename)) as f:
                reader = csv.reader(f, delimiter='\t')
                next(reader, None)  # headers; tolerate an empty file

                for row in reader:
                    if not row:
                        # csv yields [] for blank lines
                        continue

                    key, value, is_plural, phrase = row
                    self.language_property_names[lang].add(key)

                    if bool(int(is_plural)):
                        plural_rules[(key, value)].append(phrase)
                    else:
                        singular_rules[(key, value)].append(phrase)

        # Freeze the defaultdicts into plain dicts so later lookups of unknown
        # (key, value) pairs cannot silently create empty entries
        self.language_categories_singular = {key: dict(value) for key, value
                                             in six.iteritems(self.language_categories_singular)}

        self.language_categories_plural = {key: dict(value) for key, value
                                           in six.iteritems(self.language_categories_plural)}

    def has_keys(self, language, keys):
        '''
        Return the members of ``keys`` (in their original order) that appear
        as a key in the TSV file loaded for ``language``.
        '''
        prop_names = self.language_property_names.get(language, set())
        return [k for k in keys if k in prop_names]

    def get_phrase(self, language, key, value, is_plural=False):
        '''
        Pick a random phrase for the (key, value) pair in ``language``.

        :param is_plural: choose among plural phrases instead of singular
        :returns: one phrase, or None when the language or the (key, value)
            pair has no phrases of the requested number
        '''
        config = self.language_categories_plural if is_plural else self.language_categories_singular
        if language not in config:
            return None

        choices = config[language].get((key, value))
        if not choices:
            return None
        return random.choice(choices)
|
||||
|
||||
# Shared module-level instance. Built with the default base_dir, so importing
# this module reads the TSV files under CATEGORIES_DIR (and raises
# RuntimeError if that directory does not exist).
category_config = CategoryConfig()
|
||||
31
scripts/geodata/categories/preposition.py
Normal file
31
scripts/geodata/categories/preposition.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.categories.config import category_config
|
||||
from geodata.math.sampling import weighted_choice, cdf
|
||||
|
||||
|
||||
class CategoryPreposition(object):
    '''
    Names of the preposition phrase types that can accompany a category
    phrase, plus a sampler that picks one of them at random.
    '''
    NEAR = 'near'
    NEARBY = 'nearby'
    NEAR_ME = 'near_me'
    IN = 'in'
    NULL = 'null'

    @classmethod
    def random(cls, language, country=None):
        '''
        Sample a preposition phrase type for the given language/country.

        Looks up the 'categories' property in the address config and reads a
        "<type>_probability" entry for each phrase type; types without an
        entry are left out of the draw.

        :returns: None when the 'categories' property is missing, otherwise
            whatever weighted_choice returns for the configured types
        '''
        props = address_config.get_property('categories', language, country=country)
        if props is None:
            return None

        phrase_types = (cls.NEAR, cls.NEARBY, cls.NEAR_ME, cls.IN, cls.NULL)

        # Pair each type with its configured probability (None if absent)
        configured = [(phrase_type, props.get('{}_probability'.format(phrase_type), None))
                      for phrase_type in phrase_types]
        configured = [(phrase_type, p) for phrase_type, p in configured if p is not None]

        values = [phrase_type for phrase_type, _ in configured]
        probs = [p for _, p in configured]

        return weighted_choice(values, cdf(probs))
|
||||
38
scripts/geodata/categories/query.py
Normal file
38
scripts/geodata/categories/query.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from collections import namedtuple
|
||||
|
||||
from geodata.addresses.config import address_config
|
||||
from geodata.categories.config import category_config
|
||||
from geodata.categories.preposition import CategoryPreposition
|
||||
from geodata.encoding import safe_decode
|
||||
from geodata.math.sampling import weighted_choice
|
||||
|
||||
# Result of building a category query: the category phrase, the optional
# preposition phrase, and whether a place name / an address should be added.
CategoryQuery = namedtuple('CategoryQuery', ['category', 'prep', 'add_place_name', 'add_address'])

# Returned when no category phrase is available
NULL_CATEGORY_QUERY = CategoryQuery(category=None, prep=None,
                                    add_place_name=False, add_address=False)
|
||||
|
||||
|
||||
class Category(object):
    '''Builds CategoryQuery tuples for an OSM key/value pair.'''

    @classmethod
    def phrase(cls, language, key, value, is_plural=False, country=None):
        '''
        Build a CategoryQuery for the (key, value) pair in ``language``.

        :returns: NULL_CATEGORY_QUERY when no category phrase is configured.
            Otherwise a CategoryQuery holding the decoded category phrase and,
            when a preposition phrase could be sampled, that phrase together
            with flags saying whether a place name / address should follow.
        '''
        raw_phrase = category_config.get_phrase(language, key, value, is_plural=is_plural)
        if not raw_phrase:
            return NULL_CATEGORY_QUERY

        category_phrase = safe_decode(raw_phrase)

        prep_type = CategoryPreposition.random(language, country=country)
        no_preposition = prep_type is None or prep_type == CategoryPreposition.NULL

        if not no_preposition:
            config_key = 'categories.{}'.format(prep_type)
            values, probs = address_config.alternative_probabilities(config_key, language, country=country)
            no_preposition = not values

        if no_preposition:
            # Bare category phrase: both a place name and an address may follow
            return CategoryQuery(category_phrase, prep=None, add_place_name=True, add_address=True)

        chosen_phrase, _chosen_props = weighted_choice(values, probs)
        chosen_phrase = safe_decode(chosen_phrase)

        # "nearby" / "near me" stand alone; "in" takes a place name but no address
        standalone = prep_type in (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        add_place_name = not standalone
        add_address = not (standalone or prep_type == CategoryPreposition.IN)

        return CategoryQuery(category_phrase, prep=chosen_phrase,
                             add_place_name=add_place_name, add_address=add_address)
|
||||
125
scripts/geodata/categories/scrape_nominatim_special_phrases.py
Normal file
125
scripts/geodata/categories/scrape_nominatim_special_phrases.py
Normal file
@@ -0,0 +1,125 @@
|
||||
'''
|
||||
scrape_nominatim_special_phrases.py
|
||||
-----------------------------------
|
||||
|
||||
Simple script to scrape https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases
|
||||
for category-related phrases sometimes found in geocoder input.
|
||||
|
||||
Populates a per-language TSV file with the columns (key, value, is_plural, phrase).
|
||||
|
||||
OSM keys/values are like:
|
||||
|
||||
amenity=restaurant
|
||||
tourism=museum
|
||||
shop=books
|
||||
|
||||
Using these phrases, it is possible to construct queries like "restaurants in Brooklyn"
|
||||
'''
|
||||
|
||||
import csv
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import six
|
||||
import sys
|
||||
import time
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
|
||||
|
||||
from geodata.encoding import safe_decode, safe_encode
|
||||
|
||||
DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'categories')
|
||||
|
||||
|
||||
# Use Special:Export to get wiki markup
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')

# One wiki table row: | phrase || key || value || operator || plural flag
# Raw strings keep the backslashes literal without relying on Python passing
# unknown escape sequences through unchanged.
phrase_table_re = re.compile(r'\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)

# A wiki link, [[target]] or [[target|anchor text]]; captures (target, anchor text)
wiki_link_re = re.compile(r'(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')

IGNORE_LANGUAGES = {
    # Interlingua
    'ia'
}


IGNORE_PLURAL_LANGUAGES = {
    # For Japanese, seems to just put an s on the end, which doesn't seem right
    # Need input from a native speaker on that one
    'ja',
}

# Wait this many seconds between page fetches
POLITENESS_DELAY = 5.0
|
||||
|
||||
|
||||
def scrape_nominatim_category_page(url, ignore_plurals=False):
    '''
    Fetch one Special Phrases wiki page and yield the phrases in its table.

    Rows whose operator column is anything other than '-' are skipped.

    :param url: address of the page to fetch
    :param ignore_plurals: when true, rows flagged as plural are skipped
    :returns: generator of (phrase, key, value, is_plural) tuples, where
        phrase has been passed through safe_decode and lower-cased
    '''
    result = requests.get(url)

    if not result or not result.content:
        return

    # NOTE(review): under Python 3 result.content is bytes and a str pattern
    # cannot be applied to it (TypeError) - confirm the target interpreter
    for row in phrase_table_re.findall(result.content):
        # The pattern only consumes a single space around each cell, so any
        # extra padding in the wiki markup ends up inside the captured group
        phrase, key, value, operator, plural = (field.strip() for field in row)

        if operator and operator != '-':
            continue

        # The pattern is case-insensitive, so the flag may arrive as 'y'
        is_plural = plural.upper() == 'Y'
        if is_plural and ignore_plurals:
            continue

        yield safe_decode(phrase).lower(), key, value, is_plural
|
||||
|
||||
|
||||
def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
    '''
    Fetch the Special Phrases index page and scrape every language page it
    links to, sleeping POLITENESS_DELAY seconds after each fetch.

    :param url: address of the index page
    :returns: dict mapping lower-cased language code to the list of
        (phrase, key, value, is_plural) tuples scraped for it; languages
        that yield no phrases are left out
    '''
    print('Fetching main page')
    response = requests.get(url)
    phrases_by_language = {}

    if not (response and response.content):
        return phrases_by_language

    time.sleep(POLITENESS_DELAY)

    for page_title, _anchor_text in wiki_link_re.findall(response.content):
        # Only follow links to the per-language sub-pages
        if not page_title.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
            continue

        # The language code is the last path component of the page title
        lang = page_title.rstrip('/').rsplit('/')[-1].lower()
        if lang in IGNORE_LANGUAGES:
            continue

        page_url = WIKI_BASE_URL + page_title.replace(' ', '_')
        skip_plurals = lang in IGNORE_PLURAL_LANGUAGES

        print('Doing {}'.format(lang))
        scraped = list(scrape_nominatim_category_page(page_url, ignore_plurals=skip_plurals))
        time.sleep(POLITENESS_DELAY)

        if scraped:
            phrases_by_language[lang] = scraped

    return phrases_by_language
|
||||
|
||||
|
||||
def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
    '''
    Scrape all language pages and write one ``<lang>.tsv`` file per language
    into ``output_dir``, with a header row followed by one
    (key, value, is_plural, phrase) row per scraped phrase.
    '''
    header = ('key', 'value', 'is_plural', 'phrase')
    scraped = scrape_all_nominatim_category_pages(url=url)

    for lang, phrases in six.iteritems(scraped):
        path = os.path.join(output_dir, '{}.tsv'.format(lang.lower()))

        with open(path, 'w') as out:
            writer = csv.writer(out, delimiter='\t')
            writer.writerow(header)

            # Scraped tuples are (phrase, key, value, is_plural); the file
            # stores the phrase last and the plural flag as 0/1
            writer.writerows(
                (safe_encode(key), safe_encode(value), str(int(is_plural)), safe_encode(phrase))
                for phrase, key, value, is_plural in phrases
            )

    print('Done')
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user