[categories] Scraper for Nominatim Special Phrases, translated into a number of languages
@@ -0,0 +1,96 @@
import csv
import os
import re
import requests
import six
import sys
import time

this_dir = os.path.realpath(os.path.dirname(__file__))
# Make the top-level geodata package importable regardless of the working directory
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.encoding import safe_decode, safe_encode

DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'resources', 'categories')

# Use Special:Export to get the raw wiki markup instead of rendered HTML
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')

# One row of the phrase table: phrase || key || value || operator || plural
phrase_table_re = re.compile(r'\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
# [[Page Title]] or [[Page Title|anchor text]] wiki links
wiki_link_re = re.compile(r'(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')
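# For illustration, a hypothetical row from one of the phrase tables:
#   | Dive bars || amenity || bar || - || Y
# would be captured by phrase_table_re as ('Dive bars', 'amenity', 'bar', '-', 'Y')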

IGNORE_PLURAL_LANGUAGES = {
    # The Japanese page appears to form plurals by just appending an "s",
    # which doesn't seem right; needs input from a native speaker.
    'ja',
}

# Wait this many seconds between page fetches to be polite to the wiki
POLITENESS_DELAY = 5.0


def scrape_nominatim_category_page(url, ignore_plurals=False):
    result = requests.get(url)

    if not result or not result.content:
        return

    # Use result.text (decoded) so the str regex works on both Python 2 and 3
    for phrase, key, value, operator, plural in phrase_table_re.findall(result.text):
        # Keep only plain phrases, skipping rows with an operator such as "in" or "near"
        if operator and operator != '-':
            continue

        is_plural = plural == 'Y'
        if is_plural and ignore_plurals:
            continue

        yield safe_decode(phrase).lower(), key, value, is_plural
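    # e.g. the hypothetical row '| Dive bars || amenity || bar || - || Y'
    # would be yielded as (u'dive bars', 'amenity', 'bar', True)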


def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
    print('Fetching main page')
    result = requests.get(url)
    languages = {}
    if not result or not result.content:
        return languages

    time.sleep(POLITENESS_DELAY)

    # Follow the wiki links to each per-language special phrases subpage
    for entity, anchor_text in wiki_link_re.findall(result.text):
        if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
            continue

        # The last path component of the subpage title is the language code
        lang = entity.rstrip('/').rsplit('/')[-1].lower()
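        # e.g. a hypothetical link [[Nominatim/Special Phrases/DE|Deutsch]]
        # gives entity 'Nominatim/Special Phrases/DE' and thus lang 'de'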

        link = WIKI_BASE_URL + entity.replace(' ', '_')

        ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES

        print('Doing {}'.format(lang))
        phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
        time.sleep(POLITENESS_DELAY)

        if not phrases:
            continue

        languages[lang] = phrases

    return languages


def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
    languages = scrape_all_nominatim_category_pages(url=url)
    for lang, phrases in six.iteritems(languages):
        # One CSV per language, e.g. resources/categories/en.csv
        filename = os.path.join(output_dir, '{}.csv'.format(lang.lower()))
        with open(filename, 'w') as f:
            writer = csv.writer(f)
            for phrase, key, value, is_plural in phrases:
                writer.writerow((safe_encode(phrase), key, value, str(int(is_plural))))
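                # a hypothetical resulting row: dive bars,amenity,bar,1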

    print('Done')


if __name__ == '__main__':
    main()
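# A minimal usage sketch (hypothetical module path; assumes the geodata package
# is importable and wiki.openstreetmap.org is reachable):
#
#     from geodata.categories.scrape_nominatim import main
#     main(output_dir='/tmp/categories')   # writes e.g. /tmp/categories/en.csv
#
# or run the script directly, which scrapes every language's phrase page and
# writes the CSVs under resources/categories.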