[categories] Scraper for Nominatim Special Phrases, translated into a number of languages

Al
2016-03-18 17:52:28 -04:00
parent 40cf3d1933
commit e6b59980e7


@@ -0,0 +1,96 @@
import csv
import os
import re
import requests
import six
import sys
import time
this_dir = os.path.realpath(os.path.dirname(__file__))

# Make the geodata package importable regardless of the working directory
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode, safe_encode
DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'resources', 'categories')

# Use Special:Export to get wiki markup
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')
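
# Matches one row of a special phrases wiki table:
# | phrase || key || value || operator || plural (Y/N)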
phrase_table_re = re.compile(r'\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
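
# Matches MediaWiki links like [[Page title]] or [[Page title|anchor text]],
# capturing the page title and the optional anchor text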
wiki_link_re = re.compile(r'(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')

IGNORE_PLURAL_LANGUAGES = {
    # The Japanese page seems to form plurals by simply appending an 's',
    # which doesn't look right. Needs input from a native speaker.
    'ja',
}

# Wait this many seconds between page fetches
POLITENESS_DELAY = 5.0


def scrape_nominatim_category_page(url, ignore_plurals=False):
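    """Yield (phrase, key, value, is_plural) tuples from a single language's special phrases page."""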
    result = requests.get(url)
    if not result or not result.content:
        return

    for phrase, key, value, operator, plural in phrase_table_re.findall(result.content):
        # Skip phrases that use a search operator like "in" or "near";
        # keep only plain category phrases ("-" in the operator column)
        if operator and operator != '-':
            continue
        is_plural = plural == 'Y'
        if is_plural and ignore_plurals:
            continue
        yield safe_decode(phrase).lower(), key, value, is_plural


def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
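    """Scrape every per-language special phrases page linked from the
    index page, returning a dict of language code => list of phrases.
    """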
    print('Fetching main page')
    result = requests.get(url)
    languages = {}
    if not result or not result.content:
        return languages

    time.sleep(POLITENESS_DELAY)

    for entity, anchor_text in wiki_link_re.findall(result.content):
        if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
            continue

        # Page titles look like "Nominatim/Special Phrases/EN";
        # the last path component is the language code
        lang = entity.rstrip('/').rsplit('/')[-1].lower()
        link = WIKI_BASE_URL + entity.replace(' ', '_')

        ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES

        print('Doing {}'.format(lang))
        phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
        time.sleep(POLITENESS_DELAY)

        if not phrases:
            continue

        languages[lang] = phrases

    return languages


def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
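    """Scrape all languages and write one CSV of phrases per language to output_dir."""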
    languages = scrape_all_nominatim_category_pages(url=url)

    for lang, phrases in six.iteritems(languages):
        filename = os.path.join(output_dir, '{}.csv'.format(lang.lower()))
        with open(filename, 'w') as f:
            writer = csv.writer(f)
            for phrase, key, value, is_plural in phrases:
                writer.writerow((safe_encode(phrase), key, value, str(int(is_plural))))

    print('Done')


if __name__ == '__main__':
    main()