"""Scraper for the OSM wiki's Nominatim Special Phrases pages.

Fetches the per-language special-phrase tables (phrase, OSM key/value,
operator, plural flag) linked from the main Special Phrases index page
and writes one CSV per language into the resources/categories directory.
"""

import csv
import os
import re
import sys
import time

import requests
import six

this_dir = os.path.realpath(os.path.dirname(__file__))
# Fix: the original appended a path relative to the current working
# directory; the intent (per this_dir above) is the grandparent of this
# script's own directory, so resolve against this_dir explicitly.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.encoding import safe_decode, safe_encode

DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'resources', 'categories')


# Use Special:Export to get wiki markup rather than rendered HTML
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')

# Table row: | phrase || key || value || operator || plural(Y/N/-)
# Raw strings so the regex escapes are not also string escapes.
phrase_table_re = re.compile(r'\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
# Wiki links: [[target]] or [[target|anchor text]]
wiki_link_re = re.compile(r'(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')

IGNORE_PLURAL_LANGUAGES = {
    # For Japanese, seems to just put an s on the end, which doesn't seem right
    # Need input from a native speaker on that one
    'ja',
}

# Wait this many seconds between page fetches
POLITENESS_DELAY = 5.0


def scrape_nominatim_category_page(url, ignore_plurals=False):
    """Yield (phrase, key, value, is_plural) tuples from one language page.

    Rows whose operator column is anything other than '-' are skipped, as
    are plural rows when ignore_plurals is True. Yields nothing when the
    fetch fails or the page body is empty.
    """
    result = requests.get(url)

    if not result or not result.content:
        return

    # Decode once so the text regex works on both py2 and py3
    # (result.content is raw bytes).
    content = safe_decode(result.content)

    for phrase, key, value, operator, plural in phrase_table_re.findall(content):
        # Only keep plain phrases; operators like "in"/"near" need
        # different handling downstream.
        if operator and operator != '-':
            continue

        is_plural = plural == 'Y'
        if is_plural and ignore_plurals:
            continue

        yield safe_decode(phrase).lower(), key, value, is_plural


def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
    """Scrape every language subpage linked from the index page.

    Returns a dict mapping lowercase language code to the list of
    (phrase, key, value, is_plural) tuples for that language. Sleeps
    POLITENESS_DELAY seconds between page fetches.
    """
    print('Fetching main page')
    result = requests.get(url)
    languages = {}
    if not result or not result.content:
        return languages

    time.sleep(POLITENESS_DELAY)

    content = safe_decode(result.content)

    for entity, anchor_text in wiki_link_re.findall(content):
        if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
            continue

        # Skip the index page's link to itself; only language subpages
        # ("Nominatim/Special Phrases/XX") carry phrase tables.
        if entity.rstrip('/') == NOMINATIM_SPECIAL_PHRASES_PREFIX:
            continue

        lang = entity.rstrip('/').rsplit('/')[-1].lower()

        link = WIKI_BASE_URL + entity.replace(' ', '_')

        ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES

        print('Doing {}'.format(lang))
        phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
        time.sleep(POLITENESS_DELAY)

        if not phrases:
            continue

        languages[lang] = phrases

    return languages


def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
    """Scrape all languages and write one <lang>.csv per language.

    Each CSV row is (phrase, key, value, plural) where plural is '0'/'1'.
    Creates output_dir if it does not already exist.
    """
    languages = scrape_all_nominatim_category_pages(url=url)

    # The resources directory may not exist on a fresh checkout.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for lang, phrases in six.iteritems(languages):
        filename = os.path.join(output_dir, '{}.csv'.format(lang.lower()))
        # 'wb' is the documented py2 csv idiom: the writer emits its own
        # line terminators, and text mode would mangle them on Windows.
        with open(filename, 'wb') as f:
            writer = csv.writer(f)
            for phrase, key, value, is_plural in phrases:
                writer.writerow((safe_encode(phrase), key, value, str(int(is_plural))))

    print('Done')

if __name__ == '__main__':
    main()