[categories] Scraper for Nominatim Special Phrases, translated into a number of languages

Al
2016-03-18 17:52:28 -04:00
parent 40cf3d1933
commit e6b59980e7


@@ -0,0 +1,96 @@
import csv
import os
import re
import requests
import six
import sys
import time
this_dir = os.path.realpath(os.path.dirname(__file__))

# Make the geodata package importable regardless of the working directory
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
from geodata.encoding import safe_decode, safe_encode
DEFAULT_CATEGORIES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'resources', 'categories')

# Use Special:Export to get wiki markup
WIKI_BASE_URL = 'https://wiki.openstreetmap.org/wiki/Special:Export/'
NOMINATIM_SPECIAL_PHRASES_PREFIX = 'Nominatim/Special Phrases'
NOMINATIM_SPECIAL_PHRASES_URL = WIKI_BASE_URL + NOMINATIM_SPECIAL_PHRASES_PREFIX.replace(' ', '_')
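
# Matches one row of a special phrases wiki table:
# | phrase || key || value || operator || plural (Y/N)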
phrase_table_re = re.compile(r'\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([^|]+) \|\| ([\-YN])', re.I)
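
# Matches MediaWiki links like [[Page title]] or [[Page title|anchor text]],
# capturing the page title and the optional anchor text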
wiki_link_re = re.compile(r'(?:\[\[([^\|\]]+(?<=\S))[\s]*(?:\|[\s]*)?(?:([^\]]+))?\]\])')

IGNORE_PLURAL_LANGUAGES = {
    # The Japanese page seems to form plurals by simply appending an 's',
    # which doesn't look right. Needs input from a native speaker.
    'ja',
}

# Wait this many seconds between page fetches
POLITENESS_DELAY = 5.0


def scrape_nominatim_category_page(url, ignore_plurals=False):
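    """Yield (phrase, key, value, is_plural) tuples from a single language's special phrases page."""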
    result = requests.get(url)
    if not result or not result.content:
        return

    for phrase, key, value, operator, plural in phrase_table_re.findall(result.content):
        # Skip phrases that use a search operator like "in" or "near";
        # keep only plain category phrases ("-" in the operator column)
        if operator and operator != '-':
            continue
        is_plural = plural == 'Y'
        if is_plural and ignore_plurals:
            continue
        yield safe_decode(phrase).lower(), key, value, is_plural


def scrape_all_nominatim_category_pages(url=NOMINATIM_SPECIAL_PHRASES_URL):
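    """Scrape every per-language special phrases page linked from the
    index page, returning a dict of language code => list of phrases.
    """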
    print('Fetching main page')
    result = requests.get(url)
    languages = {}
    if not result or not result.content:
        return languages

    time.sleep(POLITENESS_DELAY)

    for entity, anchor_text in wiki_link_re.findall(result.content):
        if not entity.startswith(NOMINATIM_SPECIAL_PHRASES_PREFIX):
            continue

        # Page titles look like "Nominatim/Special Phrases/EN";
        # the last path component is the language code
        lang = entity.rstrip('/').rsplit('/')[-1].lower()
        link = WIKI_BASE_URL + entity.replace(' ', '_')

        ignore_plurals = lang in IGNORE_PLURAL_LANGUAGES

        print('Doing {}'.format(lang))
        phrases = list(scrape_nominatim_category_page(link, ignore_plurals=ignore_plurals))
        time.sleep(POLITENESS_DELAY)

        if not phrases:
            continue

        languages[lang] = phrases

    return languages


def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
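    """Scrape all languages and write one CSV of phrases per language to output_dir."""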
    languages = scrape_all_nominatim_category_pages(url=url)

    for lang, phrases in six.iteritems(languages):
        filename = os.path.join(output_dir, '{}.csv'.format(lang.lower()))
        with open(filename, 'w') as f:
            writer = csv.writer(f)
            for phrase, key, value, is_plural in phrases:
                writer.writerow((safe_encode(phrase), key, value, str(int(is_plural))))

    print('Done')


if __name__ == '__main__':
    main()