From d771da7c78fb457c72319c77a58ceee5fd5406a6 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Mon, 13 Apr 2015 19:01:46 -0400
Subject: [PATCH] [i18n] unicode scripts file downloaded and cached locally

---
 scripts/geodata/i18n/unicode_scripts.py | 33 ++++++++++++++++---------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/scripts/geodata/i18n/unicode_scripts.py b/scripts/geodata/i18n/unicode_scripts.py
index 16172a3f..fce08f98 100644
--- a/scripts/geodata/i18n/unicode_scripts.py
+++ b/scripts/geodata/i18n/unicode_scripts.py
@@ -12,6 +12,7 @@ import requests
 import sys
 import tempfile
 import requests
+import subprocess
 
 from cStringIO import StringIO
 
@@ -27,9 +28,14 @@ this_dir = os.path.realpath(os.path.dirname(__file__))
 sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
 
 from geodata.encoding import safe_encode, safe_decode
+from geodata.file_utils import ensure_dir
 
-from word_breaks import script_regex, regex_char_range
-from cldr_languages import *
+from geodata.i18n.cldr_languages import *
+from geodata.i18n.unicode_data import UNICODE_DATA_DIR
+from geodata.i18n.word_breaks import script_regex, regex_char_range
+
+SCRIPTS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'scripts')
+LOCAL_SCRIPTS_FILE = os.path.join(SCRIPTS_DATA_DIR, 'Scripts.txt')
 
 SCRIPTS_HEADER = 'unicode_script_types.h'
 SCRIPTS_DATA_FILENAME = 'unicode_scripts_data.c'
@@ -85,22 +91,26 @@ def script_name_constant(i, u):
 UNKNOWN_SCRIPT = 'Unknown'
 
 
-def get_chars_by_script():
-    response = requests.get(SCRIPTS_URL)
+def download_scripts_file():
+    ensure_dir(SCRIPTS_DATA_DIR)
+    subprocess.check_call(['wget', SCRIPTS_URL, '-O', LOCAL_SCRIPTS_FILE])
 
-    chars = [None] * NUM_CHARS
+
+def get_chars_by_script(scripts_filename=LOCAL_SCRIPTS_FILE):
+    scripts_file = open(scripts_filename)
+    scripts = [None] * NUM_CHARS
 
     # Lines look like:
     # 0041..005A    ; Latin # L&  [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
-    for char_range, script, char_class in script_regex.findall(response.content):
+    for char_range, script, char_class in script_regex.findall(scripts_file.read()):
         script_range = [unicode_to_integer(u) for u in char_range.split('..') if len(u) < 5]
         if len(script_range) == 2:
             for i in xrange(script_range[0], script_range[1] + 1):
-                chars[i] = script
+                scripts[i] = script
         elif script_range:
-            chars[script_range[0]] = script
+            scripts[script_range[0]] = script
 
-    return chars
+    return scripts
 
 
 def build_master_scripts_list(chars):
@@ -166,8 +176,8 @@ def get_script_languages(script_codes):
     # to identify the language. We keep track of those single language scripts to inform
     # the language classifier
 
-    cldr_response = requests.get(CLDR_SUPPLEMENTAL_DATA)
-    cldr_xml = etree.fromstring(cldr_response.content)
+    cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
+    cldr_xml = etree.parse(cldr_supplemental_data)
     language_scripts = extract_language_scripts(cldr_xml)
 
     country_languages_path = os.path.join(DEFAULT_LANGUAGES_DIR, COUNTRY_LANGUAGES_FILENAME)
@@ -201,6 +211,7 @@ def main(out_dir):
     out_file = open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w')
     out_header = open(os.path.join(out_dir, SCRIPTS_HEADER), 'w')
 
+    download_scripts_file()
     chars = get_chars_by_script()
     all_scripts = build_master_scripts_list(chars)
     script_codes = get_script_codes(all_scripts)