Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions


@@ -0,0 +1,139 @@
import argparse
import csv
import os
import requests
from collections import Counter
from cStringIO import StringIO
from lxml import etree
from unicode_paths import CLDR_DIR
this_dir = os.path.realpath(os.path.dirname(__file__))
DEFAULT_LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'language', 'countries')
CLDR_SUPPLEMENTAL_DATA = os.path.join(CLDR_DIR, 'common', 'supplemental',
'supplementalData.xml')
ISO_639_3 = 'http://www-01.sil.org/iso639-3/iso-639-3.tab'
ISO_MACROLANGUAGES = 'http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages.tab'
ISO_LANGUAGES_FILENAME = 'iso_languages.tsv'
MACROLANGUAGES_FILENAME = 'iso_macrolanguages.tsv'
COUNTRY_LANGUAGES_FILENAME = 'country_language.tsv'
SCRIPT_LANGUAGES_FILENAME = 'script_languages.tsv'
REGIONAL = 'official_regional'
UNKNOWN_COUNTRY = 'zz'
UNKNOWN_LANGUAGES = ('und', 'zxx')
def write_country_official_languages_file(xml, out_dir):
lang_file = open(os.path.join(out_dir, COUNTRY_LANGUAGES_FILENAME), 'w')
lang_writer = csv.writer(lang_file, delimiter='\t')
def get_population_pct(lang):
return int(lang.attrib.get('populationPercent', 0))
lang_scripts = {}
for lang in xml.xpath('//languageData/language'):
language_code = lang.attrib['type'].lower()
scripts = lang.get('scripts')
if not scripts:
continue
territories = lang.get('territories')
if (language_code, None) not in lang_scripts:
lang_scripts[(language_code, None)] = scripts
if not territories:
continue
for territory in territories.strip().split():
lang_scripts[(language_code, territory.lower())] = scripts
for territory in xml.xpath('//territoryInfo/territory'):
country_code = territory.attrib['type'].lower()
if country_code == UNKNOWN_COUNTRY:
continue
langs = territory.xpath('languagePopulation')
languages = Counter()
official = set()
regional = set()
for lang in langs:
language = lang.attrib['type'].lower().split('_')[0]
official_status = lang.attrib.get('officialStatus')
languages[language] += float(lang.attrib['populationPercent'])
if official_status and official_status != REGIONAL:
official.add(language)
elif official_status == REGIONAL:
regional.add(language)
if official:
languages = Counter({l: c for l, c in languages.iteritems()
if l in official or l in regional})
else:
languages = Counter({l: c for l, c in languages.most_common(1)})
for lang, pct in languages.most_common():
if lang in UNKNOWN_LANGUAGES:
continue
script = lang_scripts.get((lang, country_code), lang_scripts.get((lang, None), ''))
lang_writer.writerow((country_code, lang, script.replace(' ', ','),
str(min(pct, 100.0)), str(int(lang in official))))
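# Illustrative only: rows in country_language.tsv have the columns
# (country, language, scripts, population_pct, is_official); the example
# rows below are hypothetical, not taken from CLDR:
#   fr  fr  Latn  100.0  1
#   ca  en  Latn  85.0   1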
RETIRED = 'R'
INDIVIDUAL = 'I'
MACRO = 'M'
LIVING = 'L'
def write_languages_file(langs, macro, out_dir):
lang_file = open(os.path.join(out_dir, ISO_LANGUAGES_FILENAME), 'w')
writer = csv.writer(lang_file, delimiter='\t')
writer.writerow(('ISO 639-3', 'ISO 639-2B', 'ISO 639-2T',
'ISO 639-1', 'type', 'macro'))
macro_reader = csv.reader(StringIO(macro), delimiter='\t')
headers = macro_reader.next()
assert len(headers) == 3
macros = {minor_code: macro_code for (macro_code, minor_code, status)
in macro_reader if status != RETIRED}
lang_reader = csv.reader(StringIO(langs), delimiter='\t')
headers = lang_reader.next()
assert headers[:6] == ['Id', 'Part2B', 'Part2T',
'Part1', 'Scope', 'Language_Type']
for line in lang_reader:
iso639_3, iso639_2b, iso639_2t, iso639_1, scope, lang_type = line[:6]
macro = macros.get(iso639_3, '')
# Only living languages that are either individual or macro
if scope in (INDIVIDUAL, MACRO) and lang_type == LIVING:
writer.writerow((iso639_3, iso639_2b, iso639_2t,
iso639_1, scope, macro))
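# For reference, iso-639-3.tab rows begin with fields like
#   aar  aar  aar  aa  I  L  Afar
# so the row written here for Afar would be ('aar', 'aar', 'aar', 'aa', 'I', ''),
# with an empty macro column since Afar is not part of a macrolanguage.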
def fetch_cldr_languages(out_dir=DEFAULT_LANGUAGES_DIR):
response = requests.get(ISO_639_3)
langs = response.content
response = requests.get(ISO_MACROLANGUAGES)
macro = response.content
write_languages_file(langs, macro, out_dir)
supplemental = open(CLDR_SUPPLEMENTAL_DATA)
xml = etree.parse(supplemental)
write_country_official_languages_file(xml, out_dir)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--out',
default=DEFAULT_LANGUAGES_DIR,
help='Out directory')
args = parser.parse_args()
fetch_cldr_languages(args.out)


@@ -0,0 +1,30 @@
import os
import shutil
import subprocess
import sys
import tempfile
from unicode_paths import CLDR_DIR
from geodata.file_utils import ensure_dir
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
CLDR_URL = 'http://www.unicode.org/Public/cldr/latest/core.zip'
def download_cldr(temp_dir=None):
if os.path.exists(CLDR_DIR):
shutil.rmtree(CLDR_DIR)
ensure_dir(CLDR_DIR)
if not temp_dir:
temp_dir = tempfile.gettempdir()
cldr_filename = os.path.join(temp_dir, CLDR_URL.rsplit('/', 1)[-1])
subprocess.check_call(['wget', CLDR_URL, '-O', cldr_filename])
subprocess.check_call(['unzip', cldr_filename, '-d', CLDR_DIR])
if __name__ == '__main__':
download_cldr(*sys.argv[1:])


@@ -0,0 +1,37 @@
import re
import requests
import six.moves.urllib_parse as urlparse
import ujson
requests.models.json = ujson
GOOGLE_I18N_API = 'http://i18napis.appspot.com'
GOOGLE_ADDRESS_DATA_API = urlparse.urljoin(GOOGLE_I18N_API, 'address/data/')
class GoogleI18N(object):
'''
Fetches data from e.g. http://i18napis.appspot.com/address/data/GB
and caches the response per country in a dictionary, so at most ~250
requests (one per country) are made in a given run of a program. The
requests themselves are lightweight.
'''
def __init__(self):
self.responses = {}
def get(self, country_code):
ret = self.responses.get(country_code.lower())
if ret is None:
url = urlparse.urljoin(GOOGLE_ADDRESS_DATA_API, country_code.upper())
response = requests.get(url)
if response.ok:
ret = response.json()
self.responses[country_code.lower()] = ret
else:
self.responses[country_code.lower()] = {}
return ret
google_i18n = GoogleI18N()
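# Usage sketch (requires network access; the response key shown is typical of
# the i18napis address data but is illustrative, not guaranteed):
#   data = google_i18n.get('gb')
#   if data:
#       print data.get('name')  # e.g. 'UNITED KINGDOM'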


@@ -0,0 +1,86 @@
import os
import csv
import sys
from collections import defaultdict, OrderedDict
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.csv_utils import unicode_csv_reader
LANGUAGES_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'language')
country_languages = defaultdict(OrderedDict)
# Only official and de facto official, no official_regional
official_languages = defaultdict(OrderedDict)
regional_languages = defaultdict(OrderedDict)
road_language_overrides = defaultdict(OrderedDict)
languages = set()
all_languages = languages
osm_admin1_ids = set()
languages_initialized = False
def init_languages(languages_dir=LANGUAGES_DIR):
global languages_initialized
if languages_initialized:
return
path = os.path.join(languages_dir, 'countries', 'country_language.tsv')
if not os.path.exists(path):
raise ValueError('File does not exist: {}'.format(path))
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
country_languages[country][lang] = int(is_official)
languages.add(lang)
for country, lang, script, pct, is_official in unicode_csv_reader(open(path), delimiter='\t'):
if int(is_official) or len(country_languages[country]) == 1:
official_languages[country][lang] = 1
path = os.path.join(languages_dir, 'countries', 'road_sign_languages.tsv')
for country, lang, default in csv.reader(open(path), delimiter='\t'):
road_language_overrides[country][lang] = int(default)
if lang not in languages:
languages.add(lang)
path = os.path.join(languages_dir, 'regional', 'adm1.tsv')
for country, key, value, langs, default in unicode_csv_reader(open(path), delimiter='\t'):
if key == 'osm':
osm_admin1_ids.add(tuple(value.split(':')))
for lang in langs.split(','):
regional_languages[(country, key, value)][lang] = int(default)
if lang not in country_languages[country]:
country_languages[country][lang] = 0
if lang not in languages:
languages.add(lang)
languages_initialized = True
init_languages()
def get_country_languages(country, official=True, overrides=True):
if official:
languages = official_languages[country]
else:
languages = country_languages[country]
if overrides:
road_overrides = road_language_overrides.get(country)
if road_overrides and road_overrides.values()[0]:
languages = road_overrides
elif road_overrides:
languages.update(road_overrides)
return languages
def get_regional_languages(country, key, value):
return regional_languages.get((country, key, value), OrderedDict())
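# Usage sketch (return values are hypothetical; the real contents depend on
# the generated country_language.tsv and the override files):
#   get_country_languages('fr')                  # e.g. OrderedDict([('fr', 1)])
#   get_country_languages('ca', official=False)  # e.g. OrderedDict([('en', 1), ('fr', 1)])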


@@ -0,0 +1,5 @@
import unicodedata
def strip_accents(s):
return u''.join([c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'])
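# Example: strip_accents(u'Caf\xe9') == u'Cafe'. NFD decomposes the accented
# character into a base letter plus a combining mark (category 'Mn'), and the
# combining mark is then dropped.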


@@ -0,0 +1,37 @@
import re
import os
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_decode
class Scanner(object):
'''
Simple scanner implementation in Python using regular expression groups.
Used to create dynamic lexicons for parsing various CLDR files
without compiling a C scanner; only C scanners are used at runtime.
'''
def __init__(self, lexicon, flags=re.VERBOSE | re.I | re.UNICODE):
self.lexicon = lexicon
regexes, responses = zip(*lexicon)
self.regex = re.compile(u'|'.join([u'({})'.format(safe_decode(r)) for r in regexes]), flags)
self.responses = responses
def scan(self, s):
for match in self.regex.finditer(safe_decode(s)):
i = match.lastindex
response = self.responses[i - 1]
token = match.group(i)
if not callable(response):
yield (token, response)
else:
responses = response(match, token)
if responses is not None:
for response, token in responses:
yield (token, response)
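# Usage sketch (hypothetical lexicon, not one of the CLDR lexicons):
#   WORD, NUMBER = 'word', 'number'
#   scanner = Scanner([(r'[a-z]+', WORD), (r'[0-9]+', NUMBER)])
#   list(scanner.scan('abc 123'))  # [(u'abc', 'word'), (u'123', 'number')]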

File diff suppressed because one or more lines are too long


@@ -0,0 +1,273 @@
'''
unicode_data.py
---------------
Python's unicodedata module uses an outdated spec (Unicode 5.2) and since
e.g. unicode categories are used in tokenization, we'd like to keep this
as up-to-date as possible with the latest standard.
'''
import csv
import os
import re
import sys
from collections import defaultdict, namedtuple
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.file_utils import download_file
from geodata.string_utils import wide_unichr, wide_ord
from unicode_properties import *
from unicode_paths import UNICODE_DATA_DIR
UNIDATA_URL = 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'
UNIDATA_DIR = os.path.join(UNICODE_DATA_DIR, 'unidata')
LOCAL_UNIDATA_FILE = os.path.join(UNIDATA_DIR, 'UnicodeData.txt')
unicode_categories = defaultdict(list)
unicode_blocks = defaultdict(list)
unicode_combining_classes = defaultdict(list)
unicode_general_categories = defaultdict(list)
unicode_scripts = defaultdict(list)
unicode_properties = {}
unicode_script_ids = {}
unicode_blocks = {}
unicode_category_aliases = {}
unicode_property_aliases = {}
unicode_property_value_aliases = {}
unicode_word_breaks = {}
# Ref: ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
UNIDATA_FIELDS = [
'code',
'name',
'category',
'combining',
'bidi_category',
'decomp_mapping',
'decimal_value',
'digit_value',
'numeric_value',
'mirrored',
'unicode_1_name',
'comment',
'upper_mapping',
'lower_mapping',
'title_mapping',
]
UnicodeDataRow = namedtuple('UnicodeDataRow', ','.join(UNIDATA_FIELDS))
def parse_unicode_data():
'''
Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS
'''
if not os.path.exists(LOCAL_UNIDATA_FILE):
download_file(UNIDATA_URL, LOCAL_UNIDATA_FILE)
unidata_file = open(LOCAL_UNIDATA_FILE)
for line in csv.reader(unidata_file, delimiter=';'):
yield UnicodeDataRow(*line)
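# For reference, a UnicodeData.txt line looks like:
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
# and maps positionally onto UNIDATA_FIELDS (code='0041', category='Lu', ...).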
def iter_unicode_combining_classes():
return unicode_combining_classes.iteritems()
def iter_unicode_categories():
return unicode_categories.iteritems()
def get_unicode_category(cat):
return unicode_categories[cat]
def get_unicode_combining_class(c):
return unicode_combining_classes[c]
def get_unicode_categories():
'''
Build dict of unicode categories e.g.
{
'Lu': ['A', 'B', 'C', ...]
'Ll': ['a', 'b', 'c', ...]
}
'''
categories = defaultdict(list)
for row in parse_unicode_data():
categories[row.category].append(wide_unichr(unicode_to_integer(row.code)))
return dict(categories)
def get_unicode_combining_classes():
'''
Build dict of unicode combining classes e.g.
{
'0': ['\x00', '\x01', \x02', ...]
}
'''
combining_classes = defaultdict(list)
for row in parse_unicode_data():
combining_classes[int(row.combining)].append(wide_unichr(unicode_to_integer(row.code)))
return dict(combining_classes)
unicode_category_aliases = {
'letter': 'L',
'lower': 'Ll',
'lowercase': 'Ll',
'lowercaseletter': 'Ll',
'upper': 'Lu',
'uppercase': 'Lu',
'uppercaseletter': 'Lu',
'title': 'Lt',
'nonspacing mark': 'Mn',
'mark': 'M',
}
COMBINING_CLASS_PROP = 'canonical_combining_class'
BLOCK_PROP = 'block'
GENERAL_CATEGORY_PROP = 'general_category'
SCRIPT_PROP = 'script'
WORD_BREAK_PROP = 'word_break'
def init_unicode_categories():
'''
Initialize module-level dictionaries
'''
global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks
unicode_categories.update(get_unicode_categories())
unicode_combining_classes.update(get_unicode_combining_classes())
for key in unicode_categories.keys():
unicode_general_categories[key[0]].extend(unicode_categories[key])
script_chars = get_chars_by_script()
for i, script in enumerate(script_chars):
if script:
unicode_scripts[script.lower()].append(wide_unichr(i))
unicode_scripts = dict(unicode_scripts)
unicode_script_ids.update(build_master_scripts_list(script_chars))
unicode_blocks.update(get_unicode_blocks())
unicode_properties.update(get_unicode_properties())
unicode_property_aliases.update(get_property_aliases())
unicode_word_breaks.update(get_word_break_properties())
for key, value in get_property_value_aliases().iteritems():
key = unicode_property_aliases.get(key, key)
if key == GENERAL_CATEGORY_PROP:
for k, v in value.iteritems():
k = k.lower()
unicode_category_aliases[k] = v
if '_' in k:
unicode_category_aliases[k.replace('_', '')] = v
unicode_property_value_aliases[key] = value
regex_chars = re.compile('([\[\]\{\}\-\^])')
def replace_regex_chars(s):
return regex_chars.sub(r'\\\1', s)
def format_regex_char(i):
c = wide_unichr(i)
return replace_regex_chars(c.encode('unicode-escape'))
def make_char_set_regex(chars):
'''
Build a regex character set from a list of characters
'''
group_start = None
group_end = None
last_ord = -2
ords = map(wide_ord, chars)
ords.sort()
ords.append(None)
groups = []
for i, o in enumerate(ords):
if o is not None and o == last_ord + 1:
group_end = o
elif group_start is not None and group_end is not None:
groups.append('-'.join((format_regex_char(group_start), format_regex_char(group_end))))
group_end = None
group_start = o
elif group_start is not None and group_end is None:
groups.append(format_regex_char(group_start))
group_start = o
else:
group_start = o
last_ord = o
return u'[{}]'.format(u''.join(groups))
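# Worked example: make_char_set_regex([u'a', u'b', u'c', u'x']) returns
# u'[a-cx]': consecutive codepoints collapse into ranges, and regex
# metacharacters like ']' or '-' are backslash-escaped.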
name_category = [
('control_chars', 'Cc'),
('other_format_chars', 'Cf'),
('other_not_assigned_chars', 'Cn'),
('other_private_use_chars', 'Co'),
('other_surrogate_chars', 'Cs'),
('letter_lower_chars', 'Ll'),
('letter_modifier_chars', 'Lm'),
('letter_other_chars', 'Lo'),
('letter_title_chars', 'Lt'),
('letter_upper_chars', 'Lu'),
('mark_spacing_combining_chars', 'Mc'),
('mark_enclosing_chars', 'Me'),
('mark_nonspacing_chars', 'Mn'),
('number_or_digit_chars', 'Nd'),
('number_letter_chars', 'Nl'),
('number_other_chars', 'No'),
('punct_connector_chars', 'Pc'),
('punct_dash_chars', 'Pd'),
('punct_close_chars', 'Pe'),
('punct_final_quote_chars', 'Pf'),
('punct_initial_quote_chars', 'Pi'),
('punct_other_chars', 'Po'),
('punct_open_chars', 'Ps'),
('currency_symbol_chars', 'Sc'),
('symbol_modifier_chars', 'Sk'),
('symbol_math_chars', 'Sm'),
('symbol_other_chars', 'So'),
('separator_line_chars', 'Zl'),
('separator_paragraph_chars', 'Zp'),
('space', 'Zs'),
]
def main():
init_unicode_categories()
for name, cat in name_category:
if cat not in unicode_categories:
continue
chars = unicode_categories[cat]
print u'{} = {};'.format(name, make_char_set_regex(chars))
if __name__ == '__main__':
main()


@@ -0,0 +1,11 @@
import os
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
DATA_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'resources')
UNICODE_DATA_DIR = os.path.join(DATA_DIR, 'unicode')
CLDR_DIR = os.path.join(UNICODE_DATA_DIR, 'cldr')


@@ -0,0 +1,463 @@
'''
scripts.py
This code uses the latest copy of Scripts.txt from unicode.org
to generate a C file (and header) defining which script every character
belongs to.
'''
import csv
import os
import re
import requests
import subprocess
import sys
import tempfile
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
from itertools import islice
from lxml import etree
from operator import itemgetter
from zipfile import ZipFile
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr
from cldr_languages import *
from download_cldr import download_cldr
from languages import get_country_languages
from unicode_paths import UNICODE_DATA_DIR
from word_breaks import script_regex, regex_char_range
SRC_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir, 'src')
SCRIPTS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'scripts')
LOCAL_SCRIPTS_FILE = os.path.join(SCRIPTS_DATA_DIR, 'Scripts.txt')
LOCAL_ISO_15924_FILE = os.path.join(SCRIPTS_DATA_DIR, 'iso15924.txt')
BLOCKS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'blocks')
LOCAL_BLOCKS_FILE = os.path.join(BLOCKS_DATA_DIR, 'Blocks.txt')
PROPS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'props')
LOCAL_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'PropList.txt')
LOCAL_PROP_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyAliases.txt')
LOCAL_PROP_VALUE_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyValueAliases.txt')
LOCAL_DERIVED_CORE_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'DerivedCoreProperties.txt')
WORD_BREAKS_DIR = os.path.join(UNICODE_DATA_DIR, 'word_breaks')
LOCAL_WORD_BREAKS_FILE = os.path.join(WORD_BREAKS_DIR, 'WordBreakProperty.txt')
SCRIPTS_HEADER = 'unicode_script_types.h'
SCRIPTS_DATA_FILENAME = 'unicode_scripts_data.c'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
BLOCKS_URL = 'http://unicode.org/Public/UNIDATA/Blocks.txt'
PROPS_URL = 'http://unicode.org/Public/UNIDATA/PropList.txt'
PROP_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyAliases.txt'
PROP_VALUE_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyValueAliases.txt'
DERIVED_CORE_PROPS_URL = 'http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt'
WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt'
ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'
scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
#define UNICODE_SCRIPT_TYPES_H
#include <stdlib.h>
#define NUM_CODEPOINTS {num_codepoints}
#define MAX_LANGS {max_langs}
typedef enum {{
{script_enum}
NUM_SCRIPTS
}} script_t;
#endif
'''
scripts_c_data_template = u'''#include "{header_name}"

script_t char_scripts[] = {{
{char_scripts}
}};
script_code_t script_codes[] = {{
{script_codes}
}};
script_languages_t script_languages[] = {{
{script_languages}
}};
'''
script_code_template = '{{SCRIPT_{name}, "{code}"}}'
script_language_template = '{{{num_langs}, {languages}}}'
def unicode_to_integer(u):
return int('0x{}'.format(u), 16)
def script_name_constant(i, u):
return u'SCRIPT_{} = {}'.format(u.upper(), i)
UNKNOWN_SCRIPT = 'Unknown'
COMMON_SCRIPT = 'Common'
def parse_char_range(r):
return [unicode_to_integer(u) for u in r.split('..')]
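# e.g. parse_char_range('0041..005A') == [65, 90]; a single codepoint such as
# '00AD' yields a one-element list.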
def get_chars_by_script():
scripts_file = open(LOCAL_SCRIPTS_FILE)
scripts = [None] * NUM_CODEPOINTS
# Lines look like:
# 0041..005A ; Latin # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
for char_range, script, char_class in script_regex.findall(scripts_file.read()):
script_range = parse_char_range(char_range)
if len(script_range) == 2:
for i in xrange(script_range[0], script_range[1] + 1):
scripts[i] = script
elif script_range:
scripts[script_range[0]] = script
return scripts
COMMENT_CHAR = '#'
DELIMITER_CHAR = ';'
def parse_file(f):
for line in f:
line = line.split(COMMENT_CHAR)[0].strip()
if not line:
continue
tokens = line.split(DELIMITER_CHAR)
if tokens:
yield [t.strip() for t in tokens]
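# e.g. the Blocks.txt line '0000..007F; Basic Latin' yields
# ['0000..007F', 'Basic Latin']; comment-only and blank lines are skipped.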
def get_property_aliases():
prop_aliases_file = open(LOCAL_PROP_ALIASES_FILE)
aliases = {}
for line in parse_file(prop_aliases_file):
prop = line[1]
prop_aliases = [line[0]] + line[2:]
for alias in prop_aliases:
aliases[alias.lower()] = prop.lower()
return aliases
def get_property_value_aliases():
prop_value_aliases_file = open(LOCAL_PROP_VALUE_ALIASES_FILE)
value_aliases = defaultdict(dict)
for line in parse_file(prop_value_aliases_file):
prop = line[0]
if prop not in ('ccc', 'gc'):
value = line[2]
aliases = [line[1]] + line[3:]
else:
value = line[1]
aliases = line[2:]
for alias in aliases:
value_aliases[prop.lower()][alias] = value
return dict(value_aliases)
def get_unicode_blocks():
blocks_file = open(LOCAL_BLOCKS_FILE)
blocks = defaultdict(list)
for line in parse_file(blocks_file):
char_range, block = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
blocks[block.lower()].append(wide_unichr(i))
elif char_range:
blocks[block.lower()].append(wide_unichr(char_range[0]))
return dict(blocks)
def get_unicode_properties():
props_file = open(LOCAL_PROPS_FILE)
props = defaultdict(list)
for line in parse_file(props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(wide_unichr(char_range[0]))
derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE)
for line in parse_file(derived_props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(wide_unichr(char_range[0]))
return dict(props)
def get_word_break_properties():
props_file = open(LOCAL_WORD_BREAKS_FILE)
props = defaultdict(list)
for line in parse_file(props_file):
char_range, prop = line
char_range = parse_char_range(char_range)
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop].append(wide_unichr(i))
elif char_range:
props[prop].append(wide_unichr(char_range[0]))
return dict(props)
def build_master_scripts_list(chars):
all_scripts = OrderedDict.fromkeys(filter(bool, chars))
for i, script in enumerate(all_scripts.keys()):
all_scripts[script] = i + 1
# Unknown script for all characters not covered
all_scripts[UNKNOWN_SCRIPT] = 0
return all_scripts
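# e.g. if Common, Latin and Greek are the first scripts encountered (in that
# order), the result maps Common -> 1, Latin -> 2, Greek -> 3, ... with
# Unknown -> 0 reserved for uncovered codepoints.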
SCRIPT_ALIASES_SUPPLEMENTAL = {
'Hant': 'Han',
'Hans': 'Han'
}
def get_script_codes(all_scripts):
if not os.path.exists(LOCAL_ISO_15924_FILE):
temp_dir = tempfile.gettempdir()
script_codes_filename = os.path.join(temp_dir, ISO_15924_URL.rsplit('/')[-1])
# This comes as a .zip
script_codes_response = requests.get(ISO_15924_URL)
zf = ZipFile(StringIO(script_codes_response.content))
iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]
# Strip out the comments, etc.
temp_iso15924_file = u'\n'.join([line.rstrip() for line in safe_decode(zf.read(iso15924_filename)).split('\n')
if line.strip() and not line.strip().startswith('#')])
f = open(LOCAL_ISO_15924_FILE, 'w')
f.write(safe_encode(temp_iso15924_file))
f.close()
script_codes_file = open(LOCAL_ISO_15924_FILE)
script_codes = {}
seen_scripts = set()
# Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
if name in all_scripts:
script_codes[code] = name
seen_scripts.add(name)
else:
normalized_name = name.split('(')[0].strip()
if normalized_name in all_scripts and normalized_name not in seen_scripts:
script_codes[code] = normalized_name
seen_scripts.add(normalized_name)
value_aliases = get_property_value_aliases()
script_aliases = value_aliases['sc']
for code, script in script_aliases.iteritems():
if code not in script_codes and script in all_scripts:
script_codes[code] = script
script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)
return script_codes
SCRIPT_CODE_ALIASES = {
'Jpan': ['Hani', 'Hira', 'Kana'],
'Kore': ['Hang', 'Han']
}
def extract_language_scripts(xml):
language_scripts = defaultdict(list)
for lang in xml.xpath('//languageData/language'):
language_code = lang.attrib['type'].lower()
scripts = lang.get('scripts')
if not scripts:
continue
for script in scripts.split():
script_aliases = SCRIPT_CODE_ALIASES.get(script)
if not script_aliases:
language_scripts[language_code].append(script)
else:
language_scripts[language_code].extend(script_aliases)
return language_scripts
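# languageData entries in supplementalData.xml look roughly like
# (abridged, attribute values illustrative):
#   <language type="sr" scripts="Cyrl Latn" territories="RS BA"/>
# which would map 'sr' to ['Cyrl', 'Latn'] here.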
def batch_iter(iterable, batch_size):
source_iter = iter(iterable)
while True:
batch = list(islice(source_iter, batch_size))
if len(batch) > 0:
yield batch
else:
return
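# e.g. list(batch_iter(range(5), 2)) == [[0, 1], [2, 3], [4]]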
def get_script_languages():
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
# to identify the language. We keep track of those single-language scripts to inform
# the language classifier.
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
cldr_supplemental_data = open(CLDR_SUPPLEMENTAL_DATA)
cldr_xml = etree.parse(cldr_supplemental_data)
language_scripts = extract_language_scripts(cldr_xml)
country_languages_path = os.path.join(DEFAULT_LANGUAGES_DIR, COUNTRY_LANGUAGES_FILENAME)
if not os.path.exists(country_languages_path):
fetch_cldr_languages(DEFAULT_LANGUAGES_DIR)
country_language_file = open(country_languages_path)
country_language_reader = csv.reader(country_language_file, delimiter='\t')
countries = set([country for country, lang, script, pct, is_official
in country_language_reader])
spoken_languages = set.union(*(set(get_country_languages(country)) for country in countries))
script_code_languages = defaultdict(list)
for language, scripts in language_scripts.iteritems():
if language not in spoken_languages:
continue
for script in scripts:
script_code_languages[script].append(language)
script_languages = defaultdict(list)
for script_code, script_name in script_codes.iteritems():
langs = script_code_languages.get(script_code, [])
script_languages[script_name].extend(langs)
for name in all_scripts.iterkeys():
script_languages.setdefault(name, [])
return script_languages
def main(out_dir=SRC_DIR):
# Output is a C header and data file, see templates
out_file = open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w')
out_header = open(os.path.join(out_dir, SCRIPTS_HEADER), 'w')
download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
download_file(PROPS_URL, LOCAL_PROPS_FILE)
download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
download_cldr()
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
script_languages = get_script_languages()
max_langs = 0
for script, langs in script_languages.iteritems():
num_langs = len(langs)
if num_langs > max_langs:
max_langs = num_langs
# Generate C header and constants
script_enum = u'''
'''.join(['SCRIPT_{} = {},'.format(s.upper(), i) for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))])
out_header.write(scripts_header_template.format(num_codepoints=NUM_CODEPOINTS,
max_langs=max_langs,
script_enum=script_enum))
out_header.close()
# Generate C data file
char_scripts_data = u''',
'''.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch]) for batch in batch_iter(chars, 25)])
script_codes_data = u''',
'''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()])
sorted_lang_scripts = [script_languages[s] for s, i in sorted(all_scripts.iteritems(), key=itemgetter(1))]
script_language_data = u''',
'''.join([script_language_template.format(num_langs=len(langs),
languages='{{{}}}'.format(', '.join(['"{}"'.format(l) for l in langs]) if langs else 'NULL'))
for langs in sorted_lang_scripts])
out_file.write(scripts_c_data_template.format(header_name=SCRIPTS_HEADER,
char_scripts=char_scripts_data,
script_codes=script_codes_data,
script_languages=script_language_data))
out_file.close()
if __name__ == '__main__':
main(*sys.argv[1:])


@@ -0,0 +1,140 @@
'''
word_breaks.py
This script automatically builds ranges of Unicode characters from the
Unicode spec's word-break properties. These ranges help us build a
tokenizer that does the right thing in every language with regard to
word segmentation. The lines output by this script can be pasted into
scanner.re before compilation.
'''
import requests
from collections import defaultdict
import re
# Operate on WordBreakProperty.txt file
hebrew_letter_regex = re.compile('^([^\s]+)[\s]+; Hebrew_Letter ')
format_regex = re.compile('^([^\s]+)[\s]+; Format ')
extend_regex = re.compile('^([^\s]+)[\s]+; Extend ')
katakana_regex = re.compile('^([^\s]+)[\s]+; Katakana ')
other_alpha_letter_regex = re.compile('^([^\s]+)[\s]+; ALetter # Lo (?!.*(?:HANGUL|TIBETAN|JAVANESE|BALINESE|YI) )')
mid_letter_regex = re.compile('^([^\s]+)[\s]+; MidLetter')
mid_number_regex = re.compile('^([^\s]+)[\s]+; MidNum ')
mid_num_letter_regex = re.compile('^([^\s]+)[\s]+; MidNumLet ')
numeric_regex = re.compile('^([^\s]+)[\s]+; Numeric ')
extend_num_letter_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
# Operate on Scripts.txt file
other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
ideographic_scripts = set([
'han',
'hiragana',
'hangul',
'tibetan',
'thai',
'lao',
'javanese',
'balinese',
'yi',
])
def regex_char_range(match):
r = match.split('..')
# Wide version
return u'-'.join([('\u{}'.format(c.lower()) if len(c) < 5 else '\U{}'.format(c.lower().rjust(8, '0'))) for c in r])
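# e.g. regex_char_range('0041..005A') returns the literal text '\u0041-\u005a'
# (codepoints above 0xFFFF use the 8-digit \U form), ready to paste into a
# character class in scanner.re.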
def get_letter_range(text, *regexes):
char_ranges = []
for line in text.split('\n'):
for regex in regexes:
m = regex.match(line)
if m:
char_ranges.append(regex_char_range(m.group(1)))
return char_ranges
def get_letter_ranges_for_scripts(text, scripts, char_class_regex):
char_ranges = []
for char_range, script, char_class in script_regex.findall(text):
if script.lower() in scripts and char_class_regex.match(char_class):
char_ranges.append(regex_char_range(char_range))
return char_ranges
def get_char_class(text, char_class_regex):
char_ranges = []
for char_range, script, char_class in script_regex.findall(text):
if char_class_regex.match(char_class):
char_ranges.append(regex_char_range(char_range))
return char_ranges
hangul_syllable_type_regex = re.compile('^([^\s]+)[\s]+; ([A-Z]+)')
def get_hangul_syllable_ranges(text):
char_ranges = defaultdict(list)
for line in text.split('\n'):
m = hangul_syllable_type_regex.match(line)
if m:
char_ranges[m.group(2)].append(regex_char_range(m.group(1)))
return dict(char_ranges)
name_funcs = [
('hebrew_letter_chars', hebrew_letter_regex),
('format_chars', format_regex),
('extend_chars', extend_regex),
('katakana_chars', katakana_regex),
('letter_other_alpha_chars', other_alpha_letter_regex),
('mid_letter_chars', mid_letter_regex),
('mid_number_chars', mid_number_regex),
('mid_num_letter_chars', mid_num_letter_regex),
('numeric_chars', numeric_regex),
('extend_num_letter_chars', extend_num_letter_regex),
]
IDEOGRAPHIC_CHARS = 'ideographic_chars'
IDEOGRAPHIC_NUMERIC_CHARS = 'ideographic_numeric_chars'
numbers_regex = re.compile('N[ol]', re.I)
letters_regex = re.compile('L*', re.I)
def main():
''' Insert these lines into scanner.re '''
response = requests.get(WORD_BREAK_PROPERTIES_URL)
if response.ok:
for name, reg in name_funcs:
s = get_letter_range(response.content, reg)
print '{} = [{}];'.format(name, ''.join(s))
response = requests.get(HANGUL_SYLLABLE_TYPES_URL)
if response.ok:
syllable_ranges = get_hangul_syllable_ranges(response.content)
for name, ranges in syllable_ranges.iteritems():
print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))
response = requests.get(SCRIPTS_URL)
if response.ok:
s = ''.join(get_char_class(response.content, numbers_regex))
print '{} = [{}];'.format(IDEOGRAPHIC_NUMERIC_CHARS, ''.join(s))
s = ''.join(get_letter_ranges_for_scripts(response.content, ideographic_scripts, letters_regex))
print '{} = [{}];'.format(IDEOGRAPHIC_CHARS, ''.join(s))
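# The printed lines take the form 'name = [ranges];', e.g. (ranges shown are
# illustrative, not the actual values):
#   katakana_chars = [\u30a1-\u30fa\u31f0-\u31ff];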
if __name__ == '__main__':
main()