410 lines
12 KiB
Python
410 lines
12 KiB
Python
'''
scripts.py

This code uses the latest copy of Scripts.txt from unicode.org
to generate a C file (and header) defining which script every character
belongs to.
'''
|
|
|
|
import csv
|
|
import os
|
|
import requests
|
|
import re
|
|
import sys
|
|
import tempfile
|
|
import requests
|
|
import subprocess
|
|
|
|
from cStringIO import StringIO
|
|
|
|
from collections import OrderedDict, defaultdict
|
|
|
|
from lxml import etree
|
|
|
|
from operator import itemgetter
|
|
|
|
from zipfile import ZipFile
|
|
|
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
|
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
|
|
|
from geodata.encoding import safe_encode, safe_decode
|
|
from geodata.file_utils import ensure_dir, download_file
|
|
|
|
from cldr_languages import *
|
|
from unicode_paths import UNICODE_DATA_DIR
|
|
from word_breaks import script_regex, regex_char_range
|
|
|
|
# Local paths that main() downloads the Unicode data files to, grouped by
# subdirectory of UNICODE_DATA_DIR.
SCRIPTS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'scripts')
LOCAL_SCRIPTS_FILE = os.path.join(SCRIPTS_DATA_DIR, 'Scripts.txt')

BLOCKS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'blocks')
LOCAL_BLOCKS_FILE = os.path.join(BLOCKS_DATA_DIR, 'Blocks.txt')

PROPS_DATA_DIR = os.path.join(UNICODE_DATA_DIR, 'props')
LOCAL_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'PropList.txt')
LOCAL_PROP_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyAliases.txt')
LOCAL_PROP_VALUE_ALIASES_FILE = os.path.join(PROPS_DATA_DIR, 'PropertyValueAliases.txt')
LOCAL_DERIVED_CORE_PROPS_FILE = os.path.join(PROPS_DATA_DIR, 'DerivedCoreProperties.txt')

WORD_BREAKS_DIR = os.path.join(UNICODE_DATA_DIR, 'word_breaks')
LOCAL_WORD_BREAKS_FILE = os.path.join(WORD_BREAKS_DIR, 'WordBreakProperty.txt')
|
|
|
|
# Names of the generated C header and data file written by main()
SCRIPTS_HEADER = 'unicode_script_types.h'
SCRIPTS_DATA_FILENAME = 'unicode_scripts_data.c'

# Remote sources for the Unicode data files
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
BLOCKS_URL = 'http://unicode.org/Public/UNIDATA/Blocks.txt'
PROPS_URL = 'http://unicode.org/Public/UNIDATA/PropList.txt'
PROP_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyAliases.txt'
PROP_VALUE_ALIASES_URL = 'http://unicode.org/Public/UNIDATA/PropertyValueAliases.txt'
DERIVED_CORE_PROPS_URL = 'http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt'
WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt'

ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'

# Number of entries in the per-character script table (code points 0..65535)
NUM_CHARS = 65536
|
|
|
|
# Template for the generated C header. Filled in with str.format, so literal
# C braces are doubled. Placeholders: num_chars, max_langs, script_enum.
scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
#define UNICODE_SCRIPT_TYPES_H

#include <stdlib.h>

#define NUM_CHARS {num_chars}
#define MAX_LANGS {max_langs}

typedef enum {{
    {script_enum}
    NUM_SCRIPTS
}} script_t;

#endif
'''
|
|
|
|
# Template for the generated C data file. Filled in with str.format, so
# literal C braces are doubled. Placeholders: char_scripts, script_codes,
# script_languages.
scripts_c_data_template = u'''
script_t char_scripts[] = {{
    {char_scripts}
}};

script_code_t script_codes[] = {{
    {script_codes}
}};

script_language_t script_languages[] = {{
    {script_languages}
}};
'''

# One initializer of the script_codes array: {SCRIPT_<NAME>, "<code>"}
script_code_template = '{{SCRIPT_{name}, "{code}"}}'

# One initializer of the script_languages array: {<num_langs>, <languages>}
script_language_template = '{{{num_langs}, {languages}}}'
|
|
|
|
|
|
def unicode_to_integer(u):
    '''
    Convert a hexadecimal code point as written in the Unicode data files
    (e.g. '0041') to its integer value (65).
    '''
    return int('0x{}'.format(u), 16)
|
|
|
|
|
|
def script_name_constant(i, u):
    '''
    Format one C enum entry for script name u with value i,
    e.g. (1, 'Latin') -> 'SCRIPT_LATIN = 1'.
    '''
    return u'SCRIPT_{} = {}'.format(u.upper(), i)


# Script name used for every character that has no script assigned
UNKNOWN_SCRIPT = 'Unknown'
|
|
|
|
|
|
def parse_char_range(r):
    '''
    Parse a code point field such as '0041..005A' or '00AA' into a list
    of integers.

    Endpoints written with five or more hex digits (i.e. beyond U+FFFF in
    the notation used by the Unicode data files) are dropped, so the result
    has 0, 1 or 2 elements. Callers treat a 2-element result as an
    inclusive range and a 1-element result as a single character.
    '''
    return [int(code_point, 16) for code_point in r.split('..') if len(code_point) < 5]
|
|
|
|
|
|
def get_chars_by_script():
    '''
    Read LOCAL_SCRIPTS_FILE and return a list of NUM_CHARS entries where
    index i holds the script name matched for code point i, or None if no
    line of the file covers it.
    '''
    scripts = [None] * NUM_CHARS

    # Read everything up front so the file handle is released immediately
    with open(LOCAL_SCRIPTS_FILE) as scripts_file:
        contents = scripts_file.read()

    # Lines look like:
    # 0041..005A    ; Latin # L&  [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    for char_range, script, char_class in script_regex.findall(contents):
        script_range = parse_char_range(char_range)
        if len(script_range) == 2:
            for i in xrange(script_range[0], script_range[1] + 1):
                scripts[i] = script
        elif script_range:
            scripts[script_range[0]] = script

    return scripts
|
|
|
|
|
|
# Syntax of the Unicode data files: '#' starts a comment, ';' separates fields
COMMENT_CHAR = '#'
DELIMITER_CHAR = ';'


def parse_file(f):
    '''
    Iterate over the data lines of a Unicode data file.

    f is any iterable of lines. Everything from COMMENT_CHAR onwards is
    discarded, blank lines are skipped, and each remaining line is yielded
    as a list of its DELIMITER_CHAR-separated fields with surrounding
    whitespace stripped.
    '''
    for line in f:
        line = line.split(COMMENT_CHAR)[0].strip()
        if not line:
            continue
        # str.split always returns at least one element for a non-empty line
        yield [token.strip() for token in line.split(DELIMITER_CHAR)]
|
|
|
|
|
|
def get_property_aliases():
    '''
    Read LOCAL_PROP_ALIASES_FILE and return a dict mapping each lowercased
    alias to the lowercased property name.

    On every data line the second field is taken as the property name and
    all other fields (the first, plus any after the second) as its aliases.
    '''
    aliases = {}

    with open(LOCAL_PROP_ALIASES_FILE) as prop_aliases_file:
        for line in parse_file(prop_aliases_file):
            prop = line[1].lower()
            for alias in [line[0]] + line[2:]:
                aliases[alias.lower()] = prop

    return aliases
|
|
|
|
|
|
def get_property_value_aliases():
    '''
    Read LOCAL_PROP_VALUE_ALIASES_FILE and return a dict of the form
    {lowercased property: {alias: value}}.

    The first field of each line is the property. For 'ccc' and 'gc' the
    second field is the value and the remaining fields are aliases; for every
    other property the third field is the value and the second field plus
    anything after the third are aliases.
    '''
    value_aliases = defaultdict(dict)

    with open(LOCAL_PROP_VALUE_ALIASES_FILE) as prop_value_aliases_file:
        for line in parse_file(prop_value_aliases_file):
            prop = line[0]
            if prop not in ('ccc', 'gc'):
                value = line[2]
                aliases = [line[1]] + line[3:]
            else:
                value = line[1]
                aliases = line[2:]

            for alias in aliases:
                value_aliases[prop.lower()][alias] = value

    return dict(value_aliases)
|
|
|
|
|
|
def get_unicode_blocks():
    '''
    Read LOCAL_BLOCKS_FILE and return a dict mapping each lowercased block
    name to the list of characters in that block.

    Only code points accepted by parse_char_range are included.
    '''
    blocks = defaultdict(list)

    with open(LOCAL_BLOCKS_FILE) as blocks_file:
        for line in parse_file(blocks_file):
            char_range, block = line
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                for i in xrange(char_range[0], char_range[1] + 1):
                    blocks[block.lower()].append(unichr(i))
            elif char_range:
                blocks[block.lower()].append(unichr(char_range[0]))

    return dict(blocks)
|
|
|
|
|
|
def _add_properties_from_file(path, props):
    '''Append the characters listed in the data file at path to props[lowercased property name].'''
    with open(path) as props_file:
        for line in parse_file(props_file):
            if len(line) > 2:
                # Lines with a third field carry a property *value*
                # (e.g. "094D ; InCB; Linker" in recent
                # DerivedCoreProperties.txt); they are not binary
                # properties and would break the two-field unpacking below.
                continue

            char_range, prop = line
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                for i in xrange(char_range[0], char_range[1] + 1):
                    props[prop.lower()].append(unichr(i))
            elif char_range:
                props[prop.lower()].append(unichr(char_range[0]))


def get_unicode_properties():
    '''
    Return a dict mapping each lowercased property name to the list of
    characters that have it, combining LOCAL_PROPS_FILE and
    LOCAL_DERIVED_CORE_PROPS_FILE (read in that order).

    Only code points accepted by parse_char_range are included.
    '''
    props = defaultdict(list)

    _add_properties_from_file(LOCAL_PROPS_FILE, props)
    _add_properties_from_file(LOCAL_DERIVED_CORE_PROPS_FILE, props)

    return dict(props)
|
|
|
|
|
|
def get_word_break_properties():
    '''
    Read LOCAL_WORD_BREAKS_FILE and return a dict mapping each word break
    property name (case preserved, as written in the file) to the list of
    characters that have it.

    Only code points accepted by parse_char_range are included.
    '''
    props = defaultdict(list)

    with open(LOCAL_WORD_BREAKS_FILE) as props_file:
        for line in parse_file(props_file):
            char_range, prop = line
            char_range = parse_char_range(char_range)

            if len(char_range) == 2:
                for i in xrange(char_range[0], char_range[1] + 1):
                    props[prop].append(unichr(i))
            elif char_range:
                props[prop].append(unichr(char_range[0]))

    return dict(props)
|
|
|
|
def build_master_scripts_list(chars):
    '''
    Assign a numeric id to every distinct script name in chars.

    chars is an iterable of script names; falsy entries (e.g. None) are
    ignored. Returns an OrderedDict of {script name: id} where ids start at 1
    in order of first appearance, followed by UNKNOWN_SCRIPT with id 0.
    '''
    all_scripts = OrderedDict.fromkeys(filter(bool, chars))

    # Iterate over a snapshot of the keys while the values are being assigned
    for i, script in enumerate(list(all_scripts), 1):
        all_scripts[script] = i

    # Unknown script for all characters not covered
    all_scripts[UNKNOWN_SCRIPT] = 0

    return all_scripts
|
|
|
|
|
|
def get_script_codes(all_scripts):
    '''
    Download the ISO 15924 code list and return a dict mapping each script
    code (first column) to a script name present in all_scripts.

    The name is taken from the third column. If it is not in all_scripts as
    written, the part before any '(' is tried instead, and used only if that
    name has not already been assigned to another code.

    Raises requests.HTTPError if the download fails.
    '''
    # This comes as a .zip
    script_codes_response = requests.get(ISO_15924_URL)
    # Fail here rather than trying to unzip an HTTP error page
    script_codes_response.raise_for_status()

    with ZipFile(StringIO(script_codes_response.content)) as zf:
        iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]
        iso15924_data = safe_decode(zf.read(iso15924_filename))

    # Strip out the comments, etc.
    temp_iso15924_file = u'\n'.join([line.rstrip() for line in iso15924_data.split('\n')
                                     if line.strip() and not line.strip().startswith('#')])

    script_codes_file = StringIO(safe_encode(temp_iso15924_file))

    script_codes = {}
    seen_scripts = set()

    # Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
    for row in csv.reader(script_codes_file, delimiter=';'):
        # Only the code and the name column are needed; reading them by
        # index keeps this working whatever the total number of columns is
        if len(row) < 3:
            continue
        code, name = row[0], row[2]

        if name in all_scripts:
            script_codes[code] = name
            seen_scripts.add(name)
        else:
            normalized_name = name.split('(')[0].strip()
            if normalized_name in all_scripts and normalized_name not in seen_scripts:
                script_codes[code] = normalized_name
                seen_scripts.add(normalized_name)

    return script_codes
|
|
|
|
|
|
def extract_language_scripts(xml):
    '''
    Collect the scripts listed for each language in a parsed XML document.

    For every element matching //languageData/language, the lowercased
    'type' attribute is the language code and the whitespace-separated
    'scripts' attribute lists its scripts; elements without scripts are
    skipped. Returns a defaultdict(list) of {language code: [script, ...]}.
    '''
    language_scripts = defaultdict(list)

    for lang in xml.xpath('//languageData/language'):
        language_code = lang.attrib['type'].lower()
        scripts = lang.get('scripts')
        if not scripts:
            continue
        for script in scripts.split():
            language_scripts[language_code].append(script)

    return language_scripts
|
|
|
|
|
|
def get_script_languages(script_codes):
    '''
    Return a dict mapping each script name in script_codes (a dict of
    {script code: script name}) to the list of languages written in it.

    Languages and their scripts come from the CLDR supplemental data; only
    languages that appear in the country languages file are kept. Scripts
    with no such language map to an empty list.
    '''
    # For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
    # to identify the language. We keep track of those single language scripts to inform
    # the language classifier

    with open(CLDR_SUPPLEMENTAL_DATA) as cldr_supplemental_data:
        cldr_xml = etree.parse(cldr_supplemental_data)
    language_scripts = extract_language_scripts(cldr_xml)

    country_languages_path = os.path.join(DEFAULT_LANGUAGES_DIR, COUNTRY_LANGUAGES_FILENAME)
    if not os.path.exists(country_languages_path):
        fetch_cldr_languages(DEFAULT_LANGUAGES_DIR)

    with open(country_languages_path) as country_language_file:
        country_language_reader = csv.reader(country_language_file, delimiter='\t')
        spoken_languages = set([lang for country, lang, script, pct, is_official
                                in country_language_reader])

    script_code_languages = defaultdict(list)
    for language, scripts in language_scripts.iteritems():
        if language not in spoken_languages:
            continue
        for script in scripts:
            script_code_languages[script].append(language)

    script_languages = {}

    for script_code, script_name in script_codes.iteritems():
        langs = script_code_languages.get(script_code, [])
        script_languages[script_name] = langs

    return script_languages
|
|
|
|
|
|
def main(out_dir):
    '''
    Download the Unicode data files, then write the generated C header
    (SCRIPTS_HEADER) and C data file (SCRIPTS_DATA_FILENAME) into out_dir.

    Output is a C header and data file, see templates.
    '''
    download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
    download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
    download_file(PROPS_URL, LOCAL_PROPS_FILE)
    download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
    download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
    download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
    download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)

    chars = get_chars_by_script()
    all_scripts = build_master_scripts_list(chars)
    script_codes = get_script_codes(all_scripts)

    script_languages = get_script_languages(script_codes)

    # Largest number of languages sharing one script (0 if there are none)
    max_langs = max([len(langs) for langs in script_languages.values()] or [0])

    # Every script gets an entry, even if no language is known for it
    for name in all_scripts.iterkeys():
        script_languages.setdefault(name, [])

    # (name, id) pairs ordered by id, so array positions match enum values
    scripts_by_id = sorted(all_scripts.iteritems(), key=itemgetter(1))

    # Generate C header and constants

    script_enum = u'\n    '.join(['SCRIPT_{} = {},'.format(s.upper(), i) for s, i in scripts_by_id])

    header = scripts_header_template.format(num_chars=NUM_CHARS,
                                            max_langs=max_langs,
                                            script_enum=script_enum)

    # Generate C data file

    char_scripts_data = u',\n    '.join(['SCRIPT_{}'.format((script or UNKNOWN_SCRIPT).upper())
                                         for script in chars])

    script_codes_data = u',\n    '.join([script_code_template.format(name=name.upper(), code=code)
                                         for code, name in script_codes.iteritems()])

    sorted_lang_scripts = [script_languages[s] for s, i in scripts_by_id]

    script_language_data = u',\n    '.join([
        script_language_template.format(
            num_langs=len(langs),
            languages='{{{}}}'.format(', '.join(['"{}"'.format(l) for l in langs])) if langs else 'NULL')
        for langs in sorted_lang_scripts])

    data = scripts_c_data_template.format(header_name=SCRIPTS_HEADER,
                                          char_scripts=char_scripts_data,
                                          script_codes=script_codes_data,
                                          script_languages=script_language_data)

    # Only touch the output files once everything has been generated, so a
    # failed download or parse does not leave truncated files behind
    with open(os.path.join(out_dir, SCRIPTS_HEADER), 'w') as out_header:
        out_header.write(header)

    with open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w') as out_file:
        out_file.write(data)
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: the only argument is the output directory
    if len(sys.argv) < 2:
        # Report the name the script was actually invoked with
        print('Usage: python {} out_dir'.format(os.path.basename(sys.argv[0])))
        sys.exit(1)

    main(sys.argv[1])
|