[unicode] Allowing wide chars in unicode properties

This commit is contained in:
Al
2015-09-23 00:34:07 -04:00
parent f13e9fad90
commit 13bcc35523

View File

@@ -18,6 +18,7 @@ import subprocess
from cStringIO import StringIO from cStringIO import StringIO
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from itertools import islice
from lxml import etree from lxml import etree
@@ -30,6 +31,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir, download_file from geodata.file_utils import ensure_dir, download_file
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr
from cldr_languages import * from cldr_languages import *
from download_cldr import download_cldr from download_cldr import download_cldr
@@ -65,8 +67,6 @@ WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty
ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip' ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'
NUM_CODEPOINTS = 65536
scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
#define UNICODE_SCRIPT_TYPES_H #define UNICODE_SCRIPT_TYPES_H
@@ -114,7 +114,7 @@ UNKNOWN_SCRIPT = 'Unknown'
def parse_char_range(r): def parse_char_range(r):
return [unicode_to_integer(u) for u in r.split('..') if len(u) < 5] return [unicode_to_integer(u) for u in r.split('..')]
def get_chars_by_script(): def get_chars_by_script():
@@ -194,9 +194,9 @@ def get_unicode_blocks():
if len(char_range) == 2: if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1): for i in xrange(char_range[0], char_range[1] + 1):
blocks[block.lower()].append(unichr(i)) blocks[block.lower()].append(wide_unichr(i))
elif char_range: elif char_range:
blocks[block.lower()].append(unichr(char_range[0])) blocks[block.lower()].append(wide_unichr(char_range[0]))
return dict(blocks) return dict(blocks)
@@ -213,9 +213,9 @@ def get_unicode_properties():
if len(char_range) == 2: if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1): for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(unichr(i)) props[prop.lower()].append(wide_unichr(i))
elif char_range: elif char_range:
props[prop.lower()].append(unichr(char_range[0])) props[prop.lower()].append(wide_unichr(char_range[0]))
derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE) derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE)
for line in parse_file(derived_props_file): for line in parse_file(derived_props_file):
@@ -224,9 +224,9 @@ def get_unicode_properties():
if len(char_range) == 2: if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1): for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(unichr(i)) props[prop.lower()].append(wide_unichr(i))
elif char_range: elif char_range:
props[prop.lower()].append(unichr(char_range[0])) props[prop.lower()].append(wide_unichr(char_range[0]))
return dict(props) return dict(props)
@@ -243,9 +243,9 @@ def get_word_break_properties():
if len(char_range) == 2: if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1): for i in xrange(char_range[0], char_range[1] + 1):
props[prop].append(unichr(i)) props[prop].append(wide_unichr(i))
elif char_range: elif char_range:
props[prop].append(unichr(char_range[0])) props[prop].append(wide_unichr(char_range[0]))
return dict(props) return dict(props)
@@ -340,6 +340,16 @@ def extract_language_scripts(xml):
return language_scripts return language_scripts
def batch_iter(iterable, batch_size):
    """Yield the items of ``iterable`` grouped into consecutive lists.

    Each yielded list holds up to ``batch_size`` items, in source order;
    the last list may be shorter. Nothing is yielded when ``iterable`` is
    empty or when ``batch_size`` is 0.
    """
    remaining = iter(iterable)

    def next_chunk():
        # Pull the next slice off the shared iterator; empty once exhausted.
        return list(islice(remaining, batch_size))

    # Two-argument iter() keeps calling next_chunk until it returns the
    # sentinel (an empty list), which ends the generator.
    for chunk in iter(next_chunk, []):
        yield chunk
def get_script_languages(): def get_script_languages():
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient # For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
# to identify the language. We keep track of those single language scripts to inform # to identify the language. We keep track of those single language scripts to inform
@@ -398,6 +408,10 @@ def main(out_dir):
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA): if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
download_cldr() download_cldr()
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
script_languages = get_script_languages() script_languages = get_script_languages()
max_langs = 0 max_langs = 0
@@ -420,7 +434,7 @@ def main(out_dir):
# Generate C data file # Generate C data file
char_scripts_data = u''', char_scripts_data = u''',
'''.join(['SCRIPT_{}'.format((script or UNKNOWN_SCRIPT).upper()) for script in chars]) '''.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch]) for batch in batch_iter(chars, 25)])
script_codes_data = u''', script_codes_data = u''',
'''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()]) '''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()])