[unicode] Allowing wide chars in unicode properties

This commit is contained in:
Al
2015-09-23 00:34:07 -04:00
parent f13e9fad90
commit 13bcc35523

View File

@@ -18,6 +18,7 @@ import subprocess
from cStringIO import StringIO
from collections import OrderedDict, defaultdict
from itertools import islice
from lxml import etree
@@ -30,6 +31,7 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
from geodata.encoding import safe_encode, safe_decode
from geodata.file_utils import ensure_dir, download_file
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr
from cldr_languages import *
from download_cldr import download_cldr
@@ -65,8 +67,6 @@ WORD_BREAKS_URL = 'http://unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty
ISO_15924_URL = 'http://unicode.org/iso15924/iso15924.txt.zip'
NUM_CODEPOINTS = 65536
scripts_header_template = u'''#ifndef UNICODE_SCRIPT_TYPES_H
#define UNICODE_SCRIPT_TYPES_H
@@ -114,7 +114,7 @@ UNKNOWN_SCRIPT = 'Unknown'
def parse_char_range(r):
    """Convert a range field into a list of integers.

    ``r`` is split on ``'..'`` and every resulting piece is passed to
    ``unicode_to_integer``.  A field without ``'..'`` therefore yields a
    one-element list, and a ``start..end`` field yields two elements
    (callers in this file branch on ``len(char_range) == 2``).

    :param r: range field text, e.g. ``'0041..005A'`` or ``'1F600'``
              (presumably hex code points -- confirm against
              ``unicode_to_integer``).
    :return: list with one converted value per ``'..'``-separated piece.
    """
    # No length filter: endpoints longer than four characters (code points
    # above U+FFFF, assuming hex notation) must be kept rather than
    # silently discarded.
    return [unicode_to_integer(u) for u in r.split('..')]
def get_chars_by_script():
@@ -194,9 +194,9 @@ def get_unicode_blocks():
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
blocks[block.lower()].append(unichr(i))
blocks[block.lower()].append(wide_unichr(i))
elif char_range:
blocks[block.lower()].append(unichr(char_range[0]))
blocks[block.lower()].append(wide_unichr(char_range[0]))
return dict(blocks)
@@ -213,9 +213,9 @@ def get_unicode_properties():
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(unichr(i))
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(unichr(char_range[0]))
props[prop.lower()].append(wide_unichr(char_range[0]))
derived_props_file = open(LOCAL_DERIVED_CORE_PROPS_FILE)
for line in parse_file(derived_props_file):
@@ -224,9 +224,9 @@ def get_unicode_properties():
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop.lower()].append(unichr(i))
props[prop.lower()].append(wide_unichr(i))
elif char_range:
props[prop.lower()].append(unichr(char_range[0]))
props[prop.lower()].append(wide_unichr(char_range[0]))
return dict(props)
@@ -243,9 +243,9 @@ def get_word_break_properties():
if len(char_range) == 2:
for i in xrange(char_range[0], char_range[1] + 1):
props[prop].append(unichr(i))
props[prop].append(wide_unichr(i))
elif char_range:
props[prop].append(unichr(char_range[0]))
props[prop].append(wide_unichr(char_range[0]))
return dict(props)
@@ -340,6 +340,16 @@ def extract_language_scripts(xml):
return language_scripts
def batch_iter(iterable, batch_size):
    """Yield successive lists of at most ``batch_size`` items.

    Items are pulled lazily from ``iterable`` in their original order.
    Every yielded batch is a non-empty list; only the last one may hold
    fewer than ``batch_size`` items.  Iteration ends as soon as the source
    is exhausted (or immediately when ``batch_size`` is 0, since
    ``islice`` then returns nothing).

    :param iterable: any iterable or iterator to consume.
    :param batch_size: maximum number of items per yielded list.
    :return: generator of lists.
    """
    # A single shared iterator lets each islice() call resume where the
    # previous batch stopped.
    source_iter = iter(iterable)
    while True:
        batch = list(islice(source_iter, batch_size))
        if not batch:
            # Empty slice means the source is drained.
            return
        yield batch
def get_script_languages():
# For some languages (Greek, Thai, etc.), use of an unambiguous script is sufficient
# to identify the language. We keep track of those single language scripts to inform
@@ -398,6 +408,10 @@ def main(out_dir):
if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
download_cldr()
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = get_script_codes(all_scripts)
script_languages = get_script_languages()
max_langs = 0
@@ -420,7 +434,7 @@ def main(out_dir):
# Generate C data file
char_scripts_data = u''',
'''.join(['SCRIPT_{}'.format((script or UNKNOWN_SCRIPT).upper()) for script in chars])
'''.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch]) for batch in batch_iter(chars, 25)])
script_codes_data = u''',
'''.join([script_code_template.format(name=name.upper(), code=code) for code, name in script_codes.iteritems()])