[i18n] Generating Hangul syllable classes

This commit is contained in:
Al
2015-06-16 12:50:42 -04:00
parent cb2035867b
commit f04fad0e93

View File

@@ -9,6 +9,7 @@ into scanner.re before compilation.
'''
import requests
from collections import defaultdict
import re
# Operate on WordBreakProperty.txt file
@@ -29,6 +30,7 @@ other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
# Captures three whitespace-free tokens from a line: the token before ';',
# the token after ';', and the first token after '#'.
script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
# Locations of the Unicode data files this script reads.
# Word break property assignments (WordBreakProperty.txt).
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
# Hangul syllable type assignments (HangulSyllableType.txt).
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
# Script assignments (Scripts.txt).
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
ideographic_scripts = set([
@@ -81,6 +83,18 @@ def get_char_class(text, char_class_regex):
return char_ranges
# Matches a data line of the form "<code points>   ; <TYPE>": group 1 is the
# leading whitespace-free field, group 2 the upper-case type name after "; ".
# Raw string so that "\s" reaches the regex engine instead of being treated
# as an (invalid) string escape sequence.
hangul_syllable_type_regex = re.compile(r'^([^\s]+)[\s]+; ([A-Z]+)')


def get_hangul_syllable_ranges(text):
    '''
    Group the character ranges listed in text by Hangul syllable type.

    Each line of text is matched against hangul_syllable_type_regex; lines
    that do not match (comments, blank lines) are ignored. For a matching
    line the first field is passed through regex_char_range and the result
    is appended to the list for that line's type name.

    NOTE(review): regex_char_range is defined elsewhere in this file; it
    presumably returns a character-class fragment for the scanner -- confirm.

    :param text: contents of a HangulSyllableType.txt-style file
    :return: plain dict mapping type name -> list of converted ranges, in
             the order the lines appear in text
    '''
    char_ranges = defaultdict(list)
    for line in text.split('\n'):
        m = hangul_syllable_type_regex.match(line)
        if m:
            code_points, syllable_type = m.groups()
            char_ranges[syllable_type].append(regex_char_range(code_points))
    # Hand back a plain dict so later lookups of unknown types raise
    # KeyError rather than silently creating empty entries.
    return dict(char_ranges)
name_funcs = [
('hebrew_letter_chars', hebrew_letter_regex),
('format_chars', format_regex),
@@ -110,6 +124,13 @@ def main():
s = get_letter_range(response.content, reg)
print '{} = [{}];'.format(name, ''.join(s))
response = requests.get(HANGUL_SYLLABLE_TYPES_URL)
if response.ok:
syllable_ranges = get_hangul_syllable_ranges(response.content)
for name, ranges in syllable_ranges.iteritems():
print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))
response = requests.get(SCRIPTS_URL)
if response.ok:
s = ''.join(get_char_class(response.content, numbers_regex))