[i18n] Generating Hangul syllable classes

2015-06-16 12:50:42 -04:00
parent cb2035867b
commit f04fad0e93
1 changed files with 21 additions and 0 deletions
--- a/scripts/geodata/i18n/word_breaks.py
+++ b/scripts/geodata/i18n/word_breaks.py
@@ -9,6 +9,7 @@ into scanner.re before compliation.
 '''

 import requests
+from collections import defaultdict
 import re

 # Operate on WordBreakProperty.txt file
@@ -29,6 +30,7 @@ other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
 script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')

 WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
+HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
 SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'

 ideographic_scripts = set([
@@ -81,6 +83,18 @@ def get_char_class(text, char_class_regex):
    return char_ranges


+hangul_syllable_type_regex = re.compile('^([^\s]+)[\s]+; ([A-Z]+)')  # group 1: leading whitespace-free field; group 2: uppercase type name after '; '
+
+
+def get_hangul_syllable_ranges(text):  # text: raw file contents; main() passes the HANGUL_SYLLABLE_TYPES_URL response body
+    char_ranges = defaultdict(list)  # syllable type name -> list of regex_char_range() results, in file order
+    for line in text.split('\n'):
+        m = hangul_syllable_type_regex.match(line)  # lines that do not fit the pattern are skipped
+        if m:
+            char_ranges[m.group(2)].append(regex_char_range(m.group(1)))  # convert the leading field and file it under its type
+    return dict(char_ranges)  # plain dict, so lookups of unseen types raise KeyError instead of creating entries
+
+
 name_funcs = [
    ('hebrew_letter_chars', hebrew_letter_regex),
    ('format_chars', format_regex),
@@ -110,6 +124,13 @@ def main():
            s = get_letter_range(response.content, reg)
            print '{} = [{}];'.format(name, ''.join(s))

+    response = requests.get(HANGUL_SYLLABLE_TYPES_URL)
+
+    if response.ok:
+        syllable_ranges = get_hangul_syllable_ranges(response.content)
+        for name, ranges in syllable_ranges.iteritems():
+            print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))
+
    response = requests.get(SCRIPTS_URL)
    if response.ok:
        s = ''.join(get_char_class(response.content, numbers_regex))