[i18n] Generating Hangul syllable classes
This commit is contained in:
@@ -9,6 +9,7 @@ into scanner.re before compilation.
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from collections import defaultdict
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Operate on WordBreakProperty.txt file
|
# Operate on WordBreakProperty.txt file
|
||||||
@@ -29,6 +30,7 @@ other_number_regex = re.compile('^([^\s]+)[\s]+; ExtendNumLet ')
|
|||||||
script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
|
script_regex = re.compile('([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
|
||||||
|
|
||||||
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
|
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
|
||||||
|
# Location of the Hangul syllable type data that main() downloads and passes
# to get_hangul_syllable_ranges().
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
|
||||||
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
|
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
|
||||||
|
|
||||||
ideographic_scripts = set([
|
ideographic_scripts = set([
|
||||||
@@ -81,6 +83,18 @@ def get_char_class(text, char_class_regex):
|
|||||||
return char_ranges
|
return char_ranges
|
||||||
|
|
||||||
|
|
||||||
|
# Matches one data line of the Hangul syllable type listing. Group 1 is the
# leading run of non-whitespace characters (the field later handed to
# regex_char_range); group 2 is the run of upper-case letters that follows
# the '; ' separator. Lines starting with '#' and blank lines do not match.
# Written as a raw string so that '\s' reaches the regex engine verbatim
# instead of relying on Python passing an unknown string escape through.
hangul_syllable_type_regex = re.compile(r'^([^\s]+)[\s]+; ([A-Z]+)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_hangul_syllable_ranges(text):
    '''
    Group the entries of a Hangul syllable type listing by syllable type.

    text is split on newlines and every line is tested against
    hangul_syllable_type_regex; lines that do not match are ignored.
    For each matching line, the first captured field is converted with
    regex_char_range and appended to the list kept for the second
    captured field (the syllable type name).

    Returns a plain dict mapping each syllable type name to the list of
    converted ranges, in the order the lines appeared in text.
    '''
    ranges_by_type = defaultdict(list)

    line_matches = (hangul_syllable_type_regex.match(row)
                    for row in text.split('\n'))

    for found in line_matches:
        if not found:
            # Not a data line for this pattern; nothing to record
            continue
        code_points, syllable_type = found.group(1, 2)
        ranges_by_type[syllable_type].append(regex_char_range(code_points))

    # Hand back an ordinary dict so missing keys raise instead of
    # silently creating empty lists
    return dict(ranges_by_type)
|
||||||
|
|
||||||
|
|
||||||
name_funcs = [
|
name_funcs = [
|
||||||
('hebrew_letter_chars', hebrew_letter_regex),
|
('hebrew_letter_chars', hebrew_letter_regex),
|
||||||
('format_chars', format_regex),
|
('format_chars', format_regex),
|
||||||
@@ -110,6 +124,13 @@ def main():
|
|||||||
s = get_letter_range(response.content, reg)
|
s = get_letter_range(response.content, reg)
|
||||||
print '{} = [{}];'.format(name, ''.join(s))
|
print '{} = [{}];'.format(name, ''.join(s))
|
||||||
|
|
||||||
|
response = requests.get(HANGUL_SYLLABLE_TYPES_URL)
|
||||||
|
|
||||||
|
if response.ok:
|
||||||
|
syllable_ranges = get_hangul_syllable_ranges(response.content)
|
||||||
|
for name, ranges in syllable_ranges.iteritems():
|
||||||
|
print 'hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges))
|
||||||
|
|
||||||
response = requests.get(SCRIPTS_URL)
|
response = requests.get(SCRIPTS_URL)
|
||||||
if response.ok:
|
if response.ok:
|
||||||
s = ''.join(get_char_class(response.content, numbers_regex))
|
s = ''.join(get_char_class(response.content, numbers_regex))
|
||||||
|
|||||||
Reference in New Issue
Block a user