141 lines
4.6 KiB
Python
141 lines
4.6 KiB
Python
'''
|
|
word_breaks.py
|
|
|
|
This script is used to automatically build ranges of unicode characters
|
|
from the unicode spec's word break properties. These ranges help us
|
|
build a tokenizer that does the right thing in every language with regard
|
|
to word segmentation. The lines outputted by this script can be pasted
|
|
into scanner.re before compilation.
|
|
'''
|
|
|
|
import requests
|
|
from collections import defaultdict
|
|
import re
|
|
|
|
# Operate on WordBreakProperty.txt file
#
# Each pattern below is anchored at the start of a line and captures the
# leading non-whitespace field (a code point "XXXX" or a range "XXXX..YYYY")
# when the text after the "; " separator starts with the given property name.
# Raw strings are used so that "\s" reaches the regex engine untouched instead
# of being treated as an (invalid) string escape sequence.
hebrew_letter_regex = re.compile(r'^([^\s]+)[\s]+; Hebrew_Letter ')
format_regex = re.compile(r'^([^\s]+)[\s]+; Format ')
extend_regex = re.compile(r'^([^\s]+)[\s]+; Extend ')
katakana_regex = re.compile(r'^([^\s]+)[\s]+; Katakana ')
# ALetter lines annotated "# Lo"; the negative lookahead rejects lines whose
# remaining text contains one of the listed script names followed by a space.
other_alpha_letter_regex = re.compile(r'^([^\s]+)[\s]+; ALetter # Lo (?!.*(?:HANGUL|TIBETAN|JAVANESE|BALINESE|YI) )')
mid_letter_regex = re.compile(r'^([^\s]+)[\s]+; MidLetter')
mid_number_regex = re.compile(r'^([^\s]+)[\s]+; MidNum ')
mid_num_letter_regex = re.compile(r'^([^\s]+)[\s]+; MidNumLet ')
numeric_regex = re.compile(r'^([^\s]+)[\s]+; Numeric ')
extend_num_letter_regex = re.compile(r'^([^\s]+)[\s]+; ExtendNumLet ')

# Operate on Scripts.txt file
# NOTE(review): this pattern is identical to extend_num_letter_regex and is
# not referenced anywhere in the visible code -- possibly a copy/paste
# leftover; confirm before relying on it.
other_number_regex = re.compile(r'^([^\s]+)[\s]+; ExtendNumLet ')

# Unanchored; captures three fields from "<range> ; <field> # <field>".
# The functions that use it unpack them as (char_range, script, char_class).
script_regex = re.compile(r'([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
|
|
|
|
# Locations of the three Unicode data files downloaded by main().
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
HANGUL_SYLLABLE_TYPES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/HangulSyllableType.txt'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
|
|
|
|
# Lower-cased script names whose ranges are collected into the
# "ideographic" character class; compared against script.lower().
ideographic_scripts = {
    'han',
    'hiragana',
    'hangul',
    'tibetan',
    'thai',
    'lao',
    'javanese',
    'balinese',
    'yi',
}
|
|
|
|
|
|
def regex_char_range(match):
    ''' Convert a code point or code point range into escaped range text.

    match is a string of hex digits, either a single code point ("05D0")
    or two code points joined by ".." ("05D0..05EA").

    Returns the lower-cased code points written as a literal backslash
    escape -- "\\uXXXX" for up to four hex digits, "\\UXXXXXXXX" (zero padded
    to eight digits) otherwise -- joined by "-" when there are two of them,
    e.g. "\\u05d0-\\u05ea".
    '''
    escaped = []
    for code_point in match.split('..'):
        hex_digits = code_point.lower()
        # The backslash is escaped explicitly: a bare '\u' inside a normal
        # string literal is a syntax error on Python 3, while '\\u' yields
        # the same two characters on both Python 2 and Python 3.
        if len(hex_digits) < 5:
            escaped.append('\\u{}'.format(hex_digits.rjust(4, '0')))
        else:
            # Wide version
            escaped.append('\\U{}'.format(hex_digits.rjust(8, '0')))
    return u'-'.join(escaped)
|
|
|
|
|
|
def get_letter_range(text, *regexes):
    ''' Collect escaped ranges for every line of text matched by a regex.

    text is split on newlines and each line is tried against every
    pattern in regexes, in order.  For each successful match, capture
    group 1 is passed through regex_char_range and the result appended,
    so a line matched by several patterns contributes several entries.
    Returns the list of converted ranges.
    '''
    return [
        regex_char_range(found.group(1))
        for row in text.split('\n')
        for found in (pattern.match(row) for pattern in regexes)
        if found
    ]
|
|
|
|
|
|
def get_letter_ranges_for_scripts(text, scripts, char_class_regex):
    ''' Collect escaped ranges for entries belonging to the given scripts.

    Every (range, script, class) triple that script_regex finds in text is
    kept when the lower-cased script is in scripts and char_class_regex
    matches the start of the class field.  Returns the kept ranges,
    converted with regex_char_range, in the order they were found.
    '''
    selected = []
    for code_points, script_name, category in script_regex.findall(text):
        if script_name.lower() not in scripts:
            continue
        if not char_class_regex.match(category):
            continue
        selected.append(regex_char_range(code_points))
    return selected
|
|
|
|
|
|
def get_char_class(text, char_class_regex):
    ''' Collect escaped ranges for entries whose class matches a pattern.

    Every (range, script, class) triple that script_regex finds in text is
    kept when char_class_regex matches the start of the class field; the
    script field is ignored.  Returns the kept ranges, converted with
    regex_char_range, in the order they were found.
    '''
    return [
        regex_char_range(code_points)
        for code_points, _, category in script_regex.findall(text)
        if char_class_regex.match(category)
    ]
|
|
|
|
|
|
# Captures the leading code point field and the run of upper-case letters
# that follows the "; " separator.  Written as a raw string so that "\s" is
# passed to the regex engine rather than parsed as a string escape.
hangul_syllable_type_regex = re.compile(r'^([^\s]+)[\s]+; ([A-Z]+)')


def get_hangul_syllable_ranges(text):
    ''' Group escaped code point ranges by the type named on each line.

    text is split on newlines; every line matched by
    hangul_syllable_type_regex contributes its first field, converted with
    regex_char_range, to the list stored under its second field.  Lines
    that do not match (comments, blank lines) are skipped.

    Returns a plain dict mapping type name to a list of range strings, in
    the order the lines appeared.
    '''
    ranges_by_type = defaultdict(list)
    for row in text.split('\n'):
        found = hangul_syllable_type_regex.match(row)
        if not found:
            continue
        code_points, syllable_type = found.groups()
        ranges_by_type[syllable_type].append(regex_char_range(code_points))
    # Hand back an ordinary dict so callers do not create keys by accident.
    return dict(ranges_by_type)
|
|
|
|
|
|
# (output name, line pattern) pairs.  main() emits one "<name> = [...];"
# line per entry, in this order.
name_funcs = [
    ('hebrew_letter_chars', hebrew_letter_regex),
    ('format_chars', format_regex),
    ('extend_chars', extend_regex),
    ('katakana_chars', katakana_regex),
    ('letter_other_alpha_chars', other_alpha_letter_regex),
    ('mid_letter_chars', mid_letter_regex),
    ('mid_number_chars', mid_number_regex),
    ('mid_num_letter_chars', mid_num_letter_regex),
    ('numeric_chars', numeric_regex),
    ('extend_num_letter_chars', extend_num_letter_regex),
]
|
|
|
|
# Names used on the left-hand side of the two lines built from Scripts.txt.
IDEOGRAPHIC_CHARS = 'ideographic_chars'
IDEOGRAPHIC_NUMERIC_CHARS = 'ideographic_numeric_chars'

# Case-insensitive match for a class field starting with "No" or "Nl".
numbers_regex = re.compile('N[ol]', re.IGNORECASE)
# NOTE(review): 'L*' means "zero or more L", so .match() succeeds on every
# class field, not only those beginning with "L".  If only letter classes
# were intended this would need to be e.g. 'L'; left unchanged because
# altering it changes the generated ranges -- confirm intent.
letters_regex = re.compile('L*', re.IGNORECASE)
|
|
|
|
|
|
def _fetch_text(url):
    ''' Download url and return its decoded body, or None if the response is not OK '''
    import sys

    response = requests.get(url)
    if not response.ok:
        # The section is still skipped, as before, but no longer silently:
        # the generated output is incomplete and the user should know.
        sys.stderr.write('warning: skipping {} (HTTP {})\n'.format(url, response.status_code))
        return None
    # .text is decoded text on both Python 2 and 3; .content is bytes on
    # Python 3 and cannot be split on '\n' or searched with str patterns.
    return response.text


def main():
    ''' Insert these lines into scanner.re

    Downloads the word break property, Hangul syllable type and scripts
    files in turn and prints one "<name> = [<ranges>];" line per character
    class to stdout.  A file whose download fails is skipped with a warning
    on stderr; the remaining files are still processed.
    '''
    word_break_text = _fetch_text(WORD_BREAK_PROPERTIES_URL)
    if word_break_text is not None:
        for name, reg in name_funcs:
            s = get_letter_range(word_break_text, reg)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('{} = [{}];'.format(name, ''.join(s)))

    hangul_text = _fetch_text(HANGUL_SYLLABLE_TYPES_URL)
    if hangul_text is not None:
        syllable_ranges = get_hangul_syllable_ranges(hangul_text)
        # Sorted so the generated lines come out in the same order each run.
        for name, ranges in sorted(syllable_ranges.items()):
            print('hangul_syllable_class_{} = [{}];'.format(name, u''.join(ranges)))

    scripts_text = _fetch_text(SCRIPTS_URL)
    if scripts_text is not None:
        # get_char_class does not filter by script, so this class covers
        # matching number ranges from every script in the file.
        s = ''.join(get_char_class(scripts_text, numbers_regex))
        print('{} = [{}];'.format(IDEOGRAPHIC_NUMERIC_CHARS, s))

        s = ''.join(get_letter_ranges_for_scripts(scripts_text, ideographic_scripts, letters_regex))
        print('{} = [{}];'.format(IDEOGRAPHIC_CHARS, s))


if __name__ == '__main__':
    main()
|