[tokenization] Script to generate TR-29 ranges for re2c scanner

Al
2015-04-14 15:50:36 -04:00
parent 5fa03587fb
commit 24e62b1c6c
2 changed files with 121 additions and 0 deletions


@@ -0,0 +1,121 @@
'''
word_breaks.py
This script automatically builds ranges of Unicode characters from the
Unicode spec's word break properties (TR-29). These ranges help us
build a tokenizer that does the right thing in every language with regard
to word segmentation. The lines output by this script can be pasted
into scanner.re before compilation.
'''
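# The output consists of re2c named character-class definitions, one per line.
# Illustrative (abridged) example of what a printed line looks like:
#   hebrew_letter_chars = [\u05d0-\u05ea\u05f0-\u05f2];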
import requests
import re
# Operate on WordBreakProperty.txt file
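# Lines in WordBreakProperty.txt look roughly like (spacing not exact):
#   05D0..05EA    ; Hebrew_Letter # Lo  [27] HEBREW LETTER ALEF..HEBREW LETTER TAV
# i.e. a code point or range, a word-break property, then a comment giving the
# general category and character names. group(1) of each regex below is the range.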
hebrew_letter_regex = re.compile(r'^([^\s]+)[\s]+; Hebrew_Letter ')
format_regex = re.compile(r'^([^\s]+)[\s]+; Format ')
extend_regex = re.compile(r'^([^\s]+)[\s]+; Extend ')
katakana_regex = re.compile(r'^([^\s]+)[\s]+; Katakana ')
other_alpha_letter_regex = re.compile(r'^([^\s]+)[\s]+; ALetter # Lo (?!.*(?:HANGUL|TIBETAN|JAVANESE|BALINESE|YI) )')
mid_letter_regex = re.compile(r'^([^\s]+)[\s]+; MidLetter')
mid_number_regex = re.compile(r'^([^\s]+)[\s]+; MidNum ')
mid_num_letter_regex = re.compile(r'^([^\s]+)[\s]+; MidNumLet ')
numeric_regex = re.compile(r'^([^\s]+)[\s]+; Numeric ')
extend_num_letter_regex = re.compile(r'^([^\s]+)[\s]+; ExtendNumLet ')
# Operate on Scripts.txt file
script_regex = re.compile(r'([^\s]+)[\s]+;[\s]*([^\s]+)[\s]*#[\s]*([^\s]+)')
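# Lines in Scripts.txt have a similar shape, e.g. (illustrative, spacing not exact):
#   0041..005A    ; Latin # L&  [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
# script_regex captures the range, the script name, and the general category.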
WORD_BREAK_PROPERTIES_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt'
SCRIPTS_URL = 'http://unicode.org/Public/UNIDATA/Scripts.txt'
ideographic_scripts = set([
    'han',
    'hiragana',
    'hangul',
    'tibetan',
    'thai',
    'lao',
    'javanese',
    'balinese',
    'yi',
])
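# Convert a UCD code point or range like '05D0..05EA' into a re2c-style range
# like '\u05d0-\u05ea'. Entries outside the BMP (more than four hex digits) are
# dropped, presumably because the \uXXXX escape can only express four digits.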
def regex_char_range(match):
    r = match.split('..')
    if len(r[0]) < 5 and len(r[-1]) < 5:
        return '-'.join(['\\u{}'.format(c.lower()) for c in r])
    else:
        return ''
def get_letter_range(text, *regexes):
    # Collect character ranges from WordBreakProperty.txt lines that match any
    # of the given property regexes.
    char_ranges = []
    for line in text.split('\n'):
        for regex in regexes:
            m = regex.match(line)
            if m:
                char_ranges.append(regex_char_range(m.group(1)))
    return char_ranges
def get_letter_ranges_for_scripts(text, scripts, char_class_regex):
    # Collect ranges from Scripts.txt for the given scripts, filtered by a
    # general-category regex (e.g. letters only).
    char_ranges = []
    for char_range, script, char_class in script_regex.findall(text):
        if script.lower() in scripts and char_class_regex.match(char_class):
            char_ranges.append(regex_char_range(char_range))
    return char_ranges
def get_char_class(text, char_class_regex):
    # Collect ranges from Scripts.txt for every script, filtered only by a
    # general-category regex.
    char_ranges = []
    for char_range, script, char_class in script_regex.findall(text):
        if char_class_regex.match(char_class):
            char_ranges.append(regex_char_range(char_range))
    return char_ranges
name_funcs = [
    ('hebrew_letter_chars', hebrew_letter_regex),
    ('format_chars', format_regex),
    ('extend_chars', extend_regex),
    ('katakana_chars', katakana_regex),
    ('letter_other_alpha_chars', other_alpha_letter_regex),
    ('mid_letter_chars', mid_letter_regex),
    ('mid_number_chars', mid_number_regex),
    ('mid_num_letter_chars', mid_num_letter_regex),
    ('numeric_chars', numeric_regex),
    ('extend_num_letter_chars', extend_num_letter_regex),
]
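# Each (name, regex) pair above becomes one 'name = [ranges];' definition in
# the script's output.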
IDEOGRAPHIC_CHARS = 'ideographic_chars'
IDEOGRAPHIC_NUMERIC_CHARS = 'ideographic_numeric_chars'
# General-category filters for Scripts.txt: Nl/No (letter/other numbers) and
# the letter categories (Lo, Lu, Ll, Lm, Lt).
numbers_regex = re.compile('N[ol]', re.I)
letters_regex = re.compile('L', re.I)
def main():
    ''' Print the character-class definitions to insert into scanner.re '''
    response = requests.get(WORD_BREAK_PROPERTIES_URL)
    if response.ok:
        for name, reg in name_funcs:
            s = get_letter_range(response.content, reg)
            print '{} = [{}];'.format(name, ''.join(s))
    response = requests.get(SCRIPTS_URL)
    if response.ok:
        s = ''.join(get_char_class(response.content, numbers_regex))
        print '{} = [{}];'.format(IDEOGRAPHIC_NUMERIC_CHARS, s)
        s = ''.join(get_letter_ranges_for_scripts(response.content, ideographic_scripts, letters_regex))
        print '{} = [{}];'.format(IDEOGRAPHIC_CHARS, s)

if __name__ == '__main__':
    main()