274 lines
7.3 KiB
Python
274 lines
7.3 KiB
Python
'''
|
|
unicode_data.py
|
|
---------------
|
|
|
|
Python's unicodedata module uses an outdated spec (Unicode 5.2) and since
|
|
e.g. unicode categories are used in tokenization, we'd like to keep this
|
|
as up-to-date as possible with the latest standard.
|
|
'''
|
|
import csv
|
|
import os
|
|
import sys
|
|
from collections import defaultdict, namedtuple
|
|
|
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
|
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
|
|
|
from geodata.file_utils import download_file
|
|
from geodata.string_utils import wide_unichr, wide_ord
|
|
|
|
from unicode_properties import *
|
|
|
|
from unicode_paths import UNICODE_DATA_DIR
|
|
|
|
UNIDATA_URL = 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'
|
|
|
|
UNIDATA_DIR = os.path.join(UNICODE_DATA_DIR, 'unidata')
|
|
LOCAL_UNIDATA_FILE = os.path.join(UNIDATA_DIR, 'UnicodeData.txt')
|
|
|
|
unicode_categories = defaultdict(list)
|
|
unicode_blocks = defaultdict(list)
|
|
unicode_combining_classes = defaultdict(list)
|
|
unicode_general_categories = defaultdict(list)
|
|
unicode_scripts = defaultdict(list)
|
|
unicode_properties = {}
|
|
|
|
unicode_script_ids = {}
|
|
|
|
unicode_blocks = {}
|
|
unicode_category_aliases = {}
|
|
unicode_property_aliases = {}
|
|
unicode_property_value_aliases = {}
|
|
unicode_word_breaks = {}
|
|
|
|
|
|
# Ref: ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
|
|
UNIDATA_FIELDS = [
|
|
'code',
|
|
'name',
|
|
'category',
|
|
'combining',
|
|
'bidi_category',
|
|
'decomp_mapping',
|
|
'decimal_value',
|
|
'digit_value',
|
|
'numeric_value',
|
|
'mirrored',
|
|
'unicode_1_name',
|
|
'comment',
|
|
'upper_mapping',
|
|
'lower_mapping',
|
|
'title_mapping',
|
|
]
|
|
|
|
UnicodeDataRow = namedtuple('UnicodeDataRow', ','.join(UNIDATA_FIELDS))
|
|
|
|
|
|
def parse_unicode_data():
|
|
'''
|
|
Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS
|
|
'''
|
|
if not os.path.exists(LOCAL_UNIDATA_FILE):
|
|
download_file(UNIDATA)
|
|
unidata_file = open(LOCAL_UNIDATA_FILE)
|
|
|
|
for line in csv.reader(unidata_file, delimiter=';'):
|
|
yield UnicodeDataRow(*line)
|
|
|
|
|
|
def iter_unicode_combining_classes():
|
|
return unicode_combining_classes.iteritems()
|
|
|
|
|
|
def iter_unicode_categories():
|
|
return unicode_categories.iteritems()
|
|
|
|
|
|
def get_unicode_category(cat):
|
|
return unicode_categories[cat]
|
|
|
|
|
|
def get_unicode_combining_class(c):
|
|
return unicode_combining_classes[c]
|
|
|
|
|
|
def get_unicode_categories():
|
|
'''
|
|
Build dict of unicode categories e.g.
|
|
|
|
{
|
|
'Lu': ['A', 'B', 'C', ...]
|
|
'Ll': ['a', 'b', 'c', ...]
|
|
}
|
|
'''
|
|
categories = defaultdict(list)
|
|
for row in parse_unicode_data():
|
|
categories[row.category].append(wide_unichr(unicode_to_integer(row.code)))
|
|
return dict(categories)
|
|
|
|
|
|
def get_unicode_combining_classes():
|
|
'''
|
|
Build dict of unicode combining classes e.g.
|
|
|
|
{
|
|
'0': ['\x00', '\x01', \x02', ...]
|
|
}
|
|
'''
|
|
combining_classes = defaultdict(list)
|
|
for row in parse_unicode_data():
|
|
combining_classes[int(row.combining)].append(wide_unichr(unicode_to_integer(row.code)))
|
|
return dict(combining_classes)
|
|
|
|
unicode_category_aliases = {
|
|
'letter': 'L',
|
|
'lower': 'Ll',
|
|
'lowercase': 'Ll',
|
|
'lowercaseletter': 'Ll',
|
|
'upper': 'Lu',
|
|
'uppercase': 'Lu',
|
|
'uppercaseletter': 'Lu',
|
|
'title': 'Lt',
|
|
'nonspacing mark': 'Mn',
|
|
'mark': 'M',
|
|
}
|
|
|
|
COMBINING_CLASS_PROP = 'canonical_combining_class'
|
|
BLOCK_PROP = 'block'
|
|
GENERAL_CATEGORY_PROP = 'general_category'
|
|
SCRIPT_PROP = 'script'
|
|
WORD_BREAK_PROP = 'word_break'
|
|
|
|
|
|
def init_unicode_categories():
|
|
'''
|
|
Initialize module-level dictionaries
|
|
'''
|
|
global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
|
|
global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
|
|
global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks
|
|
|
|
unicode_categories.update(get_unicode_categories())
|
|
unicode_combining_classes.update(get_unicode_combining_classes())
|
|
|
|
for key in unicode_categories.keys():
|
|
unicode_general_categories[key[0]].extend(unicode_categories[key])
|
|
|
|
script_chars = get_chars_by_script()
|
|
for i, script in enumerate(script_chars):
|
|
if script:
|
|
unicode_scripts[script.lower()].append(wide_unichr(i))
|
|
|
|
unicode_scripts = dict(unicode_scripts)
|
|
|
|
unicode_script_ids.update(build_master_scripts_list(script_chars))
|
|
|
|
unicode_blocks.update(get_unicode_blocks())
|
|
unicode_properties.update(get_unicode_properties())
|
|
unicode_property_aliases.update(get_property_aliases())
|
|
|
|
unicode_word_breaks.update(get_word_break_properties())
|
|
|
|
for key, value in get_property_value_aliases().iteritems():
|
|
key = unicode_property_aliases.get(key, key)
|
|
if key == GENERAL_CATEGORY_PROP:
|
|
for k, v in value.iteritems():
|
|
k = k.lower()
|
|
unicode_category_aliases[k] = v
|
|
if '_' in k:
|
|
unicode_category_aliases[k.replace('_', '')] = v
|
|
|
|
unicode_property_value_aliases[key] = value
|
|
|
|
|
|
regex_chars = re.compile('([\[\]\{\}\-\^])')
|
|
|
|
|
|
def replace_regex_chars(s):
|
|
return regex_chars.sub(r'\\\1', s)
|
|
|
|
|
|
def format_regex_char(i):
|
|
c = wide_unichr(i)
|
|
return replace_regex_chars(c.encode('unicode-escape'))
|
|
|
|
|
|
def make_char_set_regex(chars):
|
|
'''
|
|
Build a regex character set from a list of characters
|
|
'''
|
|
group_start = None
|
|
group_end = None
|
|
last_ord = -2
|
|
|
|
ords = map(wide_ord, chars)
|
|
ords.sort()
|
|
|
|
ords.append(None)
|
|
|
|
groups = []
|
|
|
|
for i, o in enumerate(ords):
|
|
if o is not None and o == last_ord + 1:
|
|
group_end = o
|
|
elif group_start is not None and group_end is not None:
|
|
groups.append('-'.join((format_regex_char(group_start), format_regex_char(group_end))))
|
|
group_end = None
|
|
group_start = o
|
|
elif group_start is not None and group_end is None:
|
|
groups.append(format_regex_char(group_start))
|
|
group_start = o
|
|
else:
|
|
group_start = o
|
|
|
|
last_ord = o
|
|
|
|
return u'[{}]'.format(u''.join(groups))
|
|
|
|
|
|
name_category = [
|
|
('control_chars', 'Cc'),
|
|
('other_format_chars', 'Cf'),
|
|
('other_not_assigned_chars', 'Cn'),
|
|
('other_private_use_chars', 'Co'),
|
|
('other_surrogate_chars', 'Cs'),
|
|
('letter_lower_chars', 'Ll'),
|
|
('letter_modifier_chars', 'Lm'),
|
|
('letter_other_chars', 'Lo'),
|
|
('letter_title_chars', 'Lt'),
|
|
('letter_upper_chars', 'Lu'),
|
|
('mark_spacing_combining_chars', 'Mc'),
|
|
('mark_enclosing_chars', 'Me'),
|
|
('mark_nonspacing_chars', 'Mn'),
|
|
('number_or_digit_chars', 'Nd'),
|
|
('number_letter_chars', 'Nl'),
|
|
('number_other_chars', 'No'),
|
|
('punct_connector_chars', 'Pc'),
|
|
('punct_dash_chars', 'Pd'),
|
|
('punct_close_chars', 'Pe'),
|
|
('punct_final_quote_chars', 'Pf'),
|
|
('punct_initial_quote_chars', 'Pi'),
|
|
('punct_other_chars', 'Po'),
|
|
('punct_open_chars', 'Ps'),
|
|
('currency_symbol_chars', 'Sc'),
|
|
('symbol_modifier_chars', 'Sk'),
|
|
('symbol_math_chars', 'Sm'),
|
|
('symbol_other_chars', 'So'),
|
|
('separator_line_chars', 'Zl'),
|
|
('separator_paragraph_chars', 'Zp'),
|
|
('space', 'Zs'),
|
|
]
|
|
|
|
|
|
def main():
|
|
init_unicode_categories()
|
|
for name, cat in name_category:
|
|
if cat not in unicode_categories:
|
|
continue
|
|
chars = unicode_categories[cat]
|
|
print u'{} = {};'.format(name, make_char_set_regex(chars))
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|