[unicode/tokenization] Using new character classes including wide chars in scanner

This commit is contained in:
Al
2015-09-23 00:33:14 -04:00
parent a76831df7a
commit b4593b6f88
2 changed files with 32 additions and 28 deletions

View File

@@ -30,6 +30,7 @@ from scanner import Scanner
from unicode_properties import *
from unicode_paths import CLDR_DIR
from geodata.encoding import safe_decode, safe_encode
from geodata.string_utils import NUM_CODEPOINTS, wide_unichr
CLDR_TRANSFORMS_DIR = os.path.join(CLDR_DIR, 'common', 'transforms')
@@ -150,9 +151,10 @@ def init_unicode_categories():
global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks
for i in xrange(NUM_CHARS):
unicode_categories[unicodedata.category(unichr(i))].append(unichr(i))
unicode_combining_classes[str(unicodedata.combining(unichr(i)))].append(unichr(i))
for i in xrange(NUM_CODEPOINTS):
c = wide_unichr(i)
unicode_categories[unicodedata.category(c)].append(c)
unicode_combining_classes[str(unicodedata.combining(c))].append(c)
unicode_categories = dict(unicode_categories)
unicode_combining_classes = dict(unicode_combining_classes)
@@ -165,7 +167,7 @@ def init_unicode_categories():
script_chars = get_chars_by_script()
for i, script in enumerate(script_chars):
if script:
unicode_scripts[script.lower()].append(unichr(i))
unicode_scripts[script.lower()].append(wide_unichr(i))
unicode_scripts = dict(unicode_scripts)
@@ -379,9 +381,7 @@ char_set_scanner = Scanner([
(r'[^\s]', CHARACTER),
])
NUM_CHARS = 65536
all_chars = set([unichr(i) for i in xrange(NUM_CHARS)])
all_chars = set([wide_unichr(i) for i in xrange(NUM_CODEPOINTS)])
control_chars = set([c for c in all_chars if unicodedata.category(c) in ('Cc', 'Cn', 'Cs')])
@@ -392,20 +392,22 @@ def get_transforms():
def replace_html_entity(ent):
name = ent.strip('&;')
return unichr(htmlentitydefs.name2codepoint[name])
return wide_unichr(htmlentitydefs.name2codepoint[name])
def parse_regex_char_range(regex):
prev_char = None
ranges = range_regex.findall(regex)
regex = range_regex.sub('', regex)
chars = [ord(c) for c in regex]
chars = [wide_ord(c) for c in regex]
for start, end in ranges:
start_ord = wide_ord(start)
end_ord = wide_ord(end)
if ord(end) > ord(start):
if end_ord > start_ord:
# Ranges are inclusive
chars.extend([unichr(c) for c in range(ord(start), ord(end) + 1)])
chars.extend([wide_unichr(c) for c in range(start_ord, end_ord + 1)])
return chars
@@ -554,13 +556,15 @@ def parse_regex_char_set(s, current_filter=all_chars):
elif token_class == CHARACTER and token not in control_chars:
this_group.add(token)
real_chars.add(token)
elif token_class == UNICODE_CHARACTER:
elif token_class in (UNICODE_CHARACTER, UNICODE_WIDE_CHARACTER):
token = token.decode('unicode-escape')
if token not in control_chars:
this_group.add(token)
real_chars.add(token)
elif token_class in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER):
continue
elif token_class == WIDE_CHARACTER:
if token not in control_chars:
this_group.add(token)
real_chars.add(token)
elif token_class == BRACKETED_CHARACTER:
if token.strip('{{}}') not in control_chars:
this_group.add(token)
@@ -794,11 +798,11 @@ def char_permutations(s, current_filter=all_chars, reverse=False):
add_char_type(current_chars, [token])
elif token_type == SINGLE_QUOTE:
add_char_type(current_chars, ["'"])
elif token_type == UNICODE_CHARACTER:
elif token_type in (UNICODE_CHARACTER, UNICODE_WIDE_CHARACTER):
token = token.decode('unicode-escape')
add_char_type(current_chars, [token])
elif token_type in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER):
continue
elif token_type == WIDE_CHARACTER:
add_char_type(current_chars, [token])
if in_group and last_token_group_start:
start_group = len(current_chars)
@@ -1185,12 +1189,12 @@ EXISTING_STEP = 'EXISTING_STEP'
PREPEND_STEP = 'PREPEND_STEP'
html_escapes = {'&{};'.format(name): safe_encode(unichr(value))
html_escapes = {'&{};'.format(name): safe_encode(wide_unichr(value))
for name, value in htmlentitydefs.name2codepoint.iteritems()
}
html_escapes.update({'&#{};'.format(i): safe_encode(unichr(i))
for i in xrange(NUM_CHARS)
html_escapes.update({'&#{};'.format(i): safe_encode(wide_unichr(i))
for i in xrange(NUM_CODEPOINTS)
})
# [[:Latin:] & [:Ll:]]

File diff suppressed because one or more lines are too long