[fix] for transliteration rules, allowing the parsing of set differences and arbitrarily nested character set expressions, using a non-NUL byte for the empty transition. Adding the resulting data file.

Al
2015-05-08 17:14:22 -04:00
parent 10ebaf147a
commit 2a69488f9b
2 changed files with 100 additions and 60 deletions
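
For context, a minimal standalone sketch (not part of this commit; Python 2 to match the repo's unichr/xrange usage) of the set-difference semantics the new DIFFERENCE token is meant to capture: an expression such as [[a-z] - [aeiou]] should reduce to the lowercase ASCII consonants, which is what the this_group -= this_char_set branch in the diff below computes.

# Illustrative only: mirrors the DIFFERENCE handling added to parse_regex_char_set,
# where a nested set following ' - ' is subtracted from the group built so far.
lower = set(unichr(i) for i in xrange(ord('a'), ord('z') + 1))
vowels = set(u'aeiou')
consonants = sorted(lower - vowels)
print u''.join(consonants)  # expected output: bcdfghjklmnpqrstvwxyz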


@@ -55,7 +55,8 @@ WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME)
word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$'))
EMPTY_TRANSITION = u'\u0000'
WORD_BOUNDARY_CHAR = u'\u0001'
EMPTY_TRANSITION = u'\u0004'
EXCLUDE_TRANSLITERATORS = set([
    'Hangul-Latin',
@@ -187,6 +188,7 @@ OPEN_GROUP = 'OPEN_GROUP'
CLOSE_GROUP = 'CLOSE_GROUP'
GROUP_REF = 'GROUP_REF'
CHAR_SET = 'CHAR_SET'
CHAR_MULTI_SET = 'CHAR_MULTI_SET'
CHAR_CLASS = 'CHAR_CLASS'
OPTIONAL = 'OPTIONAL'
CHARACTER = 'CHARACTER'
@@ -235,21 +237,24 @@ CHAR_RANGE = 'CHAR_RANGE'
WORD_BOUNDARY = 'WORD_BOUNDARY'
NEGATION = 'NEGATION'
INTERSECTION = 'INTERSECTION'
DIFFERENCE = 'DIFFERENCE'
# Scanner for a character set (yes, a regex regex)
char_set_scanner = Scanner([
    ('^\^', NEGATION),
    (r'[\\]?[^\\]\-[\\]?.', CHAR_RANGE),
    (r'[\\]?[^\\\s]\-[\\]?[^\s]', CHAR_RANGE),
    (r'[\\].', ESCAPED_CHARACTER),
    (r'\'\'', SINGLE_QUOTE),
    (r'\'.*?\'', QUOTED_STRING),
    (':[^:]+:', CHAR_CLASS),
    # Char set
    ('\[[^\[\]]+\]', CHAR_SET),
    ('\[.*\]', CHAR_MULTI_SET),
    ('\[', OPEN_SET),
    ('\]', CLOSE_SET),
    ('&', INTERSECTION),
    ('(?<=[\s])-(?=[\s])', DIFFERENCE),
    ('\$', WORD_BOUNDARY),
    (r'[^\s]', CHARACTER),
])
@@ -258,6 +263,8 @@ NUM_CHARS = 65536
all_chars = set([unichr(i) for i in xrange(NUM_CHARS)])
control_chars = set([c for c in all_chars if unicodedata.category(c) == 'Cc'])
def get_transforms():
    return [f for f in os.listdir(CLDR_TRANSFORMS_DIR) if f.endswith('.xml')]
@@ -310,9 +317,36 @@ def parse_regex_char_class(c):
    chars = []
    if is_negation:
        chars = sorted(all_chars - set(chars))
        chars = all_chars - set(chars)
    return chars
    return sorted(set(chars) - control_chars)
def parse_balanced_sets(s):
    open_brackets = 0
    max_nesting = 0
    skip = False
    for i, ch in enumerate(s):
        if ch == '[':
            if open_brackets == 0:
                start = i
            max_nesting = max(max_nesting, open_brackets + 1)
            open_brackets += 1
        elif ch == ']':
            open_brackets -= 1
            if open_brackets == 0:
                skip = False
                yield (s[start:i + 1], CHAR_MULTI_SET)
                start = i + 1
        elif open_brackets == 0 and not skip:
            for token, token_class in char_set_scanner.scan(s[i:]):
                if token_class not in (CHAR_SET, CHAR_MULTI_SET, OPEN_SET, CLOSE_SET):
                    yield token, token_class
                else:
                    break
            skip = True
def parse_regex_char_set(s):
@@ -330,9 +364,10 @@ def parse_regex_char_set(s):
    is_negation = False
    this_group = set()
    is_intersection = False
    is_difference = False
    is_word_boundary = False
    for token, token_class in char_set_scanner.scan(s):
    for token, token_class in parse_balanced_sets(s):
        if token_class == CHAR_RANGE:
            this_char_set = set(parse_regex_char_range(token))
            this_group |= this_char_set
@@ -347,16 +382,21 @@ def parse_regex_char_set(s):
            is_negation = True
        elif token_class == CHAR_CLASS:
            this_group |= set(parse_regex_char_class(token))
        elif token_class == CHAR_SET:
        elif token_class in (CHAR_SET, CHAR_MULTI_SET):
            # Recursive calls, as performance doesn't matter here and nesting is shallow
            this_char_set = set(parse_regex_char_set(token))
            # Shouldn't be complex set expression logic here
            if is_intersection:
                this_group &= this_char_set
                is_intersection = False
            elif is_difference:
                this_group -= this_char_set
                is_difference = False
            else:
                this_group |= this_char_set
        elif token_class == INTERSECTION:
            is_intersection = True
        elif token_class == DIFFERENCE:
            is_difference = True
        elif token_class == CHARACTER:
            this_group.add(token)
        elif token_class == WORD_BOUNDARY:
@@ -365,7 +405,7 @@ def parse_regex_char_set(s):
    if is_negation:
        this_group = all_chars - this_group
    return sorted(this_group) + (['$'] if is_word_boundary else [])
    return sorted(this_group - control_chars) + (['$'] if is_word_boundary else [])
for name, regex_range in unicode_property_regexes: