[fix] for transliteration rules, allowing the parsing of set differences and arbitrarily nested character set expressions, using a non-NUL byte for the empty transition. Adding resulting data file.
This commit is contained in:
@@ -55,7 +55,8 @@ WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME)
|
|||||||
|
|
||||||
word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$'))
|
word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$'))
|
||||||
|
|
||||||
EMPTY_TRANSITION = u'\u0000'
|
WORD_BOUNDARY_CHAR = u'\u0001'
|
||||||
|
EMPTY_TRANSITION = u'\u0004'
|
||||||
|
|
||||||
EXCLUDE_TRANSLITERATORS = set([
|
EXCLUDE_TRANSLITERATORS = set([
|
||||||
'Hangul-Latin',
|
'Hangul-Latin',
|
||||||
@@ -187,6 +188,7 @@ OPEN_GROUP = 'OPEN_GROUP'
|
|||||||
CLOSE_GROUP = 'CLOSE_GROUP'
|
CLOSE_GROUP = 'CLOSE_GROUP'
|
||||||
GROUP_REF = 'GROUP_REF'
|
GROUP_REF = 'GROUP_REF'
|
||||||
CHAR_SET = 'CHAR_SET'
|
CHAR_SET = 'CHAR_SET'
|
||||||
|
CHAR_MULTI_SET = 'CHAR_MULTI_SET'
|
||||||
CHAR_CLASS = 'CHAR_CLASS'
|
CHAR_CLASS = 'CHAR_CLASS'
|
||||||
OPTIONAL = 'OPTIONAL'
|
OPTIONAL = 'OPTIONAL'
|
||||||
CHARACTER = 'CHARACTER'
|
CHARACTER = 'CHARACTER'
|
||||||
@@ -235,21 +237,24 @@ CHAR_RANGE = 'CHAR_RANGE'
|
|||||||
WORD_BOUNDARY = 'WORD_BOUNDARY'
|
WORD_BOUNDARY = 'WORD_BOUNDARY'
|
||||||
NEGATION = 'NEGATION'
|
NEGATION = 'NEGATION'
|
||||||
INTERSECTION = 'INTERSECTION'
|
INTERSECTION = 'INTERSECTION'
|
||||||
|
DIFFERENCE = 'DIFFERENCE'
|
||||||
|
|
||||||
# Scanner for a character set (yes, a regex regex)
|
# Scanner for a character set (yes, a regex regex)
|
||||||
|
|
||||||
char_set_scanner = Scanner([
|
char_set_scanner = Scanner([
|
||||||
('^\^', NEGATION),
|
('^\^', NEGATION),
|
||||||
(r'[\\]?[^\\]\-[\\]?.', CHAR_RANGE),
|
(r'[\\]?[^\\\s]\-[\\]?[^\s]', CHAR_RANGE),
|
||||||
(r'[\\].', ESCAPED_CHARACTER),
|
(r'[\\].', ESCAPED_CHARACTER),
|
||||||
(r'\'\'', SINGLE_QUOTE),
|
(r'\'\'', SINGLE_QUOTE),
|
||||||
(r'\'.*?\'', QUOTED_STRING),
|
(r'\'.*?\'', QUOTED_STRING),
|
||||||
(':[^:]+:', CHAR_CLASS),
|
(':[^:]+:', CHAR_CLASS),
|
||||||
# Char set
|
# Char set
|
||||||
('\[[^\[\]]+\]', CHAR_SET),
|
('\[[^\[\]]+\]', CHAR_SET),
|
||||||
|
('\[.*\]', CHAR_MULTI_SET),
|
||||||
('\[', OPEN_SET),
|
('\[', OPEN_SET),
|
||||||
('\]', CLOSE_SET),
|
('\]', CLOSE_SET),
|
||||||
('&', INTERSECTION),
|
('&', INTERSECTION),
|
||||||
|
('(?<=[\s])-(?=[\s])', DIFFERENCE),
|
||||||
('\$', WORD_BOUNDARY),
|
('\$', WORD_BOUNDARY),
|
||||||
(r'[^\s]', CHARACTER),
|
(r'[^\s]', CHARACTER),
|
||||||
])
|
])
|
||||||
@@ -258,6 +263,8 @@ NUM_CHARS = 65536
|
|||||||
|
|
||||||
all_chars = set([unichr(i) for i in xrange(NUM_CHARS)])
|
all_chars = set([unichr(i) for i in xrange(NUM_CHARS)])
|
||||||
|
|
||||||
|
control_chars = set([c for c in all_chars if unicodedata.category(c) == 'Cc'])
|
||||||
|
|
||||||
|
|
||||||
def get_transforms():
|
def get_transforms():
|
||||||
return [f for f in os.listdir(CLDR_TRANSFORMS_DIR) if f.endswith('.xml')]
|
return [f for f in os.listdir(CLDR_TRANSFORMS_DIR) if f.endswith('.xml')]
|
||||||
@@ -310,9 +317,36 @@ def parse_regex_char_class(c):
|
|||||||
chars = []
|
chars = []
|
||||||
|
|
||||||
if is_negation:
|
if is_negation:
|
||||||
chars = sorted(all_chars - set(chars))
|
chars = all_chars - set(chars)
|
||||||
|
|
||||||
return chars
|
return sorted(set(chars) - control_chars)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_balanced_sets(s):
    """Split a character set expression into top-level tokens.

    Walks ``s`` one character at a time while tracking bracket depth:

    * Each outermost balanced ``[...]`` group, including any groups nested
      inside it, is yielded whole as ``(substring, CHAR_MULTI_SET)``.
    * When a character outside any bracket group is reached, the rest of
      the string from that index is handed to ``char_set_scanner`` and its
      tokens are yielded until the scanner reports a set-related token
      class (CHAR_SET, CHAR_MULTI_SET, OPEN_SET or CLOSE_SET). Scanning is
      then suppressed until the next outermost group has closed, so the
      text already scanned is not scanned again for each of its characters.

    Brackets are counted literally: a ``]`` seen at depth zero drives the
    depth negative and no group is yielded for it.

    NOTE(review): assumes ``char_set_scanner.scan`` yields
    ``(token, token_class)`` pairs -- confirm against the Scanner class.

    :param s: the character set expression to tokenize
    :return: generator of ``(token, token_class)`` tuples
    """
    open_brackets = 0
    # True once the text following the current position has been scanned
    skip = False

    for i, ch in enumerate(s):
        if ch == '[':
            if open_brackets == 0:
                # Remember where the outermost group begins
                start = i
            open_brackets += 1
        elif ch == ']':
            open_brackets -= 1
            if open_brackets == 0:
                # Outermost group closed: emit it whole and allow scanning
                # of whatever follows it
                skip = False
                yield (s[start:i + 1], CHAR_MULTI_SET)
        elif open_brackets == 0 and not skip:
            for token, token_class in char_set_scanner.scan(s[i:]):
                if token_class not in (CHAR_SET, CHAR_MULTI_SET, OPEN_SET, CLOSE_SET):
                    yield token, token_class
                else:
                    # Bracketed sets are handled by the depth counter above
                    break
            skip = True
|
||||||
|
|
||||||
|
|
||||||
def parse_regex_char_set(s):
|
def parse_regex_char_set(s):
|
||||||
@@ -330,9 +364,10 @@ def parse_regex_char_set(s):
|
|||||||
is_negation = False
|
is_negation = False
|
||||||
this_group = set()
|
this_group = set()
|
||||||
is_intersection = False
|
is_intersection = False
|
||||||
|
is_difference = False
|
||||||
is_word_boundary = False
|
is_word_boundary = False
|
||||||
|
|
||||||
for token, token_class in char_set_scanner.scan(s):
|
for token, token_class in parse_balanced_sets(s):
|
||||||
if token_class == CHAR_RANGE:
|
if token_class == CHAR_RANGE:
|
||||||
this_char_set = set(parse_regex_char_range(token))
|
this_char_set = set(parse_regex_char_range(token))
|
||||||
this_group |= this_char_set
|
this_group |= this_char_set
|
||||||
@@ -347,16 +382,21 @@ def parse_regex_char_set(s):
|
|||||||
is_negation = True
|
is_negation = True
|
||||||
elif token_class == CHAR_CLASS:
|
elif token_class == CHAR_CLASS:
|
||||||
this_group |= set(parse_regex_char_class(token))
|
this_group |= set(parse_regex_char_class(token))
|
||||||
elif token_class == CHAR_SET:
|
elif token_class in (CHAR_SET, CHAR_MULTI_SET):
|
||||||
# Recursive calls, as performance doesn't matter here and nesting is shallow
|
# Recursive calls, as performance doesn't matter here and nesting is shallow
|
||||||
this_char_set = set(parse_regex_char_set(token))
|
this_char_set = set(parse_regex_char_set(token))
|
||||||
# Shouldn't be complex set expression logic here
|
|
||||||
if is_intersection:
|
if is_intersection:
|
||||||
this_group &= this_char_set
|
this_group &= this_char_set
|
||||||
|
is_intersection = False
|
||||||
|
elif is_difference:
|
||||||
|
this_group -= this_char_set
|
||||||
|
is_difference = False
|
||||||
else:
|
else:
|
||||||
this_group |= this_char_set
|
this_group |= this_char_set
|
||||||
elif token_class == INTERSECTION:
|
elif token_class == INTERSECTION:
|
||||||
is_intersection = True
|
is_intersection = True
|
||||||
|
elif token_class == DIFFERENCE:
|
||||||
|
is_difference = True
|
||||||
elif token_class == CHARACTER:
|
elif token_class == CHARACTER:
|
||||||
this_group.add(token)
|
this_group.add(token)
|
||||||
elif token_class == WORD_BOUNDARY:
|
elif token_class == WORD_BOUNDARY:
|
||||||
@@ -365,7 +405,7 @@ def parse_regex_char_set(s):
|
|||||||
if is_negation:
|
if is_negation:
|
||||||
this_group = all_chars - this_group
|
this_group = all_chars - this_group
|
||||||
|
|
||||||
return sorted(this_group) + (['$'] if is_word_boundary else [])
|
return sorted(this_group - control_chars) + (['$'] if is_word_boundary else [])
|
||||||
|
|
||||||
|
|
||||||
for name, regex_range in unicode_property_regexes:
|
for name, regex_range in unicode_property_regexes:
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user