[transliteration] fixing variable assignments, handling literal wide characters (for narrow Python builds), and ignoring rules related to spaced Han
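Background (an assumption drawn from the commit message, not part of the diff): on a narrow Python 2 build, sys.maxunicode is 65535 and any code point above the BMP is stored as a UTF-16 surrogate pair, so a single astral character reaches the regex scanners as two code units. A quick REPL sketch:

    >>> import sys
    >>> sys.maxunicode  # 65535 on a narrow build, 1114111 on a wide build
    65535
    >>> s = u'\U0001F600'  # one astral code point...
    >>> len(s)             # ...held as a surrogate pair on narrow builds
    2
    >>> u'\ud800' <= s[0] <= u'\udbff' and u'\udc00' <= s[1] <= u'\udfff'
    True

This is why the diff below adds WIDE_CHARACTER tokens, strips \U escapes, and widens the control-character filter.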
@@ -53,6 +53,11 @@ REVISIT_INDICATOR = '|'
 WORD_BOUNDARY_VAR_NAME = 'wordBoundary'
 WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME)
 
+START_OF_HAN_VAR_NAME = 'startOfHanMarker'
+START_OF_HAN_VAR = '${}'.format(START_OF_HAN_VAR_NAME)
+
+start_of_han_regex = re.compile(START_OF_HAN_VAR.replace('$', '\$'))
+
 word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$'))
 
 WORD_BOUNDARY_CHAR = u'\u0001'
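As a sanity check, a minimal sketch of what the new marker lines evaluate to (same expressions as the added lines above):

    >>> START_OF_HAN_VAR = '${}'.format('startOfHanMarker')
    >>> START_OF_HAN_VAR
    '$startOfHanMarker'
    >>> import re
    >>> start_of_han_regex = re.compile(START_OF_HAN_VAR.replace('$', '\$'))
    >>> bool(start_of_han_regex.search(u'$startOfHanMarker { x'))
    True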
@@ -70,11 +75,12 @@ BEGIN_SET_CHAR = u"\x07"
 END_SET_CHAR = u"\x08"
 GROUP_INDICATOR_CHAR = u"\x09"
 
+
 EXCLUDE_TRANSLITERATORS = set([
     'hangul-latin',
     'interindic-latin',
     'jamo-latin',
-    # Don't care about spaced Han because
+    # Don't care about spaced Han because our tokenizer does it already
     'han-spacedhan',
 ])
 
@@ -197,7 +203,8 @@ all_transforms = set()
 
 pre_transform_full_regex = re.compile('::[\s]*(.*)[\s]*', re.UNICODE)
 pre_transform_regex = re.compile('[\s]*([^\s\(\)]*)[\s]*(?:\(.*\)[\s]*)?', re.UNICODE)
-transform_regex = re.compile(u"(?:[\s]*(?!=[\s])(.*)(?<![\s])[\s]*)((?:<>)|[←<→>↔=])(?:[\s]*(?!=[\s])(.*)(?<![\s])[\s]*)", re.UNICODE)
+assignment_regex = re.compile(u"(?:[\s]*(\$[^\s\=]+)[\s]*\=[\s]*(?!=[\s])(.*)(?<![\s])[\s]*)", re.UNICODE)
+transform_regex = re.compile(u"(?:[\s]*(?!=[\s])(.*?)(?<![\s])[\s]*)((?:<>)|[←<→>↔])(?:[\s]*(?!=[\s])(.*)(?<![\s])[\s]*)", re.UNICODE)
 
 quoted_string_regex = re.compile(r'\'.*?\'', re.UNICODE)
 
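A hedged illustration of the regex split (inputs are made-up rule strings, not CLDR data): assignments like `$var = value` used to be caught by transform_regex via the `=` in its operator class; with `=` removed from that class, the new assignment_regex captures the variable and its value directly:

    >>> assignment_regex.match(u'$vowel = [aeiou]').groups()
    (u'$vowel', u'[aeiou]')
    >>> transform_regex.match(u'x → y').groups()
    (u'x', u'\u2192', u'y')
    >>> transform_regex.match(u'$vowel = [aeiou]') is None
    True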
@@ -208,7 +215,8 @@ END_CHAR = ';'
 def unescape_unicode_char(m):
     return m.group(0).decode('unicode-escape')
 
-escaped_unicode_regex = re.compile(r'(?:\\u[0-9A-Fa-f]{4}|\\U[0-9A-Fa-f]{8})')
+escaped_unicode_regex = re.compile(r'\\u[0-9A-Fa-f]{4}')
+escaped_wide_unicode_regex = re.compile(r'\\U[0-9A-Fa-f]{8}')
 
 literal_space_regex = re.compile(r'(?:\\u0020|\\U00000020)')
 
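Sketch of the escape split on hypothetical rule strings: 4-hex-digit \u escapes are still decoded into literal characters, while 8-hex-digit \U escapes (astral characters a narrow build cannot hold in one char) are now removed before unescaping, per the get_raw_rules_and_variables hunk further down:

    >>> escaped_unicode_regex.sub(unescape_unicode_char, u'\\u0101 > a')
    u'\u0101 > a'
    >>> escaped_wide_unicode_regex.sub('', u'\\U00020000 > x')
    u' > x'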
@@ -259,6 +267,7 @@ CHAR_MULTI_SET = 'CHAR_MULTI_SET'
 CHAR_CLASS = 'CHAR_CLASS'
 OPTIONAL = 'OPTIONAL'
 CHARACTER = 'CHARACTER'
+WIDE_CHARACTER = 'WIDE_CHARACTER'
 REVISIT = 'REVISIT'
 REPEAT = 'REPEAT'
 LPAREN = 'LPAREN'
@@ -270,6 +279,7 @@ HTML_ENTITY = 'HTML_ENTITY'
 SINGLE_QUOTE = 'SINGLE_QUOTE'
 ESCAPED_CHARACTER = 'ESCAPED_CHARACTER'
 
+
 BEFORE_CONTEXT = '{'
 AFTER_CONTEXT = '}'
 
@@ -297,6 +307,7 @@ transform_scanner = Scanner([
     ('\)', RPAREN),
     ('\|', REVISIT),
     ('[\s]+', WHITESPACE),
+    (r'[\ud800-\udbff][\udc00-\udfff]', WIDE_CHARACTER),
     (r'[\\]?[^\s]', CHARACTER),
 ], re.UNICODE)
 
@@ -325,6 +336,7 @@ char_set_scanner = Scanner([
     ('&', INTERSECTION),
     ('(?<=[\s])-(?=[\s])', DIFFERENCE),
     ('\$', WORD_BOUNDARY),
+    (ur'[\ud800-\udbff][\udc00-\udfff]', WIDE_CHARACTER),
     (r'[^\s]', CHARACTER),
 ])
 
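The surrogate-pair pattern added to both scanners matches a lead surrogate followed by a trail surrogate, which is exactly how an astral character appears on a narrow build, so it becomes a single WIDE_CHARACTER token instead of two stray CHARACTER tokens. A standalone sketch of the pattern (narrow-build behaviour assumed):

    >>> import re
    >>> wide_char = re.compile(u'[\ud800-\udbff][\udc00-\udfff]', re.UNICODE)
    >>> wide_char.match(u'\U00020000') is not None  # True on a narrow build only
    True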
@@ -332,7 +344,7 @@ NUM_CHARS = 65536
 
 all_chars = set([unichr(i) for i in xrange(NUM_CHARS)])
 
-control_chars = set([c for c in all_chars if unicodedata.category(c) == 'Cc'])
+control_chars = set([c for c in all_chars if unicodedata.category(c) in ('Cc', 'Cn', 'Cs')])
 
 
 def get_transforms():
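The widened filter drops not just Cc controls but also surrogates (Cs) and unassigned code points (Cn), which would otherwise leak into generated character sets. For reference:

    >>> import unicodedata
    >>> unicodedata.category(u'\x07')    # BEL, a real control character
    'Cc'
    >>> unicodedata.category(u'\ud800')  # lone surrogate
    'Cs'
    >>> unicodedata.category(u'\ufdd0')  # noncharacter, reported as unassigned
    'Cn'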
@@ -465,6 +477,8 @@ def parse_regex_char_set(s, current_filter=all_chars):
     is_difference = False
     is_word_boundary = False
 
+    real_chars = set()
+
     for token, token_class in parse_balanced_sets(s):
         if token_class == CHAR_RANGE:
             this_char_set = set(parse_regex_char_range(token))
@@ -472,10 +486,15 @@ def parse_regex_char_set(s, current_filter=all_chars):
         elif token_class == ESCAPED_CHARACTER:
             token = token.strip('\\')
             this_group.add(token)
+            real_chars.add(token)
         elif token_class == SINGLE_QUOTE:
-            this_group.add("'")
+            t = "'"
+            this_group.add(t)
+            real_chars.add(t)
         elif token_class == QUOTED_STRING:
-            this_group.add(token.strip("'"))
+            t = token.strip("'")
+            this_group.add(t)
+            real_chars.add(t)
         elif token_class == NEGATION:
             is_negation = True
         elif token_class in (CHAR_CLASS, CHAR_CLASS_PCRE):
@@ -495,15 +514,18 @@ def parse_regex_char_set(s, current_filter=all_chars):
             is_intersection = True
         elif token_class == DIFFERENCE:
             is_difference = True
-        elif token_class == CHARACTER:
+        elif token_class == CHARACTER and token not in control_chars:
             this_group.add(token)
+            real_chars.add(token)
+        elif token_class == WIDE_CHARACTER:
+            continue
         elif token_class == WORD_BOUNDARY:
             is_word_boundary = True
 
     if is_negation:
         this_group = current_filter - this_group
 
-    return sorted((this_group & current_filter) - control_chars) + ([WORD_BOUNDARY_CHAR] if is_word_boundary else [])
+    return sorted((this_group & (current_filter | real_chars)) - control_chars) + ([WORD_BOUNDARY_CHAR] if is_word_boundary else [])
 
 
 for name, regex_range in unicode_property_regexes:
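Net effect of real_chars in the new return expression: characters written literally in a rule (escaped, quoted, or plain) survive even when current_filter would exclude them, while characters that only entered through ranges or classes are still clipped to the filter. A toy model of the set arithmetic (names are stand-ins for the parser's state, not module data):

    >>> current_filter = set(u'abc')
    >>> this_group = set(u'abz')   # u'z' came from a quoted literal
    >>> real_chars = set(u'z')
    >>> sorted(this_group & current_filter)                 # old: u'z' is lost
    [u'a', u'b']
    >>> sorted(this_group & (current_filter | real_chars))  # new: u'z' survives
    [u'a', u'b', u'z']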
@@ -535,6 +557,7 @@ def get_raw_rules_and_variables(xml):
 
         rule = safe_decode(rule.text.rsplit(COMMENT_CHAR)[0].strip())
         rule = literal_space_regex.sub(replace_literal_space, rule)
+        rule = escaped_wide_unicode_regex.sub('', rule)
         rule = escaped_unicode_regex.sub(unescape_unicode_char, rule)
         rule = rule.rstrip(END_CHAR).strip()
 
@@ -548,12 +571,17 @@ def get_raw_rules_and_variables(xml):
             in_compound_rule = False
             compound_rule = []
 
+        assignment = assignment_regex.match(rule)
         transform = transform_regex.match(rule)
         pre_transform = pre_transform_full_regex.match(rule)
 
         if pre_transform:
             rules.append((PRE_TRANSFORM, pre_transform.group(1)))
+        elif assignment:
+            lvalue, rvalue = assignment.groups()
+            var_name = lvalue.strip().lstrip('$')
+            rvalue = rvalue.strip()
+            variables[var_name] = rvalue
         elif transform:
             lvalue, op, rvalue = transform.groups()
             lvalue = lvalue.strip()
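End to end, an assignment rule now populates variables before the transform branch is ever consulted; a sketch with a hypothetical rule string:

    >>> variables = {}
    >>> m = assignment_regex.match(u'$consonant = [bcdfg]')
    >>> lvalue, rvalue = m.groups()
    >>> variables[lvalue.strip().lstrip('$')] = rvalue.strip()
    >>> variables
    {u'consonant': u'[bcdfg]'}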
@@ -565,11 +593,6 @@ def get_raw_rules_and_variables(xml):
                 rules.append((BIDIRECTIONAL_TRANSFORM, (lvalue, rvalue)))
             elif op in BACKWARD_TRANSFORM_OPS:
                 rules.append((BACKWARD_TRANSFORM, (lvalue, rvalue)))
-            elif op == ASSIGNMENT_OP:
-                var_name = lvalue.lstrip('$')
-                variables[var_name] = rvalue
-            else:
-                print 'non-rule', rule, get_source_and_target(xml)
 
     return rules, variables
 
@@ -626,7 +649,9 @@ def char_permutations(s, current_filter=all_chars):
             open_brackets -= 1
             current_set.append(token)
             if open_brackets == 0:
-                char_types.append(parse_regex_char_set(u''.join(current_set), current_filter=current_filter))
+                char_set = parse_regex_char_set(u''.join(current_set), current_filter=current_filter)
+                if char_set:
+                    char_types.append(char_set)
                 current_set = []
         elif token_type == QUOTED_STRING:
             token = token.strip("'")
@@ -648,7 +673,8 @@ def char_permutations(s, current_filter=all_chars):
             char_types.append([replace_html_entity(token)])
         elif token_type == CHARACTER:
             char_types.append([token])
+        elif token_type == WIDE_CHARACTER:
+            continue
         if in_group and last_token_group_start:
             start_group = len(char_types)
             last_token_group_start = False
@@ -721,7 +747,6 @@ def format_groups(char_types, groups):
         group_regex.append(char_types_string(char_types[last_end + 1:]))
     return u''.join(group_regex)
 
-
 charset_regex = re.compile(r'(?<!\\)\[')
 
 
@@ -823,18 +848,23 @@ def parse_transform_rules(xml):
             break
 
     variables[WORD_BOUNDARY_VAR_NAME] = WORD_BOUNDARY_VAR
+    variables[START_OF_HAN_VAR_NAME] = START_OF_HAN_VAR
 
     current_filter = all_chars
 
     for rule_type, rule in rules:
         if rule_type in (BIDIRECTIONAL_TRANSFORM, FORWARD_TRANSFORM):
             left, right = rule
 
             left = var_regex.sub(get_var, left)
             right = var_regex.sub(get_var, right)
 
             left_pre_context, left, left_post_context = context_regex.match(left).groups()
             right_pre_context, right, right_post_context = context_regex.match(right).groups()
+
+            if start_of_han_regex.search(left) or start_of_han_regex.search(right):
+                continue
+
             left_pre_context_max_len = 0
             left_post_context_max_len = 0
 
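Since the marker is now registered as a variable, any rule whose expanded left or right side still contains it is dropped by the new check; e.g. with a hypothetical expanded rule fragment:

    >>> bool(start_of_han_regex.search(u'$startOfHanMarker \u4e2d'))  # rule skipped
    True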
@@ -849,6 +879,9 @@ def parse_transform_rules(xml):
             if left_pre_context.strip() == WORD_BOUNDARY_VAR:
                 left_pre_context = None
                 left_pre_context_type = CONTEXT_TYPE_WORD_BOUNDARY
+            elif left_pre_context.strip() == START_OF_HAN_VAR:
+                left_pre_context = None
+                left_pre_context_type = CONTEXT_TYPE_NONE
             else:
                 left_pre_context, _, _ = char_permutations(left_pre_context.strip(), current_filter=current_filter)
                 left_pre_context_max_len = len(left_pre_context or [])
@@ -871,6 +904,9 @@ def parse_transform_rules(xml):
             if left_post_context.strip() == WORD_BOUNDARY_VAR:
                 left_post_context = None
                 left_post_context_type = CONTEXT_TYPE_WORD_BOUNDARY
+            elif left_post_context.strip() == START_OF_HAN_VAR:
+                left_post_context = None
+                left_post_context_type = CONTEXT_TYPE_NONE
             else:
                 left_post_context, _, _ = char_permutations(left_post_context.strip(), current_filter=current_filter)
                 left_post_context_max_len = len(left_post_context or [])
@@ -881,6 +917,8 @@ def parse_transform_rules(xml):
                 left_post_context_type = CONTEXT_TYPE_STRING
 
             if right:
+                if start_of_han_regex.search(right):
+                    continue
                 right, move, right_groups = char_permutations(right.strip(), current_filter=current_filter)
                 right = char_types_string(right)
 