[transliteration] fix transliteration rules, fix escape characters, and add sizes to all strings since they may contain null characters

This commit is contained in:
Al
2015-04-26 19:45:06 -04:00
parent ff9b6735f8
commit 6ebea11640
3 changed files with 113 additions and 54 deletions
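The explicit sizes matter because the generated rule strings are emitted as C literals that can embed NUL bytes, and strlen() stops at the first '\0'. A minimal standalone C sketch of that failure mode (illustrative only, not libpostal code):

#include <stdio.h>
#include <string.h>

int main(void) {
    /* a replacement string with an embedded NUL byte */
    const char replacement[] = "ab\0cd";
    printf("strlen: %zu\n", strlen(replacement));      /* prints 2: truncated at the NUL */
    printf("actual: %zu\n", sizeof(replacement) - 1);  /* prints 5: the real byte count */
    return 0;
}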

View File

@@ -50,7 +50,10 @@ POST_CONTEXT_INDICATOR = '}'
 REVISIT_INDICATOR = '|'
 
-WORD_BOUNDARY_VAR = 'wordBoundary'
+WORD_BOUNDARY_VAR_NAME = 'wordBoundary'
+WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME)
+word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$'))
 
 EMPTY_TRANSITION = u'\u007f'
@@ -503,8 +506,28 @@ def char_permutations(s):
     return list(itertools.product(char_types)), move
 
+string_replacements = {
+    u'[': u'\[',
+    u']': u'\]',
+    u'': EMPTY_TRANSITION,
+    u'*': u'\*',
+    u'+': u'\+',
+    PLUS: u'+',
+    STAR: u'*',
+}
+
+escape_sequence_long_regex = re.compile(r'(\\x[0-9a-f]{2})([0-9a-f])', re.I)
+
+def replace_long_escape_sequence(s):
+    def replace_match(m):
+        return u'{}""{}'.format(m.group(1), m.group(2))
+
+    return escape_sequence_long_regex.sub(replace_match, s)
+
 def quote_string(s):
-    return u'"{}"'.format(s.replace('"', '\\"'))
+    return u'"{}"'.format(replace_long_escape_sequence(safe_decode(s).replace('"', '\\"')))
 
 def char_types_string(char_types):
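The new replace_long_escape_sequence works around a C lexing rule: a \x escape consumes every hexadecimal digit that follows it, so an escaped byte followed by a hex-looking character (e.g. \xc3a4) would be read as one oversized escape. Inserting "" ends the escape through adjacent-literal concatenation. A standalone sketch of the rule being worked around (illustrative only, not libpostal code):

#include <stdio.h>

int main(void) {
    /* "\xc3a4" would be a single out-of-range escape, because \x eats all
       trailing hex digits; the inserted "" ends the escape at a literal boundary. */
    const char good[] = "\xc3" "a4";
    printf("%zu bytes: %02x %c %c\n", sizeof(good) - 1,
           (unsigned char)good[0], good[1], good[2]);   /* 3 bytes: c3 a 4 */
    return 0;
}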
@@ -519,19 +542,7 @@ def char_types_string(char_types):
         template = u'{}' if len(chars) == 1 else u'[{}]'
         norm = []
         for c in chars:
-            if c == '[':
-                c = '\['
-            elif c == '':
-                c = EMPTY_TRANSITION
-            elif c == '*':
-                c = '\*'
-            elif c == '+':
-                c = '\+'
-            elif c == PLUS:
-                c = '+'
-            elif c == STAR:
-                c = '*'
+            c = string_replacements.get(c, c)
             norm.append(c)
         ret.append(template.format(u''.join(norm)))
@@ -555,48 +566,68 @@ def format_groups(char_types, groups):
 charset_regex = re.compile(r'(?<!\\)\[')
 
+def encode_string(s):
+    return safe_encode(s).encode('string-escape')
+
 def format_rule(rule):
     '''
     Creates the C literal for a given transliteration rule
     '''
-    pre_context = rule[0]
-    if not pre_context:
-        pre_context_type = CONTEXT_TYPE_NONE
-    elif charset_regex.search(pre_context):
-        pre_context_type = CONTEXT_TYPE_REGEX
-    else:
-        pre_context_type = CONTEXT_TYPE_STRING
-
-    pre_context_max_len = rule[1]
-    key = rule[2]
-
-    post_context = rule[3]
-    if not post_context:
-        post_context_type = CONTEXT_TYPE_NONE
-    elif charset_regex.search(post_context):
-        post_context_type = CONTEXT_TYPE_REGEX
-    else:
-        post_context_type = CONTEXT_TYPE_STRING
-
-    post_context_max_len = rule[4]
-    groups = rule[5]
-    replacement = rule[6]
-    move = rule[7]
+    key = rule[0]
+
+    pre_context_type = rule[1]
+    pre_context = rule[2]
+    if pre_context is None:
+        pre_context = 'NULL'
+        pre_context_len = 0
+    else:
+        pre_context_len = len(pre_context)
+        pre_context = quote_string(encode_string(pre_context))
+
+    pre_context_max_len = rule[3]
+
+    post_context_type = rule[4]
+    post_context = rule[5]
+    if post_context is None:
+        post_context = 'NULL'
+        post_context_len = 0
+    else:
+        post_context_len = len(post_context)
+        post_context = quote_string(encode_string(post_context))
+
+    post_context_max_len = rule[6]
+
+    groups = rule[7]
+    if not groups:
+        groups = 'NULL'
+        groups_len = 0
+    else:
+        groups_len = len(groups)
+        groups = quote_string(encode_string(groups))
+
+    replacement = rule[8]
+    move = rule[9]
 
     output_rule = (
-        quote_string(key),
+        quote_string(encode_string(key)),
+        str(len(key)),
         pre_context_type,
         str(pre_context_max_len),
-        u'NULL' if pre_context_type == CONTEXT_TYPE_NONE else quote_string(pre_context),
+        pre_context,
+        str(pre_context_len),
         post_context_type,
         str(post_context_max_len),
-        u'NULL' if post_context_type == CONTEXT_TYPE_NONE else quote_string(post_context),
-        quote_string(replacement),
+        post_context,
+        str(post_context_len),
+        quote_string(encode_string(replacement)),
+        str(len(replacement)),
         str(move),
-        u'NULL' if not groups else quote_string(groups),
+        groups,
+        str(groups_len),
     )
 
     return output_rule
@@ -627,6 +658,8 @@ def parse_transform_rules(xml):
         if num_found == 0:
             break
 
+    variables[WORD_BOUNDARY_VAR_NAME] = WORD_BOUNDARY_VAR
+
     for rule_type, rule in rules:
         if rule_type in (BIDIRECTIONAL_TRANSFORM, FORWARD_TRANSFORM):
             left, right = rule
@@ -639,14 +672,25 @@ def parse_transform_rules(xml):
             left_pre_context_max_len = 0
             left_post_context_max_len = 0
 
+            left_pre_context_type = CONTEXT_TYPE_NONE
+            left_post_context_type = CONTEXT_TYPE_NONE
+
             move = 0
 
             left_groups = []
             right_groups = []
 
             if left_pre_context:
-                left_pre_context, _, _ = char_permutations(left_pre_context.strip())
-                left_pre_context_max_len = len(left_pre_context or [])
-                left_pre_context = char_types_string(left_pre_context)
+                if left_pre_context.strip() == WORD_BOUNDARY_VAR:
+                    left_pre_context = None
+                    left_pre_context_type = CONTEXT_TYPE_WORD_BOUNDARY
+                else:
+                    left_pre_context, _, _ = char_permutations(left_pre_context.strip())
+                    left_pre_context_max_len = len(left_pre_context or [])
+                    left_pre_context = char_types_string(left_pre_context)
+                    if charset_regex.search(left_pre_context):
+                        left_pre_context_type = CONTEXT_TYPE_REGEX
+                    else:
+                        left_pre_context_type = CONTEXT_TYPE_STRING
 
             if left:
                 left, _, left_groups = char_permutations(left.strip())
@@ -657,16 +701,24 @@ def parse_transform_rules(xml):
                 left = char_types_string(left)
 
             if left_post_context:
-                left_post_context, _, _ = char_permutations(left_post_context.strip())
-                left_post_context_max_len = len(left_post_context or [])
-                left_post_context = char_types_string(left_post_context)
+                if left_post_context.strip() == WORD_BOUNDARY_VAR:
+                    left_post_context = None
+                    left_post_context_type = CONTEXT_TYPE_WORD_BOUNDARY
+                else:
+                    left_post_context, _, _ = char_permutations(left_post_context.strip())
+                    left_post_context_max_len = len(left_post_context or [])
+                    left_post_context = char_types_string(left_post_context)
+                    if charset_regex.search(left_post_context):
+                        left_post_context_type = CONTEXT_TYPE_REGEX
+                    else:
+                        left_post_context_type = CONTEXT_TYPE_STRING
 
             if right:
                 right, move, right_groups = char_permutations(right.strip())
                 right = char_types_string(right)
 
-            yield RULE, (left_pre_context, left_pre_context_max_len, left,
-                         left_post_context, left_post_context_max_len, left_groups, right, move)
+            yield RULE, (left, left_pre_context_type, left_pre_context, left_pre_context_max_len,
+                         left_post_context_type, left_post_context, left_post_context_max_len, left_groups, right, move)
 
         elif rule_type == PRE_TRANSFORM and '[' in rule and ']' in rule:
             continue
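CONTEXT_TYPE_WORD_BOUNDARY now bypasses the regex path for ${wordBoundary} contexts. The diff does not show how the C matcher consumes that type; conceptually it is a positional check along these lines (hypothetical sketch, ASCII-only, not libpostal's implementation):

#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical: a pre-context of type CONTEXT_TYPE_WORD_BOUNDARY matches at the
   start of the input or right after a non-alphanumeric byte. */
static bool at_word_boundary(const char *s, size_t pos) {
    return pos == 0 || !isalnum((unsigned char)s[pos - 1]);
}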
@@ -688,10 +740,10 @@ EXISTING_STEP = 'EXISTING_STEP'
 supplemental_transliterations = {
     'latin-ascii': (EXISTING_STEP, [
         # German transliterations not handled by standard NFD normalization
-        (u'"ä"', CONTEXT_TYPE_NONE, '0', 'NULL', CONTEXT_TYPE_NONE, '0', 'NULL', u'"ae"', '0', 'NULL'),
-        (u'"ö"', CONTEXT_TYPE_NONE, '0', 'NULL', CONTEXT_TYPE_NONE, '0', 'NULL', u'"oe"', '0', 'NULL'),
-        (u'"ü"', CONTEXT_TYPE_NONE, '0', 'NULL', CONTEXT_TYPE_NONE, '0', 'NULL', u'"ue"', '0', 'NULL'),
-        (u'"ß"', CONTEXT_TYPE_NONE, '0', 'NULL', CONTEXT_TYPE_NONE, '0', 'NULL', u'"ss"', '0', 'NULL'),
+        (u'"\xc3\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),  # ä => ae
+        (u'"\xc3\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),  # ö => oe
+        (u'"\xc3\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),  # ü => ue
+        (u'"\xc3\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),  # ß => ss
     ]),
 }
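Each of these tuples is rendered positionally into the transliteration_rule_source_t struct from the header diff below. A self-contained sketch of how the first entry would land in C; the enum order and struct are mirrored here only so the example compiles on its own (the real definitions live in transliteration_rule.h):

#include <stddef.h>

/* Mirror of the header's types, reproduced for a standalone example; the
   actual enum value order in transliteration_rule.h may differ. */
typedef enum {
    CONTEXT_TYPE_NONE,
    CONTEXT_TYPE_STRING,
    CONTEXT_TYPE_REGEX,
    CONTEXT_TYPE_WORD_BOUNDARY
} context_type_t;

typedef struct transliteration_rule_source {
    char *key;
    size_t key_len;
    context_type_t pre_context_type;
    size_t pre_context_max_len;
    char *pre_context;
    size_t pre_context_len;
    context_type_t post_context_type;
    size_t post_context_max_len;
    char *post_context;
    size_t post_context_len;
    char *replacement;
    size_t replacement_len;
    int move;
    char *group_regex_str;
    size_t group_regex_len;
} transliteration_rule_source_t;

/* The first supplemental entry above ("ä" => "ae") as it would appear in
   rules_source[]: every string now carries its explicit byte length. */
static transliteration_rule_source_t example_rule = {
    "\xc3\xa4", 2,
    CONTEXT_TYPE_NONE, 0, NULL, 0,
    CONTEXT_TYPE_NONE, 0, NULL, 0,
    "ae", 2,
    0,
    NULL, 0
};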
@@ -766,7 +818,7 @@ def get_all_transform_rules():
         elif step_type == STEP_TRANSFORM:
             step = (STEP_TRANSFORM, '-1', '-1', quote_string(data))
         elif step_type == STEP_UNICODE_NORMALIZATION:
-            step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', data)
+            step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', quote_string(data))
 
         all_steps.append(step)
 
     internal = int(name not in to_latin)
@@ -778,9 +830,6 @@ def get_all_transform_rules():
 transliteration_data_template = u'''#include <stdlib.h>
-#include <stdbool.h>
-
-#include "transliteration_rule.h"
 
 transliteration_rule_source_t rules_source[] = {{
 {all_rules}

Binary file not shown.

View File

@@ -13,21 +13,31 @@ typedef enum {
 typedef struct transliteration_rule_source {
     char *key;
+    size_t key_len;
     context_type_t pre_context_type;
     size_t pre_context_max_len;
     char *pre_context;
+    size_t pre_context_len;
     context_type_t post_context_type;
     size_t post_context_max_len;
     char *post_context;
+    size_t post_context_len;
     char *replacement;
+    size_t replacement_len;
     int move;
     char *group_regex_str;
+    size_t group_regex_len;
 } transliteration_rule_source_t;
 
+typedef enum {
+    STEP_RULESET,
+    STEP_TRANSFORM,
+    STEP_UNICODE_NORMALIZATION
+} step_type_t;
+
 typedef struct transliteration_step_source {
     step_type_t type;