[transliteration] fixing transliteration rules, fixing escape characters, adding sizes to all the strings as they may have null characters
This commit is contained in:
@@ -50,7 +50,10 @@ POST_CONTEXT_INDICATOR = '}'
|
|||||||
|
|
||||||
REVISIT_INDICATOR = '|'

# Name under which the word-boundary variable is stored in the variables dict.
WORD_BOUNDARY_VAR_NAME = 'wordBoundary'
# The variable as it appears in rule text: '$wordBoundary'.
WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME)

# re.escape produces the same pattern as the manual replace('$', '\$') did,
# without relying on '\$' being passed through as an unrecognized escape.
word_boundary_var_regex = re.compile(re.escape(WORD_BOUNDARY_VAR))

EMPTY_TRANSITION = u'\u007f'
|
||||||
|
|
||||||
@@ -503,8 +506,28 @@ def char_permutations(s):
|
|||||||
return list(itertools.product(char_types)), move
|
return list(itertools.product(char_types)), move
|
||||||
|
|
||||||
|
|
||||||
|
# Per-character substitutions applied by char_types_string.
# Backslashes are written explicitly ('\\[') instead of relying on
# unrecognized escape sequences such as '\[' being passed through unchanged;
# the resulting values are identical.
string_replacements = {
    u'[': u'\\[',
    u']': u'\\]',
    u'': EMPTY_TRANSITION,
    u'*': u'\\*',
    u'+': u'\\+',
    PLUS: u'+',
    STAR: u'*',
}
|
||||||
|
|
||||||
|
# A two-digit hex escape (\xNN) immediately followed by another hex digit.
escape_sequence_long_regex = re.compile(r'(\\x[0-9a-f]{2})([0-9a-f])', re.I)


def replace_long_escape_sequence(s):
    '''
    Inserts an empty "" between a \\xNN escape and a hex digit that
    directly follows it, splitting the string literal at that point.
    Strings without such a sequence are returned unchanged.
    '''
    def split_literal(match):
        escape, next_digit = match.groups()
        return u'{}""{}'.format(escape, next_digit)

    return escape_sequence_long_regex.sub(split_literal, s)
|
||||||
|
|
||||||
|
|
||||||
def quote_string(s):
    '''
    Returns s wrapped in double quotes, with embedded double quotes
    backslash-escaped and long hex escape sequences split via
    replace_long_escape_sequence.
    '''
    decoded = safe_decode(s)
    escaped = decoded.replace('"', '\\"')
    return u'"{}"'.format(replace_long_escape_sequence(escaped))
|
||||||
|
|
||||||
|
|
||||||
def char_types_string(char_types):
|
def char_types_string(char_types):
|
||||||
@@ -519,19 +542,7 @@ def char_types_string(char_types):
|
|||||||
template = u'{}' if len(chars) == 1 else u'[{}]'
|
template = u'{}' if len(chars) == 1 else u'[{}]'
|
||||||
norm = []
|
norm = []
|
||||||
for c in chars:
|
for c in chars:
|
||||||
if c == '[':
|
c = string_replacements.get(c, c)
|
||||||
c = '\['
|
|
||||||
elif c == '':
|
|
||||||
c = EMPTY_TRANSITION
|
|
||||||
elif c == '*':
|
|
||||||
c = '\*'
|
|
||||||
elif c == '+':
|
|
||||||
c = '\+'
|
|
||||||
elif c == PLUS:
|
|
||||||
c = '+'
|
|
||||||
elif c == STAR:
|
|
||||||
c = '*'
|
|
||||||
|
|
||||||
norm.append(c)
|
norm.append(c)
|
||||||
|
|
||||||
ret.append(template.format(u''.join(norm)))
|
ret.append(template.format(u''.join(norm)))
|
||||||
@@ -555,48 +566,68 @@ def format_groups(char_types, groups):
|
|||||||
# Matches a '[' that is not preceded by a backslash.
charset_regex = re.compile(r'(?<!\\)\[')


def encode_string(s):
    '''
    Returns safe_encode(s) run through the 'string-escape' codec.
    '''
    # presumably safe_encode yields a byte string -- confirm against its definition
    encoded = safe_encode(s)
    return encoded.encode('string-escape')
|
||||||
|
|
||||||
|
|
||||||
def _c_string_and_size(value):
    '''
    Returns (literal, size) for value: the quoted, escaped C string literal
    and the length of its encoded form, as a string.
    '''
    # Measure the encoded form rather than the unicode string so that
    # multi-byte characters are counted in bytes, not code points.
    # NOTE(review): assumes safe_encode returns the same byte string that
    # encode_string escapes -- confirm.
    return quote_string(encode_string(value)), str(len(safe_encode(value)))


def format_rule(rule):
    '''
    Creates the C literal for a given transliteration rule

    rule is indexed as:
        0 key, 1 pre_context_type, 2 pre_context, 3 pre_context_max_len,
        4 post_context_type, 5 post_context, 6 post_context_max_len,
        7 groups, 8 replacement, 9 move

    Returns a tuple of 15 fields: each string (key, contexts, replacement,
    groups) is followed by its size. A context of None and empty groups
    are emitted as NULL with size 0.
    '''
    key, key_len = _c_string_and_size(rule[0])

    pre_context_type = rule[1]
    pre_context = rule[2]
    if pre_context is None:
        pre_context, pre_context_len = 'NULL', '0'
    else:
        pre_context, pre_context_len = _c_string_and_size(pre_context)
    pre_context_max_len = rule[3]

    post_context_type = rule[4]
    post_context = rule[5]
    if post_context is None:
        post_context, post_context_len = 'NULL', '0'
    else:
        post_context, post_context_len = _c_string_and_size(post_context)
    post_context_max_len = rule[6]

    groups = rule[7]
    if not groups:
        groups, groups_len = 'NULL', '0'
    else:
        groups, groups_len = _c_string_and_size(groups)

    replacement, replacement_len = _c_string_and_size(rule[8])
    move = rule[9]

    output_rule = (
        key,
        key_len,
        pre_context_type,
        str(pre_context_max_len),
        pre_context,
        pre_context_len,

        post_context_type,
        str(post_context_max_len),
        post_context,
        post_context_len,

        replacement,
        replacement_len,
        str(move),
        groups,
        groups_len,
    )

    return output_rule
|
||||||
@@ -627,6 +658,8 @@ def parse_transform_rules(xml):
|
|||||||
if num_found == 0:
|
if num_found == 0:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
variables[WORD_BOUNDARY_VAR_NAME] = WORD_BOUNDARY_VAR
|
||||||
|
|
||||||
for rule_type, rule in rules:
|
for rule_type, rule in rules:
|
||||||
if rule_type in (BIDIRECTIONAL_TRANSFORM, FORWARD_TRANSFORM):
|
if rule_type in (BIDIRECTIONAL_TRANSFORM, FORWARD_TRANSFORM):
|
||||||
left, right = rule
|
left, right = rule
|
||||||
@@ -639,14 +672,25 @@ def parse_transform_rules(xml):
|
|||||||
left_pre_context_max_len = 0
|
left_pre_context_max_len = 0
|
||||||
left_post_context_max_len = 0
|
left_post_context_max_len = 0
|
||||||
|
|
||||||
|
left_pre_context_type = CONTEXT_TYPE_NONE
|
||||||
|
left_post_context_type = CONTEXT_TYPE_NONE
|
||||||
|
|
||||||
move = 0
|
move = 0
|
||||||
left_groups = []
|
left_groups = []
|
||||||
right_groups = []
|
right_groups = []
|
||||||
|
|
||||||
if left_pre_context:
|
if left_pre_context:
|
||||||
left_pre_context, _, _ = char_permutations(left_pre_context.strip())
|
if left_pre_context.strip() == WORD_BOUNDARY_VAR:
|
||||||
left_pre_context_max_len = len(left_pre_context or [])
|
left_pre_context = None
|
||||||
left_pre_context = char_types_string(left_pre_context)
|
left_pre_context_type = CONTEXT_TYPE_WORD_BOUNDARY
|
||||||
|
else:
|
||||||
|
left_pre_context, _, _ = char_permutations(left_pre_context.strip())
|
||||||
|
left_pre_context_max_len = len(left_pre_context or [])
|
||||||
|
left_pre_context = char_types_string(left_pre_context)
|
||||||
|
if charset_regex.search(left_pre_context):
|
||||||
|
left_pre_context_type = CONTEXT_TYPE_REGEX
|
||||||
|
else:
|
||||||
|
left_pre_context_type = CONTEXT_TYPE_STRING
|
||||||
|
|
||||||
if left:
|
if left:
|
||||||
left, _, left_groups = char_permutations(left.strip())
|
left, _, left_groups = char_permutations(left.strip())
|
||||||
@@ -657,16 +701,24 @@ def parse_transform_rules(xml):
|
|||||||
left = char_types_string(left)
|
left = char_types_string(left)
|
||||||
|
|
||||||
if left_post_context:
|
if left_post_context:
|
||||||
left_post_context, _, _ = char_permutations(left_post_context.strip())
|
if left_post_context.strip() == WORD_BOUNDARY_VAR:
|
||||||
left_post_context_max_len = len(left_post_context or [])
|
left_post_context = None
|
||||||
left_post_context = char_types_string(left_post_context)
|
left_post_context_type = CONTEXT_TYPE_WORD_BOUNDARY
|
||||||
|
else:
|
||||||
|
left_post_context, _, _ = char_permutations(left_post_context.strip())
|
||||||
|
left_post_context_max_len = len(left_post_context or [])
|
||||||
|
left_post_context = char_types_string(left_post_context)
|
||||||
|
if charset_regex.search(left_post_context):
|
||||||
|
left_post_context_type = CONTEXT_TYPE_REGEX
|
||||||
|
else:
|
||||||
|
left_post_context_type = CONTEXT_TYPE_STRING
|
||||||
|
|
||||||
if right:
|
if right:
|
||||||
right, move, right_groups = char_permutations(right.strip())
|
right, move, right_groups = char_permutations(right.strip())
|
||||||
right = char_types_string(right)
|
right = char_types_string(right)
|
||||||
|
|
||||||
yield RULE, (left_pre_context, left_pre_context_max_len, left,
|
yield RULE, (left, left_pre_context_type, left_pre_context, left_pre_context_max_len,
|
||||||
left_post_context, left_post_context_max_len, left_groups, right, move)
|
left_post_context_type, left_post_context, left_post_context_max_len, left_groups, right, move)
|
||||||
|
|
||||||
elif rule_type == PRE_TRANSFORM and '[' in rule and ']' in rule:
|
elif rule_type == PRE_TRANSFORM and '[' in rule and ']' in rule:
|
||||||
continue
|
continue
|
||||||
@@ -688,10 +740,10 @@ EXISTING_STEP = 'EXISTING_STEP'
|
|||||||
# Hand-written rules in the same 15-field layout that format_rule returns.
# The keys are spelled as escape text (backslash, 'x', two hex digits) so the
# literal carries the 2 UTF-8 bytes declared in the size field; u'\xc3\xa4'
# would instead be the two code points U+00C3 U+00A4.
# NOTE(review): assumes these strings are written out verbatim -- confirm.
supplemental_transliterations = {
    'latin-ascii': (EXISTING_STEP, [
        # German transliterations not handled by standard NFD normalization
        (u'"\\xc3\\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),  # ä => ae
        (u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),  # ö => oe
        (u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),  # ü => ue
        (u'"\\xc3\\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),  # ß => ss
    ]),
}
|
||||||
|
|
||||||
@@ -766,7 +818,7 @@ def get_all_transform_rules():
|
|||||||
elif step_type == STEP_TRANSFORM:
|
elif step_type == STEP_TRANSFORM:
|
||||||
step = (STEP_TRANSFORM, '-1', '-1', quote_string(data))
|
step = (STEP_TRANSFORM, '-1', '-1', quote_string(data))
|
||||||
elif step_type == STEP_UNICODE_NORMALIZATION:
|
elif step_type == STEP_UNICODE_NORMALIZATION:
|
||||||
step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', data)
|
step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', quote_string(data))
|
||||||
all_steps.append(step)
|
all_steps.append(step)
|
||||||
|
|
||||||
internal = int(name not in to_latin)
|
internal = int(name not in to_latin)
|
||||||
@@ -778,9 +830,6 @@ def get_all_transform_rules():
|
|||||||
|
|
||||||
|
|
||||||
transliteration_data_template = u'''#include <stdlib.h>
|
transliteration_data_template = u'''#include <stdlib.h>
|
||||||
#include <stdbool.h>
|
|
||||||
|
|
||||||
#include "transliteration_rule.h"
|
|
||||||
|
|
||||||
transliteration_rule_source_t rules_source[] = {{
|
transliteration_rule_source_t rules_source[] = {{
|
||||||
{all_rules}
|
{all_rules}
|
||||||
|
|||||||
Binary file not shown.
@@ -13,21 +13,31 @@ typedef enum {
|
|||||||
|
|
||||||
typedef struct transliteration_rule_source {
|
typedef struct transliteration_rule_source {
|
||||||
char *key;
|
char *key;
|
||||||
|
size_t key_len;
|
||||||
|
|
||||||
context_type_t pre_context_type;
|
context_type_t pre_context_type;
|
||||||
size_t pre_context_max_len;
|
size_t pre_context_max_len;
|
||||||
char *pre_context;
|
char *pre_context;
|
||||||
|
size_t pre_context_len;
|
||||||
|
|
||||||
context_type_t post_context_type;
|
context_type_t post_context_type;
|
||||||
size_t post_context_max_len;
|
size_t post_context_max_len;
|
||||||
char *post_context;
|
char *post_context;
|
||||||
|
size_t post_context_len;
|
||||||
|
|
||||||
char *replacement;
|
char *replacement;
|
||||||
|
size_t replacement_len;
|
||||||
|
|
||||||
int move;
|
int move;
|
||||||
char *group_regex_str;
|
char *group_regex_str;
|
||||||
|
size_t group_regex_len;
|
||||||
} transliteration_rule_source_t;
|
} transliteration_rule_source_t;
|
||||||
|
|
||||||
|
/* Kind of a transliteration step; stored in transliteration_step_source.type. */
typedef enum {
    STEP_RULESET,
    STEP_TRANSFORM,
    STEP_UNICODE_NORMALIZATION
} step_type_t;
|
||||||
|
|
||||||
typedef struct transliteration_step_source {
|
typedef struct transliteration_step_source {
|
||||||
step_type_t type;
|
step_type_t type;
|
||||||
|
|||||||
Reference in New Issue
Block a user