[transliteration] fixing transliteration rules, fixing escape characters, adding sizes to all the strings as they may have null characters

2015-04-26 19:45:06 -04:00
parent ff9b6735f8
commit 6ebea11640
3 changed files with 113 additions and 54 deletions
--- a/scripts/geodata/i18n/transliteration_rules.py
+++ b/scripts/geodata/i18n/transliteration_rules.py
@@ -50,7 +50,10 @@ POST_CONTEXT_INDICATOR = '}'
 REVISIT_INDICATOR = '|'
-WORD_BOUNDARY_VAR = 'wordBoundary'
+WORD_BOUNDARY_VAR_NAME = 'wordBoundary'
 WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME)
 word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$'))
 EMPTY_TRANSITION = u'\u007f'
@@ -503,8 +506,28 @@ def char_permutations(s):
    return list(itertools.product(char_types)), move
 string_replacements = {
    u'[': u'\[',
    u']': u'\]',
    u'': EMPTY_TRANSITION,
    u'*': u'\*',
    u'+': u'\+',
    PLUS: u'+',
    STAR: u'*',
 }
 escape_sequence_long_regex = re.compile(r'(\\x[0-9a-f]{2})([0-9a-f])', re.I)
 def replace_long_escape_sequence(s):
    def replace_match(m):
        return u'{}""{}'.format(m.group(1), m.group(2))
    return escape_sequence_long_regex.sub(replace_match, s)
 def quote_string(s):
-    return u'"{}"'.format(s.replace('"', '\\"'))
+    return u'"{}"'.format(replace_long_escape_sequence(safe_decode(s).replace('"', '\\"')))
 def char_types_string(char_types):
@@ -519,19 +542,7 @@ def char_types_string(char_types):
        template = u'{}' if len(chars) == 1 else u'[{}]'
        norm = []
        for c in chars:
-            if c == '[':
+            c = string_replacements.get(c, c)
                c = '\['
            elif c == '':
                c = EMPTY_TRANSITION
            elif c == '*':
                c = '\*'
            elif c == '+':
                c = '\+'
            elif c == PLUS:
                c = '+'
            elif c == STAR:
                c = '*'
            norm.append(c)
        ret.append(template.format(u''.join(norm)))
@@ -555,48 +566,68 @@ def format_groups(char_types, groups):
 charset_regex = re.compile(r'(?<!\\)\[')
 def encode_string(s):
    return safe_encode(s).encode('string-escape')
 def format_rule(rule):
    '''
    Creates the C literal for a given transliteration rule
    '''
-    pre_context = rule[0]
+    key = rule[0]
-    if not pre_context:
+
-        pre_context_type = CONTEXT_TYPE_NONE
+    pre_context_type = rule[1]
-    elif charset_regex.search(pre_context):
+    pre_context = rule[2]
-        pre_context_type = CONTEXT_TYPE_REGEX
+    if pre_context is None:
        pre_context = 'NULL'
        pre_context_len = 0
    else:
-        pre_context_type = CONTEXT_TYPE_STRING
+        pre_context_len = len(pre_context)
        pre_context = quote_string(encode_string(pre_context))
-    pre_context_max_len = rule[1]
+    pre_context_max_len = rule[3]
-    key = rule[2]
+    post_context_type = rule[4]
    post_context = rule[5]
-    post_context = rule[3]
+    if post_context is None:
-    if not post_context:
+        post_context = 'NULL'
-        post_context_type = CONTEXT_TYPE_NONE
+        post_context_len = 0
    elif charset_regex.search(post_context):
        post_context_type = CONTEXT_TYPE_REGEX
    else:
-        post_context_type = CONTEXT_TYPE_STRING
+        post_context_len = len(post_context)
        post_context = quote_string(encode_string(post_context))
-    post_context_max_len = rule[4]
+    post_context_max_len = rule[6]
-    groups = rule[5]
+
-    replacement = rule[6]
+    groups = rule[7]
-    move = rule[7]
+    if not groups:
        groups = 'NULL'
        groups_len = 0
    else:
        groups_len = len(groups)
        groups = quote_string(encode_string(groups))
    replacement = rule[8]
    move = rule[9]
    output_rule = (
-        quote_string(key),
+        quote_string(encode_string(key)),
        str(len(key)),
        pre_context_type,
        str(pre_context_max_len),
-        u'NULL' if pre_context_type == CONTEXT_TYPE_NONE else quote_string(pre_context),
+        pre_context,
        str(pre_context_len),
        post_context_type,
        str(post_context_max_len),
-        u'NULL' if post_context_type == CONTEXT_TYPE_NONE else quote_string(post_context),
+        post_context,
        str(post_context_len),
-        quote_string(replacement),
+        quote_string(encode_string(replacement)),
        str(len(replacement)),
        str(move),
-        u'NULL' if not groups else quote_string(groups),
+        groups,
        str(groups_len),
    )
    return output_rule
@@ -627,6 +658,8 @@ def parse_transform_rules(xml):
        if num_found == 0:
            break
    variables[WORD_BOUNDARY_VAR_NAME] = WORD_BOUNDARY_VAR
    for rule_type, rule in rules:
        if rule_type in (BIDIRECTIONAL_TRANSFORM, FORWARD_TRANSFORM):
            left, right = rule
@@ -639,14 +672,25 @@ def parse_transform_rules(xml):
            left_pre_context_max_len = 0
            left_post_context_max_len = 0
            left_pre_context_type = CONTEXT_TYPE_NONE
            left_post_context_type = CONTEXT_TYPE_NONE
            move = 0
            left_groups = []
            right_groups = []
            if left_pre_context:
-                left_pre_context, _, _ = char_permutations(left_pre_context.strip())
+                if left_pre_context.strip() == WORD_BOUNDARY_VAR:
-                left_pre_context_max_len = len(left_pre_context or [])
+                    left_pre_context = None
-                left_pre_context = char_types_string(left_pre_context)
+                    left_pre_context_type = CONTEXT_TYPE_WORD_BOUNDARY
                else:
                    left_pre_context, _, _ = char_permutations(left_pre_context.strip())
                    left_pre_context_max_len = len(left_pre_context or [])
                    left_pre_context = char_types_string(left_pre_context)
                    if charset_regex.search(left_pre_context):
                        left_pre_context_type = CONTEXT_TYPE_REGEX
                    else:
                        left_pre_context_type = CONTEXT_TYPE_STRING
            if left:
                left, _, left_groups = char_permutations(left.strip())
@@ -657,16 +701,24 @@ def parse_transform_rules(xml):
                left = char_types_string(left)
            if left_post_context:
-                left_post_context, _, _ = char_permutations(left_post_context.strip())
+                if left_post_context.strip() == WORD_BOUNDARY_VAR:
-                left_post_context_max_len = len(left_post_context or [])
+                    left_post_context = None
-                left_post_context = char_types_string(left_post_context)
+                    left_post_context_type = CONTEXT_TYPE_WORD_BOUNDARY
                else:
                    left_post_context, _, _ = char_permutations(left_post_context.strip())
                    left_post_context_max_len = len(left_post_context or [])
                    left_post_context = char_types_string(left_post_context)
                    if charset_regex.search(left_post_context):
                        left_post_context_type = CONTEXT_TYPE_REGEX
                    else:
                        left_post_context_type = CONTEXT_TYPE_STRING
            if right:
                right, move, right_groups = char_permutations(right.strip())
                right = char_types_string(right)
-            yield RULE, (left_pre_context, left_pre_context_max_len, left,
+            yield RULE, (left, left_pre_context_type, left_pre_context, left_pre_context_max_len,
-                         left_post_context, left_post_context_max_len, left_groups, right, move)
+                         left_post_context_type, left_post_context, left_post_context_max_len, left_groups, right, move)
        elif rule_type == PRE_TRANSFORM and '[' in rule and ']' in rule:
            continue
@@ -688,10 +740,10 @@ EXISTING_STEP = 'EXISTING_STEP'
 supplemental_transliterations = {
    'latin-ascii': (EXISTING_STEP, [
        # German transliterations not handled by standard NFD normalization
-        (u'"ä"', CONTEXT_TYPE_NONE, '0', 'NULL', CONTEXT_TYPE_NONE, '0', 'NULL', u'"ae"', '0', 'NULL'),
+        (u'"\xc3\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),       # ä => ae
-        (u'"ö"', CONTEXT_TYPE_NONE, '0', 'NULL', CONTEXT_TYPE_NONE, '0', 'NULL', u'"oe"', '0', 'NULL'),
+        (u'"\xc3\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),       # ö => oe
-        (u'"ü"', CONTEXT_TYPE_NONE, '0', 'NULL', CONTEXT_TYPE_NONE, '0', 'NULL', u'"ue"', '0', 'NULL'),
+        (u'"\xc3\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),       # ü => ue
-        (u'"ß"', CONTEXT_TYPE_NONE, '0', 'NULL', CONTEXT_TYPE_NONE, '0', 'NULL', u'"ss"', '0', 'NULL'),
+        (u'"\xc3\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),       # ß => ss
    ]),
 }
@@ -766,7 +818,7 @@ def get_all_transform_rules():
            elif step_type == STEP_TRANSFORM:
                step = (STEP_TRANSFORM, '-1', '-1', quote_string(data))
            elif step_type == STEP_UNICODE_NORMALIZATION:
-                step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', data)
+                step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', quote_string(data))
            all_steps.append(step)
        internal = int(name not in to_latin)
@@ -778,9 +830,6 @@ def get_all_transform_rules():
 transliteration_data_template = u'''#include <stdlib.h>
 #include <stdbool.h>
 #include "transliteration_rule.h"
 transliteration_rule_source_t rules_source[] = {{
    {all_rules}
--- a/src/transliteration_data.c
+++ b/src/transliteration_data.c
--- a/src/transliteration_rule.h
+++ b/src/transliteration_rule.h
@@ -13,21 +13,31 @@ typedef enum {
 typedef struct transliteration_rule_source {
    char *key;
    size_t key_len;
    context_type_t pre_context_type;
    size_t pre_context_max_len;
    char *pre_context;
    size_t pre_context_len;
    context_type_t post_context_type;
    size_t post_context_max_len;
    char *post_context;
    size_t post_context_len;
    char *replacement;
    size_t replacement_len;
    int move;
    char *group_regex_str;
    size_t group_regex_len;
 } transliteration_rule_source_t;
 typedef enum {
    STEP_RULESET,
    STEP_TRANSFORM,
    STEP_UNICODE_NORMALIZATION
 } step_type_t;
 typedef struct transliteration_step_source {
    step_type_t type;