[transliteration] Escaped single quotes in rules + ignoring rules with codepoints > \uffff

2015-05-17 18:31:35 -04:00
parent c39a19a352
commit e25f039ee4
1 changed file with 9 additions and 3 deletions
--- a/scripts/geodata/i18n/transliteration_rules.py
+++ b/scripts/geodata/i18n/transliteration_rules.py
@@ -708,11 +708,14 @@ def char_permutations(s, current_filter=all_chars):
            char_types.append([replace_html_entity(token)])
        elif token_type == CHARACTER:
            char_types.append([token])
+        elif token_type == SINGLE_QUOTE:
+            char_types.append(["'"])
        elif token_type == UNICODE_CHARACTER:
            token = token.decode('unicode-escape')
            char_types.append([token])
        elif token_type in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER):
            continue
+
        if in_group and last_token_group_start:
            start_group = len(char_types)
            last_token_group_start = False
@@ -1006,12 +1009,15 @@ def parse_transform_rules(xml):
                left_pre_context_type = CONTEXT_TYPE_NONE

            if left:
-                left, _, left_groups = char_permutations(left.strip(), current_filter=current_filter)
+                left_chars, _, left_groups = char_permutations(left.strip(), current_filter=current_filter)
+                if not left_chars and (left.strip() or not (left_pre_context and left_post_context)):
+                    print 'ignoring', rule
+                    continue
                if left_groups:
-                    left_groups = format_groups(left, left_groups)
+                    left_groups = format_groups(left_chars, left_groups)
                else:
                    left_groups = None
-                left = char_types_string(left)
+                left = char_types_string(left_chars)

            if left_post_context:
                if left_post_context.strip() == WORD_BOUNDARY_VAR: