[transliteration] Use a proper lexer over the entire rule to correct some parses, allow brace-delimited multi-character strings (e.g. {abc}) in character sets, and fix handling of the optional (?) operator

2015-05-14 16:34:03 -04:00
parent 2d49369e78
commit 1f3ac0c3f9
1 changed files with 120 additions and 25 deletions
--- a/scripts/geodata/i18n/transliteration_rules.py
+++ b/scripts/geodata/i18n/transliteration_rules.py
@@ -71,9 +71,9 @@ POST_CONTEXT_CHAR = u"\x03"
 EMPTY_TRANSITION_CHAR = u"\x04"
 REPEAT_ZERO_CHAR = u"\x05"
 REPEAT_ONE_CHAR = u"\x06"
-BEGIN_SET_CHAR = u"\x07"
-END_SET_CHAR = u"\x08"
-GROUP_INDICATOR_CHAR = u"\x09"
+BEGIN_SET_CHAR = u"\x0e"
+END_SET_CHAR = u"\x0f"
+GROUP_INDICATOR_CHAR = u"\x10"


 EXCLUDE_TRANSLITERATORS = set([
@@ -246,7 +246,7 @@ char_class_regex_str = '\[(?:[^\[\]]*\[[^\[\]]*\][^\[\]]*)*[^\[\]]*\]'
 nested_char_class_regex = re.compile('\[(?:[^\[\]]*\[[^\[\]]*\][^\[\]]*)+[^\[\]]*\]', re.UNICODE)

 range_regex = re.compile(r'[\\]?([^\\])\-[\\]?([^\\])', re.UNICODE)
-var_regex = re.compile('\$([A-Za-z_\-]+)')
+var_regex = re.compile('[\s]*\$([A-Za-z_\-]+)[\s]*')

 context_regex = re.compile(u'(?:[\s]*(?!=[\s])(.*?)(?<![\s])[\s]*{)?(?:[\s]*([^}{]*)[\s]*)(?:}[\s]*(?!=[\s])(.*)(?<![\s])[\s]*)?', re.UNICODE)

@@ -280,12 +280,25 @@ SINGLE_QUOTE = 'SINGLE_QUOTE'
 ESCAPED_CHARACTER = 'ESCAPED_CHARACTER'


-BEFORE_CONTEXT = '{'
-AFTER_CONTEXT = '}'
+BEFORE_CONTEXT = 'BEFORE_CONTEXT'
+AFTER_CONTEXT = 'AFTER_CONTEXT'

 PLUS = 'PLUS'
 STAR = 'STAR'

+rule_scanner = Scanner([
+    (r'[\\].', ESCAPED_CHARACTER),
+    ('\[', OPEN_SET),
+    ('\]', CLOSE_SET),
+    ('\(', OPEN_GROUP),
+    ('\)', CLOSE_GROUP),
+    ('\{', BEFORE_CONTEXT),
+    ('\}', AFTER_CONTEXT),
+    ('[\s]+', WHITESPACE),
+    (r'[^\s]', CHARACTER),
+])
+
+
 # Scanner for the lvalue or rvalue of a transform rule

 transform_scanner = Scanner([
@@ -302,13 +315,13 @@ transform_scanner = Scanner([
    (r'&.*?;', HTML_ENTITY),
    (r'(?<![\\])\*', REPEAT),
    (r'(?<![\\])\+', PLUS),
-    ('\?', OPTIONAL),
+    ('(?<=[^\s])\?', OPTIONAL),
    ('\(', LPAREN),
    ('\)', RPAREN),
    ('\|', REVISIT),
    ('[\s]+', WHITESPACE),
-    (r'[\ud800-\udbff][\udc00-\udfff]', WIDE_CHARACTER),
-    (r'[\\]?[^\s]', CHARACTER),
+    (ur'[\ud800-\udbff][\udc00-\udfff]', WIDE_CHARACTER),
+    (r'[^\s]', CHARACTER),
 ], re.UNICODE)

 CHAR_RANGE = 'CHAR_RANGE'
@@ -317,6 +330,7 @@ WORD_BOUNDARY = 'WORD_BOUNDARY'
 NEGATION = 'NEGATION'
 INTERSECTION = 'INTERSECTION'
 DIFFERENCE = 'DIFFERENCE'
+BRACKETED_CHARACTER = 'BRACKETED_CHARACTER'

 # Scanner for a character set (yes, a regex regex)

@@ -337,6 +351,7 @@ char_set_scanner = Scanner([
    ('(?<=[\s])-(?=[\s])', DIFFERENCE),
    ('\$', WORD_BOUNDARY),
    (ur'[\ud800-\udbff][\udc00-\udfff]', WIDE_CHARACTER),
+    (r'\{[^\s]+\}', BRACKETED_CHARACTER),
    (r'[^\s]', CHARACTER),
 ])

@@ -519,6 +534,10 @@ def parse_regex_char_set(s, current_filter=all_chars):
            real_chars.add(token)
        elif token_class == WIDE_CHARACTER:
            continue
+        elif token_class == BRACKETED_CHARACTER:
+            if token.strip('{{}}') not in control_chars:
+                this_group.add(token)
+                real_chars.add(token)
        elif token_class == WORD_BOUNDARY:
            is_word_boundary = True

@@ -859,8 +878,75 @@ def parse_transform_rules(xml):
            left = var_regex.sub(get_var, left)
            right = var_regex.sub(get_var, right)

-            left_pre_context, left, left_post_context = context_regex.match(left).groups()
-            right_pre_context, right, right_post_context = context_regex.match(right).groups()
+            left_pre_context = None
+            left_post_context = None
+            have_post_context = False
+            current_token = []
+
+            in_set = False
+            in_group = False
+            open_brackets = 0
+
+            for token, token_type in rule_scanner.scan(left):
+                if token_type == ESCAPED_CHARACTER:
+                    current_token.append(token)
+                elif token_type == OPEN_SET:
+                    in_set = True
+                    open_brackets += 1
+                    current_token.append(token)
+                elif token_type == CLOSE_SET:
+                    open_brackets -= 1
+                    current_token.append(token)
+                    if open_brackets == 0:
+                        in_set = False
+                elif token_type == BEFORE_CONTEXT and not in_set:
+                    left_pre_context = u''.join(current_token)
+                    current_token = []
+                elif token_type == AFTER_CONTEXT and not in_set:
+                    have_post_context = True
+                    left = u''.join(current_token)
+                    current_token = []
+                else:
+                    current_token.append(token)
+
+            if have_post_context:
+                left_post_context = u''.join(current_token)
+            else:
+                left = u''.join(current_token).strip()
+
+            right_pre_context = None
+            right_post_context = None
+            have_post_context = False
+            current_token = []
+
+            in_set = False
+            in_group = False
+            open_brackets = 0
+
+            for token, token_type in rule_scanner.scan(right):
+                if token_type == OPEN_SET:
+                    in_set = True
+                    open_brackets += 1
+                    current_token.append(token)
+                elif token_type == CLOSE_SET:
+                    open_brackets -= 1
+                    current_token.append(token)
+                    if open_brackets == 0:
+                        in_set = False
+                elif token_type == BEFORE_CONTEXT and not in_set:
+                    right_pre_context = u''.join(current_token)
+                    current_token = []
+                elif token_type == AFTER_CONTEXT and not in_set:
+                    have_post_context = True
+                    right = u''.join(current_token)
+                    current_token = []
+                else:
+                    current_token.append(token)
+
+            if have_post_context:
+                right_post_context = u''.join(current_token)
+            else:
+                right = u''.join(current_token)

            if start_of_han_regex.search(left) or start_of_han_regex.search(right):
                continue
@@ -884,15 +970,19 @@ def parse_transform_rules(xml):
                    left_pre_context_type = CONTEXT_TYPE_NONE
                else:
                    left_pre_context, _, _ = char_permutations(left_pre_context.strip(), current_filter=current_filter)
-                    left_pre_context_max_len = len(left_pre_context or [])
-                    left_pre_context = char_types_string(left_pre_context)
+                    if left_pre_context:
+                        left_pre_context_max_len = len(left_pre_context or [])
+                        left_pre_context = char_types_string(left_pre_context)

-                    if charset_regex.search(left_pre_context):
-                        left_pre_context_type = CONTEXT_TYPE_REGEX
+                        if charset_regex.search(left_pre_context):
+                            left_pre_context_type = CONTEXT_TYPE_REGEX
+                        else:
+                            left_pre_context_type = CONTEXT_TYPE_STRING
                    else:
-                        left_pre_context_type = CONTEXT_TYPE_STRING
+                        left_pre_context = None
+                        left_pre_context_type = CONTEXT_TYPE_NONE

-            if left:
+            if left is not None:
                left, _, left_groups = char_permutations(left.strip(), current_filter=current_filter)
                if left_groups:
                    left_groups = format_groups(left, left_groups)
@@ -909,16 +999,18 @@ def parse_transform_rules(xml):
                    left_pre_context_type = CONTEXT_TYPE_NONE
                else:
                    left_post_context, _, _ = char_permutations(left_post_context.strip(), current_filter=current_filter)
-                    left_post_context_max_len = len(left_post_context or [])
-                    left_post_context = char_types_string(left_post_context)
-                    if charset_regex.search(left_post_context):
-                        left_post_context_type = CONTEXT_TYPE_REGEX
+                    if left_post_context:
+                        left_post_context_max_len = len(left_post_context or [])
+                        left_post_context = char_types_string(left_post_context)
+                        if charset_regex.search(left_post_context):
+                            left_post_context_type = CONTEXT_TYPE_REGEX
+                        elif left_post_context:
+                            left_post_context_type = CONTEXT_TYPE_STRING
                    else:
-                        left_post_context_type = CONTEXT_TYPE_STRING
+                        left_post_context = None
+                        left_post_context_type = CONTEXT_TYPE_NONE

            if right:
-                if start_of_han_regex.search(right):
-                    continue
                right, move, right_groups = char_permutations(right.strip(), current_filter=current_filter)
                right = char_types_string(right)

@@ -974,7 +1066,10 @@ def get_all_transform_rules():
        xml = etree.parse(f)
        source, target = get_source_and_target(xml)

-        if (target.lower() == 'latin' or name == 'latin-ascii') and name not in EXCLUDE_TRANSLITERATORS:
+        if name in EXCLUDE_TRANSLITERATORS:
+            continue
+
+        if (target.lower() == 'latin' or name == 'latin-ascii'):
            to_latin.add(name)
            retain_transforms.add(name)