[fix] for transliteration rules, allowing the parsing of set differences and arbitrarily nested character set expressions, using a non-NUL byte for the empty transition. Adding the resulting data file.

Al
2015-05-08 17:14:22 -04:00
parent 10ebaf147a
commit 2a69488f9b
2 changed files with 100 additions and 60 deletions
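
For context, a minimal standalone sketch (not part of this commit; Python 2 to match the repo's unichr/xrange usage) of the set-difference semantics the new DIFFERENCE token is meant to capture: an expression such as [[a-z] - [aeiou]] should reduce to the lowercase ASCII consonants, which is what the this_group -= this_char_set branch in the diff below computes.

# Illustrative only: mirrors the DIFFERENCE handling added to parse_regex_char_set,
# where a nested set following ' - ' is subtracted from the group built so far.
lower = set(unichr(i) for i in xrange(ord('a'), ord('z') + 1))
vowels = set(u'aeiou')
consonants = sorted(lower - vowels)
print u''.join(consonants)  # expected output: bcdfghjklmnpqrstvwxyz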


@@ -55,7 +55,8 @@ WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME)
word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$'))
EMPTY_TRANSITION = u'\u0000'
WORD_BOUNDARY_CHAR = u'\u0001'
EMPTY_TRANSITION = u'\u0004'
EXCLUDE_TRANSLITERATORS = set([
    'Hangul-Latin',
@@ -187,6 +188,7 @@ OPEN_GROUP = 'OPEN_GROUP'
CLOSE_GROUP = 'CLOSE_GROUP'
GROUP_REF = 'GROUP_REF'
CHAR_SET = 'CHAR_SET'
CHAR_MULTI_SET = 'CHAR_MULTI_SET'
CHAR_CLASS = 'CHAR_CLASS'
OPTIONAL = 'OPTIONAL'
CHARACTER = 'CHARACTER'
@@ -235,21 +237,24 @@ CHAR_RANGE = 'CHAR_RANGE'
WORD_BOUNDARY = 'WORD_BOUNDARY'
NEGATION = 'NEGATION'
INTERSECTION = 'INTERSECTION'
DIFFERENCE = 'DIFFERENCE'
# Scanner for a character set (yes, a regex regex)
char_set_scanner = Scanner([
    ('^\^', NEGATION),
    (r'[\\]?[^\\]\-[\\]?.', CHAR_RANGE),
    (r'[\\]?[^\\\s]\-[\\]?[^\s]', CHAR_RANGE),
    (r'[\\].', ESCAPED_CHARACTER),
    (r'\'\'', SINGLE_QUOTE),
    (r'\'.*?\'', QUOTED_STRING),
    (':[^:]+:', CHAR_CLASS),
    # Char set
    ('\[[^\[\]]+\]', CHAR_SET),
    ('\[.*\]', CHAR_MULTI_SET),
    ('\[', OPEN_SET),
    ('\]', CLOSE_SET),
    ('&', INTERSECTION),
    ('(?<=[\s])-(?=[\s])', DIFFERENCE),
    ('\$', WORD_BOUNDARY),
    (r'[^\s]', CHARACTER),
])
@@ -258,6 +263,8 @@ NUM_CHARS = 65536
all_chars = set([unichr(i) for i in xrange(NUM_CHARS)])
control_chars = set([c for c in all_chars if unicodedata.category(c) == 'Cc'])
def get_transforms():
    return [f for f in os.listdir(CLDR_TRANSFORMS_DIR) if f.endswith('.xml')]
@@ -310,9 +317,36 @@ def parse_regex_char_class(c):
    chars = []
    if is_negation:
        chars = sorted(all_chars - set(chars))
        chars = all_chars - set(chars)
    return chars
    return sorted(set(chars) - control_chars)
def parse_balanced_sets(s):
    open_brackets = 0
    max_nesting = 0
    skip = False
    for i, ch in enumerate(s):
        if ch == '[':
            if open_brackets == 0:
                start = i
            max_nesting = max(max_nesting, open_brackets + 1)
            open_brackets += 1
        elif ch == ']':
            open_brackets -= 1
            if open_brackets == 0:
                skip = False
                yield (s[start:i + 1], CHAR_MULTI_SET)
                start = i + 1
        elif open_brackets == 0 and not skip:
            for token, token_class in char_set_scanner.scan(s[i:]):
                if token_class not in (CHAR_SET, CHAR_MULTI_SET, OPEN_SET, CLOSE_SET):
                    yield token, token_class
                else:
                    break
            skip = True
def parse_regex_char_set(s):
@@ -330,9 +364,10 @@ def parse_regex_char_set(s):
    is_negation = False
    this_group = set()
    is_intersection = False
    is_difference = False
    is_word_boundary = False
    for token, token_class in char_set_scanner.scan(s):
    for token, token_class in parse_balanced_sets(s):
        if token_class == CHAR_RANGE:
            this_char_set = set(parse_regex_char_range(token))
            this_group |= this_char_set
@@ -347,16 +382,21 @@ def parse_regex_char_set(s):
            is_negation = True
        elif token_class == CHAR_CLASS:
            this_group |= set(parse_regex_char_class(token))
        elif token_class == CHAR_SET:
        elif token_class in (CHAR_SET, CHAR_MULTI_SET):
            # Recursive calls, as performance doesn't matter here and nesting is shallow
            this_char_set = set(parse_regex_char_set(token))
            # Shouldn't be complex set expression logic here
            if is_intersection:
                this_group &= this_char_set
                is_intersection = False
            elif is_difference:
                this_group -= this_char_set
                is_difference = False
            else:
                this_group |= this_char_set
        elif token_class == INTERSECTION:
            is_intersection = True
        elif token_class == DIFFERENCE:
            is_difference = True
        elif token_class == CHARACTER:
            this_group.add(token)
        elif token_class == WORD_BOUNDARY:
@@ -365,7 +405,7 @@ def parse_regex_char_set(s):
    if is_negation:
        this_group = all_chars - this_group
    return sorted(this_group) + (['$'] if is_word_boundary else [])
    return sorted(this_group - control_chars) + (['$'] if is_word_boundary else [])
for name, regex_range in unicode_property_regexes: