[transliteration] converting one of the more complicated and frequently used rules to its utf8proc equivalent, adding better support for escaped unicode characters and set differences, and generating a header file indicating which unicode script/language pairs warrant various transliterators.

@@ -88,6 +88,7 @@ NFD = 'NFD'
 NFKD = 'NFKD'
 NFC = 'NFC'
 NFKC = 'NFKC'
+STRIP_MARK = 'STRIP_MARK'

 LOWER = 'lower'
 UPPER = 'upper'
@@ -98,6 +99,7 @@ UNICODE_NORMALIZATION_TRANSFORMS = set([
     NFKD,
     NFC,
     NFKC,
+    STRIP_MARK,
 ])

 unicode_category_aliases = {
@@ -120,6 +122,8 @@ unicode_general_categories = defaultdict(list)
 unicode_scripts = defaultdict(list)
 unicode_properties = {}

+unicode_script_ids = {}
+
 unicode_blocks = {}
 unicode_category_aliases = {}
 unicode_property_aliases = {}
@@ -140,9 +144,9 @@ class TransliterationParseError(Exception):
 def init_unicode_categories():
     global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases
     global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases
-    global unicode_property_value_aliases, unicode_scripts, unicode_word_breaks
+    global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks

-    for i in xrange(65536):
+    for i in xrange(NUM_CHARS):
         unicode_categories[unicodedata.category(unichr(i))].append(unichr(i))
         unicode_combining_classes[str(unicodedata.combining(unichr(i)))].append(unichr(i))
@@ -161,6 +165,8 @@ def init_unicode_categories():

     unicode_scripts = dict(unicode_scripts)

+    unicode_script_ids.update(build_master_scripts_list(script_chars))
+
     unicode_blocks.update(get_unicode_blocks())
     unicode_properties.update(get_unicode_properties())
     unicode_property_aliases.update(get_property_aliases())
@@ -226,10 +232,8 @@ unicode_property_regexes = [
     ('logical_order_exception', '[เ-ไ ເ-ໄ ꪵ ꪶ ꪹ ꪻ ꪼ]'),
 ]

-char_set_map = {
-    '[^[:ccc=Not_Reordered:][:ccc=Above:]]': u'[֑ ֖ ֚ ֛ ֢-֧ ֪ ֭ ֮ ֽ ׅ ؘ-ؚ ۣ ۪ ۭ ݄ ݈ ࣭-࣯ ॒ ༘ ༙ ༵ ༷ ࿆ ᩿ ᭬ ᳔-᳙ ᳜-᳟ ᳢-᳨ ⵿ ︨ ︪-︭ 𝅥𐋠-𝅩𝅭-𝅲 𝅻-𝆂 𝆊 𝆋 𞣐-𞣖 ̲ ̸ ̧ ̨ ̕ ̚ ͝ ͞ ᷍ ᷎ ̖-̙ ̜-̠ ̩-̬ ̯ ̳ ̺-̼ ͇-͉ ͍ ͎ ͓-͖ ͙ ͚ ͜ ͟ ͢ ݂ ݆ ࡙-࡛ ᪵-᪺ ᪽ ᷂ ᷏ ᷐ ᷼ ᷽ ᷿ ⃬-⃯ ︧ 𐨍 𐫦 ̶ ̷ ⃘-⃚ ⃥ ⃪ ⃫ 𛲞 ゙ ゚ ̵ ̛ ̡-̦ ̭ ̮ ̰ ̱ ̴ ̹ ͅ ͘ ͠ ︩ ͡ ְ-ָ ׇ ֹ-ֻ ׂ ׁ ּ ֿ ﬞ ً ࣰ ٌ ࣱ ٍ ࣲ ࣩ َ-ِ ࣦ ࣶ ّ ْ ٕ ٟ ٖ ٜ ࣹ ࣺ ٰ ܑ ܱ ܴ ܷ-ܹ ܻ ܼ ܾ ߲ 𖫰-𖫴 ़ ় ਼ ઼ ଼ ಼ ᬴ ᯦ ᰷ ꦳ 𑂺 𑅳 𑈶 𑋩 𑌼 𑓃 𑗀 𑚷 ᳭ 𐨹 𐨺 ่-๋ ່-໋ ༹ ꤫-꤭ ့ ᤹ ᤻ 〪-〯 ⃒ ⃓ ⃦ ⃨ 𐇽 ᷊ ् ্ ੍ ્ ୍ ் ్ ౕ ౖ ್ ് ් ꯭ ꫶ ꠆ ꣄ 𑂹 𑇀 𑈵 𑋪 𑍍 𑓂 𑖿 𑘿 𑚶 ᮪ ᮫ 𑁆 𑁿 𐨿 ุ-ฺ ຸ ູ ꪴ ཱ ི ྀ ུ ེ-ཽ ྄ ᜔ ᜴ ᨘ ᯲ ᯳ ꥓ ္ ် ႍ 𑄳 𑄴 ្ ᩠ ᭄ ꧀ ᢩ]',
-    '[[:^ccc=0:] & [:^ccc=230:]]': u'[֑ ֖ ֚ ֛ ֢-֧ ֪ ֭ ֮ ֽ ׅ ؘ-ؚ ۣ ۪ ۭ ݄ ݈ ࣭-࣯ ॒ ༘ ༙ ༵ ༷ ࿆ ᩿ ᭬ ᳔-᳙ ᳜-᳟ ᳢-᳨ ⵿ ︨ ︪-︭ 𝅥𐋠-𝅩𝅭-𝅲 𝅻-𝆂 𝆊 𝆋 𞣐-𞣖 ̲ ̸ ̧ ̨ ̕ ̚ ͝ ͞ ᷍ ᷎ ̖-̙ ̜-̠ ̩-̬ ̯ ̳ ̺-̼ ͇-͉ ͍ ͎ ͓-͖ ͙ ͚ ͜ ͟ ͢ ݂ ݆ ࡙-࡛ ᪵-᪺ ᪽ ᷂ ᷏ ᷐ ᷼ ᷽ ᷿ ⃬-⃯ ︧ 𐨍 𐫦 ̶ ̷ ⃘-⃚ ⃥ ⃪ ⃫ 𛲞 ゙ ゚ ̵ ̛ ̡-̦ ̭ ̮ ̰ ̱ ̴ ̹ ͅ ͘ ͠ ︩ ͡ ְ-ָ ׇ ֹ-ֻ ׂ ׁ ּ ֿ ﬞ ً ࣰ ٌ ࣱ ٍ ࣲ ࣩ َ-ِ ࣦ ࣶ ّ ْ ٕ ٟ ٖ ٜ ࣹ ࣺ ٰ ܑ ܱ ܴ ܷ-ܹ ܻ ܼ ܾ ߲ 𖫰-𖫴 ़ ় ਼ ઼ ଼ ಼ ᬴ ᯦ ᰷ ꦳ 𑂺 𑅳 𑈶 𑋩 𑌼 𑓃 𑗀 𑚷 ᳭ 𐨹 𐨺 ่-๋ ່-໋ ༹ ꤫-꤭ ့ ᤹ ᤻ 〪-〯 ⃒ ⃓ ⃦ ⃨ 𐇽 ᷊ ् ্ ੍ ્ ୍ ் ్ ౕ ౖ ್ ് ් ꯭ ꫶ ꠆ ꣄ 𑂹 𑇀 𑈵 𑋪 𑍍 𑓂 𑖿 𑘿 𑚶 ᮪ ᮫ 𑁆 𑁿 𐨿 ุ-ฺ ຸ ູ ꪴ ཱ ི ྀ ུ ེ-ཽ ྄ ᜔ ᜴ ᨘ ᯲ ᯳ ꥓ ္ ် ႍ 𑄳 𑄴 ្ ᩠ ᭄ ꧀ ᢩ]',
-    '[^\p{ccc=0}\p{ccc=above}]': u'[֑ ֖ ֚ ֛ ֢-֧ ֪ ֭ ֮ ֽ ׅ ؘ-ؚ ۣ ۪ ۭ ݄ ݈ ࣭-࣯ ॒ ༘ ༙ ༵ ༷ ࿆ ᩿ ᭬ ᳔-᳙ ᳜-᳟ ᳢-᳨ ⵿ ︨ ︪-︭ 𝅥𐋠-𝅩𝅭-𝅲 𝅻-𝆂 𝆊 𝆋 𞣐-𞣖 ̲ ̸ ̧ ̨ ̕ ̚ ͝ ͞ ᷍ ᷎ ̖-̙ ̜-̠ ̩-̬ ̯ ̳ ̺-̼ ͇-͉ ͍ ͎ ͓-͖ ͙ ͚ ͜ ͟ ͢ ݂ ݆ ࡙-࡛ ᪵-᪺ ᪽ ᷂ ᷏ ᷐ ᷼ ᷽ ᷿ ⃬-⃯ ︧ 𐨍 𐫦 ̶ ̷ ⃘-⃚ ⃥ ⃪ ⃫ 𛲞 ゙ ゚ ̵ ̛ ̡-̦ ̭ ̮ ̰ ̱ ̴ ̹ ͅ ͘ ͠ ︩ ͡ ְ-ָ ׇ ֹ-ֻ ׂ ׁ ּ ֿ ﬞ ً ࣰ ٌ ࣱ ٍ ࣲ ࣩ َ-ِ ࣦ ࣶ ّ ْ ٕ ٟ ٖ ٜ ࣹ ࣺ ٰ ܑ ܱ ܴ ܷ-ܹ ܻ ܼ ܾ ߲ 𖫰-𖫴 ़ ় ਼ ઼ ଼ ಼ ᬴ ᯦ ᰷ ꦳ 𑂺 𑅳 𑈶 𑋩 𑌼 𑓃 𑗀 𑚷 ᳭ 𐨹 𐨺 ่-๋ ່-໋ ༹ ꤫-꤭ ့ ᤹ ᤻ 〪-〯 ⃒ ⃓ ⃦ ⃨ 𐇽 ᷊ ् ্ ੍ ્ ୍ ் ్ ౕ ౖ ್ ് ් ꯭ ꫶ ꠆ ꣄ 𑂹 𑇀 𑈵 𑋪 𑍍 𑓂 𑖿 𑘿 𑚶 ᮪ ᮫ 𑁆 𑁿 𐨿 ุ-ฺ ຸ ູ ꪴ ཱ ི ྀ ུ ེ-ཽ ྄ ᜔ ᜴ ᨘ ᯲ ᯳ ꥓ ္ ် ႍ 𑄳 𑄴 ្ ᩠ ᭄ ꧀ ᢩ]',
+rule_map = {
+    u'[:Latin:] { [:Mn:]+ → ;': ':: {}'.format(STRIP_MARK)
 }

 unicode_properties = {}
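Note: the replaced ICU rule "[:Latin:] { [:Mn:]+ → ;" deletes nonspacing marks, which is exactly what utf8proc's STRIP_MARK option does natively. A minimal Python 2 sketch of the equivalent behavior, using unicodedata as a stand-in for utf8proc:

    import unicodedata

    def strip_marks(s):
        # decompose, drop nonspacing marks (category Mn), recompose
        nfd = unicodedata.normalize('NFD', s)
        return unicodedata.normalize('NFC', u''.join(c for c in nfd if unicodedata.category(c) != 'Mn'))

    assert strip_marks(u'caf\xe9') == u'cafe'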
@@ -277,6 +281,8 @@ QUOTED_STRING = 'QUOTED_STRING'
 SINGLE_QUOTE = 'SINGLE_QUOTE'
 HTML_ENTITY = 'HTML_ENTITY'
 SINGLE_QUOTE = 'SINGLE_QUOTE'
+UNICODE_CHARACTER = 'UNICODE_CHARACTER'
+UNICODE_WIDE_CHARACTER = 'UNICODE_WIDE_CHARACTER'
 ESCAPED_CHARACTER = 'ESCAPED_CHARACTER'

@@ -302,6 +308,8 @@ rule_scanner = Scanner([
 # Scanner for the lvalue or rvalue of a transform rule

 transform_scanner = Scanner([
+    (r'\\u[0-9A-Fa-f]{4}', UNICODE_CHARACTER),
+    (r'\\U[0-9A-Fa-f]{8}', UNICODE_WIDE_CHARACTER),
     (r'[\\].', ESCAPED_CHARACTER),
     (r'\'\'', SINGLE_QUOTE),
     (r'\'.*?\'', QUOTED_STRING),
@@ -338,6 +346,8 @@ char_set_scanner = Scanner([
     ('^\^', NEGATION),
     (r'\\p\{[^\{\}]+\}', CHAR_CLASS_PCRE),
     (r'[\\]?[^\\\s]\-[\\]?[^\s]', CHAR_RANGE),
+    (r'\\u[0-9A-Fa-f]{4}', UNICODE_CHARACTER),
+    (r'\\U[0-9A-Fa-f]{8}', UNICODE_WIDE_CHARACTER),
     (r'[\\].', ESCAPED_CHARACTER),
     (r'\'\'', SINGLE_QUOTE),
     (r'\'.*?\'', QUOTED_STRING),
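Note: the two new scanner tokens split 4-digit BMP escapes from 8-digit astral-plane escapes. A quick illustrative check (Python 2, matching the script's idioms) of how a \uXXXX token is matched and then decoded to a real character:

    import re

    unicode_char = re.compile(r'\\u[0-9A-Fa-f]{4}')
    assert unicode_char.match('\\u00e9') is not None
    assert '\\u00e9'.decode('unicode-escape') == u'\xe9'

8-digit \UXXXXXXXX escapes get their own UNICODE_WIDE_CHARACTER token so the token handlers further down can skip them.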
@@ -348,7 +358,7 @@ char_set_scanner = Scanner([
     ('\[', OPEN_SET),
     ('\]', CLOSE_SET),
     ('&', INTERSECTION),
-    ('(?<=[\s])-(?=[\s])', DIFFERENCE),
+    ('-', DIFFERENCE),
     ('\$', WORD_BOUNDARY),
     (ur'[\ud800-\udbff][\udc00-\udfff]', WIDE_CHARACTER),
     (r'\{[^\s]+\}', BRACKETED_CHARACTER),
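Note: the old DIFFERENCE pattern only fired on a hyphen surrounded by whitespace, so a set difference written without spaces was never tokenized as a difference; since the CHAR_RANGE pattern is listed earlier in the scanner, ranges like a-z should still win, letting a bare '-' mean set difference. Illustrative check of the two patterns:

    import re

    old = re.compile('(?<=[\s])-(?=[\s])')
    new = re.compile('-')
    assert old.search('[[:L:]-[aeiou]]') is None
    assert new.search('[[:L:]-[aeiou]]') is not None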
@@ -482,8 +492,6 @@ def parse_regex_char_set(s, current_filter=all_chars):
     Parse into a single, flat character set without the unicode properties,
     ranges, unions/intersections, etc.
     '''
-    if s in char_set_map:
-        s = char_set_map[s]

     s = s[1:-1]
     is_negation = False
@@ -532,7 +540,12 @@ def parse_regex_char_set(s, current_filter=all_chars):
         elif token_class == CHARACTER and token not in control_chars:
             this_group.add(token)
             real_chars.add(token)
-        elif token_class == WIDE_CHARACTER:
+        elif token_class == UNICODE_CHARACTER:
+            token = token.decode('unicode-escape')
+            if token not in control_chars:
+                this_group.add(token)
+                real_chars.add(token)
+        elif token_class in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER):
             continue
         elif token_class == BRACKETED_CHARACTER:
             if token.strip('{{}}') not in control_chars:
@@ -575,10 +588,11 @@ def get_raw_rules_and_variables(xml):
             continue

         rule = safe_decode(rule.text.rsplit(COMMENT_CHAR)[0].strip())
-        rule = literal_space_regex.sub(replace_literal_space, rule)
-        rule = escaped_wide_unicode_regex.sub('', rule)
-        rule = escaped_unicode_regex.sub(unescape_unicode_char, rule)
-        rule = rule.rstrip(END_CHAR).strip()
+        if rule not in rule_map:
+            rule = literal_space_regex.sub(replace_literal_space, rule)
+            rule = rule.rstrip(END_CHAR).strip()
+        else:
+            rule = rule_map[rule]

         if rule.strip().endswith('\\'):
             compound_rule.append(rule.rstrip('\\'))
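Note: with this short-circuit, a rule that appears verbatim in rule_map skips the regex preprocessing entirely and is swapped for its hand-written utf8proc equivalent. Roughly, using .get() to mirror the if/else above:

    rule = u'[:Latin:] { [:Mn:]+ \u2192 ;'
    rule = rule_map.get(rule, rule)    # yields ':: STRIP_MARK'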
@@ -692,7 +706,10 @@ def char_permutations(s, current_filter=all_chars):
             char_types.append([replace_html_entity(token)])
         elif token_type == CHARACTER:
             char_types.append([token])
-        elif token_type == WIDE_CHARACTER:
+        elif token_type == UNICODE_CHARACTER:
+            token = token.decode('unicode-escape')
+            char_types.append([token])
+        elif token_type in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER):
             continue
         if in_group and last_token_group_start:
             start_group = len(char_types)
@@ -760,10 +777,10 @@ def format_groups(char_types, groups):
     for start, end in groups:
         group_regex.append(char_types_string(char_types[last_end:start]))
         group_regex.append(u'(')
-        group_regex.append(char_types_string(char_types[start:end + 1]))
+        group_regex.append(char_types_string(char_types[start:end]))
         group_regex.append(u')')
         last_end = end
-    group_regex.append(char_types_string(char_types[last_end + 1:]))
+    group_regex.append(char_types_string(char_types[last_end:]))
     return u''.join(group_regex)

 charset_regex = re.compile(r'(?<!\\)\[')
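Note: group boundaries are now treated as half-open [start, end) intervals, so the character at index end belongs to the segment after the group rather than to the group itself, and the tail resumes at last_end instead of last_end + 1. A toy illustration (not from the source):

    char_types = ['a', 'b', 'c', 'd']
    start, end = 1, 3
    assert char_types[start:end] == ['b', 'c']   # end is now exclusive
    assert char_types[end:] == ['d']             # tail resumes at last_end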
@@ -901,6 +918,7 @@ def parse_transform_rules(xml):
             in_set = False
         elif token_type == BEFORE_CONTEXT and not in_set:
             left_pre_context = u''.join(current_token)
+
             current_token = []
         elif token_type == AFTER_CONTEXT and not in_set:
             have_post_context = True
@@ -968,7 +986,7 @@ def parse_transform_rules(xml):
         elif left_pre_context.strip() == START_OF_HAN_VAR:
             left_pre_context = None
             left_pre_context_type = CONTEXT_TYPE_NONE
-        else:
+        elif left_pre_context.strip():
             left_pre_context, _, _ = char_permutations(left_pre_context.strip(), current_filter=current_filter)
             if left_pre_context:
                 left_pre_context_max_len = len(left_pre_context or [])
@@ -981,8 +999,11 @@ def parse_transform_rules(xml):
             else:
                 left_pre_context = None
                 left_pre_context_type = CONTEXT_TYPE_NONE
+        else:
+            left_pre_context = None
+            left_pre_context_type = CONTEXT_TYPE_NONE

-        if left is not None:
+        if left:
             left, _, left_groups = char_permutations(left.strip(), current_filter=current_filter)
             if left_groups:
                 left_groups = format_groups(left, left_groups)
@@ -997,7 +1018,7 @@ def parse_transform_rules(xml):
         elif left_post_context.strip() == START_OF_HAN_VAR:
             left_pre_context_type = None
             left_pre_context_type = CONTEXT_TYPE_NONE
-        else:
+        elif left_post_context.strip():
             left_post_context, _, _ = char_permutations(left_post_context.strip(), current_filter=current_filter)
             if left_post_context:
                 left_post_context_max_len = len(left_post_context or [])
@@ -1009,6 +1030,10 @@ def parse_transform_rules(xml):
             else:
                 left_post_context = None
                 left_post_context_type = CONTEXT_TYPE_NONE
+        else:
+            left_post_context = None
+            left_post_context_type = CONTEXT_TYPE_NONE
+

         if right:
             right, move, right_groups = char_permutations(right.strip(), current_filter=current_filter)
@@ -1039,10 +1064,15 @@ EXISTING_STEP = 'EXISTING_STEP'
 supplemental_transliterations = {
     'latin-ascii': (EXISTING_STEP, [
         # German transliterations not handled by standard NFD normalization
-        (u'"\xc3\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),  # ä => ae
-        (u'"\xc3\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),  # ö => oe
-        (u'"\xc3\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),  # ü => ue
-        (u'"\xc3\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),  # ß => ss
+        # ä => ae
+        (u'"\xc3\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),
+        # ö => oe
+        (u'"\xc3\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),
+        # ü => ue
+        (u'"\xc3\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),
+        # ß => ss
+        (u'"\xc3\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),
+
     ]),
 }
@@ -1090,6 +1120,9 @@ def get_all_transform_rules():
         if rule.lower() in all_transforms and rule.lower() not in EXCLUDE_TRANSLITERATORS:
             dependencies[name].append(rule.lower())
             steps.append((STEP_TRANSFORM, rule.lower()))
+        elif rule.split('-')[0].lower() in all_transforms and rule.split('-')[0].lower() not in EXCLUDE_TRANSLITERATORS:
+            dependencies[name].append(rule.split('-')[0].lower())
+            steps.append((STEP_TRANSFORM, rule.split('-')[0].lower()))

         rule = UTF8PROC_TRANSFORMS.get(rule, rule)
         if rule in UNICODE_NORMALIZATION_TRANSFORMS:
@@ -1166,16 +1199,152 @@ transliterator_source_t transliterators_source[] = {{

 '''

-def create_transliterator(name, internal, steps):
-    return transliterator_template.format(name=name, internal=int(internal), num_steps=len(steps))
-
-
-TRANSLITERATION_DATA_FILENAME = 'transliteration_data.c'
-
-
-def main(out_dir):
-    f = open(os.path.join(out_dir, TRANSLITERATION_DATA_FILENAME), 'w')
-
+transliterator_script_data_template = u'''
+#ifndef TRANSLITERATION_SCRIPTS_H
+#define TRANSLITERATION_SCRIPTS_H
+
+#include <stdlib.h>
+#include "unicode_scripts.h"
+#include "transliterate.h"
+
+typedef struct script_transliteration_rule {{
+    script_type_t script;
+    char *language;
+    uint32_t index;
+    uint32_t len;
+}} script_transliteration_rule_t;
+
+script_transliteration_rule_t script_transliteration_rules[] = {{
+    {rules}
+}};
+
+char *script_transliterators[] = {{
+    {transliterators}
+}}
+
+#endif
+'''
+
+
+script_transliterators = {
+    'arabic': {None: ['arabic-latin', 'arabic-latin-bgn'],
+               'fa': ['persian-latin-bgn'],
+               'ps': ['pashto-latin-bgn'],
+               },
+    'armenian': {None: ['armenian-latin-bgn']},
+    'balinese': None,
+    'bamum': None,
+    'batak': None,
+    'bengali': {None: ['bengali-latin']},
+    'bopomofo': None,
+    'braille': None,
+    'buginese': None,
+    'buhid': None,
+    'canadian_aboriginal': {None: ['canadianaboriginal-latin']},
+    'cham': None,
+    'cherokee': None,
+    'common': {None: ['latin-ascii']},
+    'coptic': None,
+    'cyrillic': {None: ['cyrillic-latin'],
+                 'be': ['belarusian-latin-bgn'],
+                 'ru': ['russian-latin-bgn'],
+                 'bg': ['bulgarian-latin-bgn'],
+                 'kk': ['kazakh-latin-bgn'],
+                 'ky': ['kirghiz-latin-bgn'],
+                 'mk': ['macedonian-latin-bgn'],
+                 'mn': ['mongolian-latin-bgn'],
+                 'sr': ['serbian-latin-bgn'],
+                 'uk': ['ukrainian-latin-bgn'],
+                 'uz': ['uzbek-latin-bgn'],
+                 },
+    'devanagari': {None: ['devanagari-latin']},
+    'ethiopic': None,
+    'georgian': {None: ['georgian-latin', 'georgian-latin-bgn']},
+    'glagolitic': None,
+    'greek': {None: ['greek-latin', 'greek-latin-bgn', 'greek_latin_ungegn']},
+    'gujarati': {None: ['gujarati-latin']},
+    'gurmukhi': {None: ['gurmukhi-latin']},
+    'han': {None: ['han-latin']},
+    'hangul': {None: ['korean-latin-bgn']},
+    'hanunoo': None,
+    'hebrew': {None: ['hebrew-latin', 'hebrew-latin-bgn']},
+    'hiragana': {None: ['hiragana-latin']},
+    'inherited': None,
+    'javanese': None,
+    'kannada': {None: ['kannada-latin']},
+    'katakana': {None: ['katakana-latin-bgn']},
+    'kayah_li': None,
+    'khmer': None,
+    'lao': None,
+    'latin': {None: ['latin-ascii']},
+    'lepcha': None,
+    'limbu': None,
+    'lisu': None,
+    'malayalam': {None: ['malayam-latin']},
+    'mandaic': None,
+    'meetei_mayek': None,
+    'mongolian': None,
+    'myanmar': None,
+    'new_tai_lue': None,
+    'nko': None,
+    'ogham': None,
+    'ol_chiki': None,
+    'oriya': {None: ['oriya-latin']},
+    'phags_pa': None,
+    'rejang': None,
+    'runic': None,
+    'samaritan': None,
+    'saurashtra': None,
+    'sinhala': None,
+    'sundanese': None,
+    'syloti_nagri': None,
+    'syriac': None,
+    'tagalog': None,
+    'tagbanwa': None,
+    'tai_le': None,
+    'tai_tham': None,
+    'tai_viet': None,
+    'tamil': {None: ['tamil-latin']},
+    'telugu': {None: ['telugu-latin']},
+    'thaana': None,
+    'thai': {None: ['thai-latin']},
+    'tibetan': None,
+    'tifinagh': None,
+    'unknown': None,
+    'vai': None,
+    'yi': None
+}
+
+
+def write_transliterator_scripts_file(filename):
+    transliterator_rule_template = '''{{{script_type}, {lang}, {start}, {length}}}'''
+    rules = []
+    all_transliterators = []
+    index = 0
+    for script, i in unicode_script_ids.iteritems():
+        spec = script_transliterators.get(script.lower())
+        if not spec:
+            continue
+        script_type = 'SCRIPT_{}'.format(script.upper())
+        for lang, transliterators in spec.iteritems():
+            lang = 'NULL' if not lang else quote_string(lang)
+            num_transliterators = len(transliterators)
+            rules.append(transliterator_rule_template.format(script_type=script_type,
+                                                             lang=lang, start=index, length=num_transliterators))
+            for trans in transliterators:
+                all_transliterators.append(quote_string(trans))
+
+            index += num_transliterators
+
+    template = transliterator_script_data_template.format(rules=''',
+    '''.join(rules), transliterators=''',
+    '''.join(all_transliterators))
+
+    f = open(filename, 'w')
+    f.write(safe_encode(template))
+
+
+def write_transliteration_data_file(filename):
     transforms, steps, rules = get_all_transform_rules()

     all_transforms = u''',
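Note: the generated transliteration_scripts.h encodes, for each (script, language) pair, a half-open slice [index, index + len) into the parallel script_transliterators array. A hedged Python mirror of how a consumer might resolve that table (the C lookup itself is not part of this commit, and the preference for exact language matches is an assumption):

    def transliterators_for(rules, names, script, language=None):
        # prefer an exact language match, then fall back to the script default (None)
        for want in (language, None):
            for script_type, lang, index, length in rules:
                if script_type == script and lang == want:
                    return names[index:index + length]
        return []

    rules = [('SCRIPT_ARABIC', None, 0, 2), ('SCRIPT_ARABIC', 'fa', 2, 1)]
    names = ['arabic-latin', 'arabic-latin-bgn', 'persian-latin-bgn']
    assert transliterators_for(rules, names, 'SCRIPT_ARABIC', 'fa') == ['persian-latin-bgn']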
@@ -1193,9 +1362,19 @@ def main(out_dir):
         all_rules=all_rules
     )

+    f = open(filename, 'w')
     f.write(safe_encode(template))


+TRANSLITERATION_DATA_FILENAME = 'transliteration_data.c'
+TRANSLITERATION_SCRIPTS_FILENAME = 'transliteration_scripts.h'
+
+
+def main(out_dir):
+    write_transliteration_data_file(os.path.join(out_dir, TRANSLITERATION_DATA_FILENAME))
+
+    write_transliterator_scripts_file(os.path.join(out_dir, TRANSLITERATION_SCRIPTS_FILENAME))
+
 if __name__ == '__main__':
     if len(sys.argv) < 2:
         print 'Usage: python transliteration_rules.py out_dir'