# -*- coding: utf-8 -*-
'''
transliteration.py

Automatically builds rules for transforming other scripts (e.g. Cyrillic, Greek,
Han, Katakana, Devanagari, etc.) into Latin characters.

Uses XML transforms from the CLDR repository.
'''

import argparse
import codecs
import csv
import htmlentitydefs
import itertools
import os
import re
import requests
import sys
import time
import urlparse
import unicodedata

from collections import defaultdict

from lxml import etree

from scanner import Scanner
from unicode_scripts import get_chars_by_script
from unicode_paths import CLDR_DIR
from geodata.encoding import safe_decode, safe_encode

CLDR_TRANSFORMS_DIR = os.path.join(CLDR_DIR, 'common', 'transforms')

PRE_TRANSFORM = 1
FORWARD_TRANSFORM = 2
BACKWARD_TRANSFORM = 3
BIDIRECTIONAL_TRANSFORM = 4

PRE_TRANSFORM_OP = '::'
BACKWARD_TRANSFORM_OP = u'←'
FORWARD_TRANSFORM_OP = u'→'
BIDIRECTIONAL_TRANSFORM_OP = u'↔'

ASSIGNMENT_OP = '='

PRE_CONTEXT_INDICATOR = '{'
POST_CONTEXT_INDICATOR = '}'

REVISIT_INDICATOR = '|'

WORD_BOUNDARY_VAR_NAME = 'wordBoundary'
WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME)

word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$'))

EMPTY_TRANSITION = u'\u0000'

# Compared against lowercased transform names below
EXCLUDE_TRANSLITERATORS = set([
    'hangul-latin',
    'interindic-latin',
    'jamo-latin',
])

NFD = 'NFD'
NFKD = 'NFKD'
NFC = 'NFC'
NFKC = 'NFKC'

LOWER = 'lower'
UPPER = 'upper'
TITLE = 'title'

UNICODE_NORMALIZATION_TRANSFORMS = set([
    NFD,
    NFKD,
    NFC,
    NFKC,
])

unicode_category_aliases = {
    'letter': 'L',
    'lower': 'Ll',
    'lowercase': 'Ll',
    'lowercaseletter': 'Ll',
    'upper': 'Lu',
    'uppercase': 'Lu',
    'uppercaseletter': 'Lu',
    'title': 'Lt',
    'nonspacing mark': 'Mn',
    'mark': 'M',
}

unicode_categories = defaultdict(list)
unicode_general_categories = defaultdict(list)
unicode_scripts = defaultdict(list)


def init_unicode_categories():
    global unicode_categories, unicode_general_categories, unicode_scripts

    for i in xrange(65536):
        unicode_categories[unicodedata.category(unichr(i))].append(unichr(i))

    for key in unicode_categories.keys():
        unicode_general_categories[key[0]].extend(unicode_categories[key])

    script_chars = get_chars_by_script()
    for i, script in enumerate(script_chars):
        if script:
            unicode_scripts[script.lower()].append(unichr(i))

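# Illustrative sketch (not executed): after init_unicode_categories(),
# these tables are keyed by category or script name, e.g.
#   u'A' in unicode_categories['Lu']         => True
#   u'A' in unicode_general_categories['L']  => True
#   u'б' in unicode_scripts['cyrillic']      => True  (assuming the script
#                                                      table labels it 'Cyrillic')
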
RULE = 'RULE'
TRANSFORM = 'TRANSFORM'

UTF8PROC_TRANSFORMS = {
    'Any-NFC': NFC,
    'Any-NFD': NFD,
    'Any-NFKD': NFKD,
    'Any-NFKC': NFKC,
    'Any-Lower': LOWER,
    'Any-Upper': UPPER,
    'Any-Title': TITLE,
}

CONTEXT_TYPE_NONE = 'CONTEXT_TYPE_NONE'
CONTEXT_TYPE_STRING = 'CONTEXT_TYPE_STRING'
CONTEXT_TYPE_WORD_BOUNDARY = 'CONTEXT_TYPE_WORD_BOUNDARY'
CONTEXT_TYPE_REGEX = 'CONTEXT_TYPE_REGEX'

all_transforms = set()

pre_transform_full_regex = re.compile('::[\s]*(.*)[\s]*', re.UNICODE)
pre_transform_regex = re.compile('[\s]*([^\s\(\)]*)[\s]*(?:\(.*\)[\s]*)?', re.UNICODE)
transform_regex = re.compile(u"(?:[\s]*(?!=[\s])(.*)(?<![\s])[\s]*)([←→↔=])(?:[\s]*(?!=[\s])(.*)(?<![\s])[\s]*)", re.UNICODE)

quoted_string_regex = re.compile(r'\'.*?\'', re.UNICODE)

COMMENT_CHAR = '#'
END_CHAR = ';'


def unescape_unicode_char(m):
    return m.group(0).decode('unicode-escape')


escaped_unicode_regex = re.compile(r'(?:\\u[0-9A-Fa-f]{4}|\\U[0-9A-Fa-f]{8})')

literal_space_regex = re.compile(r'(?:\\u0020|\\U00000020)')

# These are a few unicode property types that were needed by the transforms
unicode_property_regexes = [
    ('ideographic', '[〆〇〡-〩〸-〺㐀-䶵一-鿌豈-舘並-龎 𠀀-𪛖𪜀-𫜴𫝀-𫠝丽-𪘀]'),
    ('logical_order_exception', '[เ-ไ ເ-ໄ ꪵ ꪶ ꪹ ꪻ ꪼ]'),
]

unicode_properties = {}


def replace_literal_space(m):
    return "' '"


regex_char_set_greedy = re.compile(r'\[(.*)\]', re.UNICODE)
regex_char_set = re.compile(r'\[(.*?)(?<!\\)\]', re.UNICODE)

char_class_regex_str = '\[(?:[^\[\]]*\[[^\[\]]*\][^\[\]]*)*[^\[\]]*\]'

nested_char_class_regex = re.compile('\[(?:[^\[\]]*\[[^\[\]]*\][^\[\]]*)+[^\[\]]*\]', re.UNICODE)

range_regex = re.compile(r'([^\\])\-([^\\])', re.UNICODE)
var_regex = re.compile('\$([A-Za-z_\-]+)')

context_regex = re.compile(u'(?:[\s]*(?!=[\s])(.*?)(?<![\s])[\s]*{)?(?:[\s]*([^}{]*)[\s]*)(?:}[\s]*(?!=[\s])(.*)(?<![\s])[\s]*)?', re.UNICODE)

paren_regex = re.compile(r'\(.*\)', re.UNICODE)

group_ref_regex_str = '\$[0-9]+'
group_ref_regex = re.compile(group_ref_regex_str)

# Limited subset of regular expressions used in transforms

OPEN_SET = 'OPEN_SET'
CLOSE_SET = 'CLOSE_SET'
OPEN_GROUP = 'OPEN_GROUP'
CLOSE_GROUP = 'CLOSE_GROUP'
GROUP_REF = 'GROUP_REF'
CHAR_SET = 'CHAR_SET'
CHAR_CLASS = 'CHAR_CLASS'
OPTIONAL = 'OPTIONAL'
CHARACTER = 'CHARACTER'
REVISIT = 'REVISIT'
REPEAT = 'REPEAT'
LPAREN = 'LPAREN'
RPAREN = 'RPAREN'
WHITESPACE = 'WHITESPACE'
QUOTED_STRING = 'QUOTED_STRING'
SINGLE_QUOTE = 'SINGLE_QUOTE'
HTML_ENTITY = 'HTML_ENTITY'
ESCAPED_CHARACTER = 'ESCAPED_CHARACTER'

BEFORE_CONTEXT = '{'
AFTER_CONTEXT = '}'

PLUS = 'PLUS'
STAR = 'STAR'

# Scanner for the lvalue or rvalue of a transform rule

transform_scanner = Scanner([
    (r'[\\].', ESCAPED_CHARACTER),
    (r'\'\'', SINGLE_QUOTE),
    (r'\'.*?\'', QUOTED_STRING),
    # Char classes only appear to go two levels deep in LDML
    ('\[', OPEN_SET),
    ('\]', CLOSE_SET),
    ('\(', OPEN_GROUP),
    ('\)', CLOSE_GROUP),
    (group_ref_regex_str, GROUP_REF),
    (r'\|', REVISIT),
    (r'&.*?;', HTML_ENTITY),
    (r'(?<![\\])\*', REPEAT),
    (r'(?<![\\])\+', PLUS),
    ('\?', OPTIONAL),
    ('[\s]+', WHITESPACE),
    (r'[\\]?[^\s]', CHARACTER),
], re.UNICODE)

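# Sketch of how the scanner is consumed: Scanner.scan yields
# (token, token_type) pairs, as used throughout this module, e.g.
#   list(transform_scanner.scan(u'$1 x')) =>
#       [(u'$1', GROUP_REF), (u' ', WHITESPACE), (u'x', CHARACTER)]
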
CHAR_RANGE = 'CHAR_RANGE'
WORD_BOUNDARY = 'WORD_BOUNDARY'
NEGATION = 'NEGATION'
INTERSECTION = 'INTERSECTION'

# Scanner for a character set (yes, a regex regex)

char_set_scanner = Scanner([
    ('^\^', NEGATION),
    (r'[\\]?[^\\]\-[\\]?.', CHAR_RANGE),
    (r'[\\].', ESCAPED_CHARACTER),
    (r'\'\'', SINGLE_QUOTE),
    (r'\'.*?\'', QUOTED_STRING),
    (':[^:]+:', CHAR_CLASS),
    # Char set
    ('\[[^\[\]]+\]', CHAR_SET),
    ('\[', OPEN_SET),
    ('\]', CLOSE_SET),
    ('&', INTERSECTION),
    ('\$', WORD_BOUNDARY),
    (r'[^\s]', CHARACTER),
])

NUM_CHARS = 65536

all_chars = set([unichr(i) for i in xrange(NUM_CHARS)])


def get_transforms():
    return [f for f in os.listdir(CLDR_TRANSFORMS_DIR) if f.endswith('.xml')]


def replace_html_entity(ent):
    name = ent.strip('&;')
    return unichr(htmlentitydefs.name2codepoint[name])


def parse_regex_char_range(regex):
    ranges = range_regex.findall(regex)
    regex = range_regex.sub('', regex)
    # Characters that aren't part of a range pass through as-is
    chars = [c for c in regex]

    for start, end in ranges:
        if ord(end) > ord(start):
            # Ranges are inclusive
            chars.extend([unichr(c) for c in range(ord(start), ord(end) + 1)])

    return chars

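# A sketch of the intended behavior:
#   parse_regex_char_range(u'A-C') => [u'A', u'B', u'C']
#   parse_regex_char_range(u'xy')  => [u'x', u'y']  (no ranges present)
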
def parse_regex_char_class(c):
    chars = []
    c = c.strip(':')
    is_negation = False
    if c.startswith('^'):
        is_negation = True
        c = c.strip('^')

    if '=' in c:
        cat, c = c.split('=')
        if cat.strip() in ('script', 'sc'):
            c = c.strip()

    c = unicode_category_aliases.get(c.lower(), c)

    if c in unicode_general_categories:
        chars = unicode_general_categories[c]
    elif c in unicode_categories:
        chars = unicode_categories.get(c)
    elif c.lower() in unicode_scripts:
        chars = unicode_scripts[c.lower()]
    elif c.lower() in unicode_properties:
        chars = unicode_properties[c.lower()]
    else:
        chars = []

    if is_negation:
        chars = sorted(all_chars - set(chars))

    return chars


def parse_regex_char_set(s):
    '''
    Given a regex character set, which may look something like:

    [[:Latin:][:Greek:] & [:Ll:]]
    [A-Za-z_]
    [ $lowerVowel $upperVowel ]

    Parse into a single, flat character set without the unicode properties,
    ranges, unions/intersections, etc.
    '''
    s = s[1:-1]
    is_negation = False
    this_group = set()
    is_intersection = False
    is_word_boundary = False

    for token, token_class in char_set_scanner.scan(s):
        if token_class == CHAR_RANGE:
            this_char_set = set(parse_regex_char_range(token))
            this_group |= this_char_set
        elif token_class == ESCAPED_CHARACTER:
            token = token.strip('\\')
            this_group.add(token)
        elif token_class == SINGLE_QUOTE:
            this_group.add("'")
        elif token_class == QUOTED_STRING:
            this_group.add(token.strip("'"))
        elif token_class == NEGATION:
            is_negation = True
        elif token_class == CHAR_CLASS:
            this_group |= set(parse_regex_char_class(token))
        elif token_class == CHAR_SET:
            # Recursive calls, as performance doesn't matter here and nesting is shallow
            this_char_set = set(parse_regex_char_set(token))
            # Shouldn't be complex set expression logic here
            if is_intersection:
                this_group &= this_char_set
            else:
                this_group |= this_char_set
        elif token_class == INTERSECTION:
            is_intersection = True
        elif token_class == CHARACTER:
            this_group.add(token)
        elif token_class == WORD_BOUNDARY:
            is_word_boundary = True

    if is_negation:
        this_group = all_chars - this_group

    return sorted(this_group) + (['$'] if is_word_boundary else [])

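# For example (a sketch):
#   parse_regex_char_set(u'[A-Ca-c]') => [u'A', u'B', u'C', u'a', u'b', u'c']
# Nested sets, classes and ranges are flattened into one sorted character list.
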
for name, regex_range in unicode_property_regexes:
    unicode_properties[name] = parse_regex_char_set(regex_range)


def get_source_and_target(xml):
    return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0]


def get_raw_rules_and_variables(xml):
    '''
    Parse tRule nodes from the transform XML.

    At this point we only care about the lvalue, op and rvalue
    for parsing forward and two-way transforms.

    Variables are collected in a dictionary in this pass so they can be substituted later.
    '''
    rules = []
    variables = {}

    for rule in xml.xpath('*//tRule'):
        if not rule.text:
            continue
        rule = safe_decode(rule.text.rsplit(COMMENT_CHAR)[0].strip())
        rule = literal_space_regex.sub(replace_literal_space, rule)
        rule = escaped_unicode_regex.sub(unescape_unicode_char, rule)
        rule = rule.rstrip(END_CHAR).strip()

        transform = transform_regex.match(rule)
        if transform:
            lvalue, op, rvalue = transform.groups()
            lvalue = lvalue.strip()
            rvalue = rvalue.strip()

            if op == FORWARD_TRANSFORM_OP:
                rules.append((FORWARD_TRANSFORM, (lvalue, rvalue)))
            elif op == BIDIRECTIONAL_TRANSFORM_OP:
                rules.append((BIDIRECTIONAL_TRANSFORM, (lvalue, rvalue)))
            elif op == BACKWARD_TRANSFORM_OP:
                rules.append((BACKWARD_TRANSFORM, (lvalue, rvalue)))
            elif op == ASSIGNMENT_OP:
                var_name = lvalue.lstrip('$')
                variables[var_name] = rvalue
        else:
            pre_transform = pre_transform_full_regex.match(rule)
            if pre_transform:
                rules.append((PRE_TRANSFORM, pre_transform.group(1)))

    return rules, variables

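# For instance, a tRule body like u'$wordBoundary { х → kh ;' is collected
# here as (FORWARD_TRANSFORM, (u'$wordBoundary { х', u'kh')); the context
# braces are only split apart later by context_regex.
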
CHAR_CLASSES = set([
    ESCAPED_CHARACTER,
    CHAR_CLASS,
    QUOTED_STRING,
    CHARACTER,
    GROUP_REF,
])


def char_permutations(s):
    '''
    char_permutations

    Parses the lvalue or rvalue of a transform rule into
    a list of character permutations, in addition to keeping
    track of revisits and regex groups.
    '''
    char_types = []
    move = 0
    in_revisit = False

    in_group = False
    last_token_group_start = False

    start_group = 0
    end_group = 0

    open_brackets = 0
    current_set = []

    groups = []

    for token, token_type in transform_scanner.scan(s):
        if open_brackets > 0 and token_type not in (OPEN_SET, CLOSE_SET):
            current_set.append(token)
            continue

        if token_type == ESCAPED_CHARACTER:
            char_types.append([token.strip('\\')])
        elif token_type == OPEN_GROUP:
            in_group = True
            last_token_group_start = True
        elif token_type == CLOSE_GROUP:
            in_group = False
            end_group = len(char_types)
            groups.append((start_group, end_group))
        elif token_type == OPEN_SET:
            open_brackets += 1
            current_set.append(token)
        elif token_type == CLOSE_SET:
            open_brackets -= 1
            current_set.append(token)
            if open_brackets == 0:
                char_types.append(parse_regex_char_set(u''.join(current_set)))
                current_set = []
        elif token_type == QUOTED_STRING:
            token = token.strip("'")
            for c in token:
                char_types.append([c])
        elif token_type == GROUP_REF:
            char_types.append([token])
        elif token_type == REVISIT:
            in_revisit = True
        elif token_type == REPEAT:
            char_types.append([STAR])
        elif token_type == PLUS:
            char_types.append([PLUS])
        elif token_type == OPTIONAL:
            char_types[-1].append('')
        elif token_type == HTML_ENTITY:
            char_types.append([replace_html_entity(token)])
        elif token_type == CHARACTER:
            char_types.append([token])

        if in_group and last_token_group_start:
            start_group = len(char_types)
            last_token_group_start = False

        if in_revisit and token_type in CHAR_CLASSES:
            move += 1

    return char_types, move, groups

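# A sketch of the return value:
#   char_permutations(u'x[ab]?') => ([[u'x'], [u'a', u'b', u'']], 0, [])
# i.e. a literal x followed by (a|b|empty), with no revisit movement and
# no capture groups.
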
string_replacements = {
    u'[': u'\[',
    u']': u'\]',
    u'(': u'\(',
    u')': u'\)',
    u'{': u'\{',
    u'}': u'\}',
    u'$': u'\$',
    u'^': u'\^',
    u'\\': u'\\\\',
    u'\u0000': '',
    u'': EMPTY_TRANSITION,
    u'*': u'\*',
    u'+': u'\+',
    PLUS: u'+',
    STAR: u'*',
}

escape_sequence_long_regex = re.compile(r'(\\x[0-9a-f]{2})([0-9a-f])', re.I)


def replace_long_escape_sequence(s):
    def replace_match(m):
        return u'{}""{}'.format(m.group(1), m.group(2))

    return escape_sequence_long_regex.sub(replace_match, s)


def quote_string(s):
    return u'"{}"'.format(replace_long_escape_sequence(safe_decode(s).replace('"', '\\"')))


def char_types_string(char_types):
    '''
    Transforms the char_permutations output into a string
    suitable for simple parsing in C (characters and character sets only,
    no variables, unicode character properties or unions/intersections).
    '''
    ret = []

    for chars in char_types:
        template = u'{}' if len(chars) == 1 else u'[{}]'
        norm = []
        for c in chars:
            c = string_replacements.get(c, c)
            norm.append(c)

        ret.append(template.format(u''.join(norm)))

    return u''.join(ret)


def format_groups(char_types, groups):
    group_regex = []
    last_end = 0
    for start, end in groups:
        group_regex.append(char_types_string(char_types[last_end:start]))
        group_regex.append(u'(')
        # (start, end) indices from char_permutations are half-open
        group_regex.append(char_types_string(char_types[start:end]))
        group_regex.append(u')')
        last_end = end
    group_regex.append(char_types_string(char_types[last_end:]))
    return u''.join(group_regex)

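# e.g. char_permutations(u'a(b)c') produces groups [(1, 2)] (half-open), so
# format_groups restores the capture parentheses, yielding u'a(b)c'.
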
charset_regex = re.compile(r'(?<!\\)\[')


def escape_string(s):
    return s.encode('string-escape')


def format_rule(rule):
    '''
    Creates the C literal for a given transliteration rule
    '''
    key = safe_encode(rule[0])
    key_len = len(key)

    pre_context_type = rule[1]
    pre_context = rule[2]
    if pre_context is None:
        pre_context = 'NULL'
        pre_context_len = 0
    else:
        pre_context = safe_encode(pre_context)
        pre_context_len = len(pre_context)
        pre_context = quote_string(escape_string(pre_context))

    pre_context_max_len = rule[3]

    post_context_type = rule[4]
    post_context = rule[5]

    if post_context is None:
        post_context = 'NULL'
        post_context_len = 0
    else:
        post_context = safe_encode(post_context)
        post_context_len = len(post_context)
        post_context = quote_string(escape_string(post_context))

    post_context_max_len = rule[6]

    groups = rule[7]
    if not groups:
        groups = 'NULL'
        groups_len = 0
    else:
        groups = safe_encode(groups)
        groups_len = len(groups)
        groups = quote_string(escape_string(groups))

    replacement = safe_encode(rule[8])
    replacement_len = len(replacement)
    move = rule[9]

    output_rule = (
        quote_string(escape_string(key)),
        str(key_len),

        pre_context_type,
        str(pre_context_max_len),
        pre_context,
        str(pre_context_len),

        post_context_type,
        str(post_context_max_len),
        post_context,
        str(post_context_len),

        quote_string(escape_string(replacement)),
        str(replacement_len),
        str(move),
        groups,
        str(groups_len),
    )

    return output_rule

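# The 15-element tuples produced above mirror the C struct emitted by this
# script (transliteration_rule_source_t in the template below); the
# hand-written entries in supplemental_transliterations follow the same layout.
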
def parse_transform_rules(xml):
    '''
    parse_transform_rules takes a parsed xml document as input
    and generates rules suitable for use in the C code.

    Since we're only concerned with transforming into Latin/ASCII,
    we don't care about backward transforms or two-way contexts.
    Only the lvalue's context needs to be used.
    '''
    rules, variables = get_raw_rules_and_variables(xml)

    def get_var(m):
        return variables.get(m.group(1))

    # Replace variables within variables
    while True:
        num_found = 0
        for k, v in variables.items():
            if var_regex.search(v):
                v = var_regex.sub(get_var, v)
                variables[k] = v
                num_found += 1
        if num_found == 0:
            break

    variables[WORD_BOUNDARY_VAR_NAME] = WORD_BOUNDARY_VAR

    for rule_type, rule in rules:
        if rule_type in (BIDIRECTIONAL_TRANSFORM, FORWARD_TRANSFORM):
            left, right = rule
            left = var_regex.sub(get_var, left)
            right = var_regex.sub(get_var, right)

            left_pre_context, left, left_post_context = context_regex.match(left).groups()
            right_pre_context, right, right_post_context = context_regex.match(right).groups()

            left_pre_context_max_len = 0
            left_post_context_max_len = 0

            left_pre_context_type = CONTEXT_TYPE_NONE
            left_post_context_type = CONTEXT_TYPE_NONE

            move = 0
            left_groups = []
            right_groups = []

            if left_pre_context:
                if left_pre_context.strip() == WORD_BOUNDARY_VAR:
                    left_pre_context = None
                    left_pre_context_type = CONTEXT_TYPE_WORD_BOUNDARY
                else:
                    left_pre_context, _, _ = char_permutations(left_pre_context.strip())
                    left_pre_context_max_len = len(left_pre_context or [])
                    left_pre_context = char_types_string(left_pre_context)
                    if charset_regex.search(left_pre_context):
                        left_pre_context_type = CONTEXT_TYPE_REGEX
                    else:
                        left_pre_context_type = CONTEXT_TYPE_STRING

            if left:
                left, _, left_groups = char_permutations(left.strip())
                if left_groups:
                    left_groups = format_groups(left, left_groups)
                else:
                    left_groups = None
                left = char_types_string(left)

            if left_post_context:
                if left_post_context.strip() == WORD_BOUNDARY_VAR:
                    left_post_context = None
                    left_post_context_type = CONTEXT_TYPE_WORD_BOUNDARY
                else:
                    left_post_context, _, _ = char_permutations(left_post_context.strip())
                    left_post_context_max_len = len(left_post_context or [])
                    left_post_context = char_types_string(left_post_context)
                    if charset_regex.search(left_post_context):
                        left_post_context_type = CONTEXT_TYPE_REGEX
                    else:
                        left_post_context_type = CONTEXT_TYPE_STRING

            if right:
                right, move, right_groups = char_permutations(right.strip())
                right = char_types_string(right)

            yield RULE, (left, left_pre_context_type, left_pre_context, left_pre_context_max_len,
                         left_post_context_type, left_post_context, left_post_context_max_len, left_groups, right, move)

        elif rule_type == PRE_TRANSFORM and '[' in rule and ']' in rule:
            continue
        elif rule_type == PRE_TRANSFORM:
            pre_transform = pre_transform_regex.match(rule)
            if pre_transform:
                yield TRANSFORM, pre_transform.group(1)


STEP_RULESET = 'STEP_RULESET'
STEP_TRANSFORM = 'STEP_TRANSFORM'
STEP_UNICODE_NORMALIZATION = 'STEP_UNICODE_NORMALIZATION'

NEW_STEP = 'NEW_STEP'
EXISTING_STEP = 'EXISTING_STEP'

# Extra rules defined here
supplemental_transliterations = {
    'latin-ascii': (EXISTING_STEP, [
        # German transliterations not handled by standard NFD normalization.
        # Keys and replacements are pre-quoted; lengths are UTF-8 byte lengths.
        (u'"ä"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),  # ä => ae
        (u'"ö"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),  # ö => oe
        (u'"ü"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),  # ü => ue
        (u'"ß"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),  # ß => ss
    ]),
}


def get_all_transform_rules():
    transforms = {}
    to_latin = set()

    retain_transforms = set()

    init_unicode_categories()

    all_transforms = set([os.path.splitext(f)[0].lower() for f in get_transforms()])

    for filename in get_transforms():
        name = os.path.splitext(filename)[0].lower()

        with open(os.path.join(CLDR_TRANSFORMS_DIR, filename)) as f:
            xml = etree.parse(f)
        source, target = get_source_and_target(xml)

        if (target.lower() == 'latin' or name == 'latin-ascii') and name not in EXCLUDE_TRANSLITERATORS:
            to_latin.add(name)
            retain_transforms.add(name)

        steps = []
        rule_set = []
        for rule_type, rule in parse_transform_rules(xml):
            if rule_type == RULE:
                rule = format_rule(rule)
                rule_set.append(rule)
            elif rule_type == TRANSFORM:
                if rule_set:
                    steps.append((STEP_RULESET, rule_set))
                    rule_set = []
                if name in to_latin and rule.lower() in all_transforms:
                    retain_transforms.add(rule.lower())
                steps.append((STEP_TRANSFORM, rule))

                rule = UTF8PROC_TRANSFORMS.get(rule, rule)
                if rule in UNICODE_NORMALIZATION_TRANSFORMS:
                    steps.append((STEP_UNICODE_NORMALIZATION, rule))

        if rule_set:
            steps.append((STEP_RULESET, rule_set))

        transforms[name] = steps

    all_rules = []
    all_steps = []
    all_transforms = []

    for name, steps in transforms.iteritems():
        if name in supplemental_transliterations:
            step_type, rules = supplemental_transliterations[name]
            if step_type == EXISTING_STEP:
                steps[-1][1].extend(rules)
            else:
                steps.append((STEP_RULESET, rules))
        # Only care if it's a transform to Latin/ASCII or a dependency
        # for a transform to Latin/ASCII
        elif name not in retain_transforms:
            continue
        step_index = len(all_steps)
        num_steps = len(steps)
        for i, (step_type, data) in enumerate(steps):
            if step_type == STEP_RULESET:
                rule_index = len(all_rules)
                num_rules = len(data)
                step = (STEP_RULESET, str(rule_index), str(num_rules), quote_string(str(i)))
                all_rules.extend(data)
            elif step_type == STEP_TRANSFORM:
                step = (STEP_TRANSFORM, '-1', '-1', quote_string(data))
            elif step_type == STEP_UNICODE_NORMALIZATION:
                step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', quote_string(data))
            all_steps.append(step)

        internal = int(name not in to_latin)

        transliterator = (quote_string(name), str(internal), str(step_index), str(num_steps))
        all_transforms.append(transliterator)

    return all_transforms, all_steps, all_rules


transliteration_data_template = u'''#include <stdlib.h>

transliteration_rule_source_t rules_source[] = {{
    {all_rules}
}};

transliteration_step_source_t steps_source[] = {{
    {all_steps}
}};

transliterator_source_t transliterators_source[] = {{
    {all_transforms}
}};

'''

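# Rendered output is roughly the following (a sketch with made-up rule data):
#
#   transliteration_rule_source_t rules_source[] = {
#       {"a",1,CONTEXT_TYPE_NONE,0,NULL,0,CONTEXT_TYPE_NONE,0,NULL,0,"a",1,0,NULL,0},
#       ...
#   };
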
def create_transliterator(name, internal, steps):
    # Unused helper: transliterator_template is not defined in this module
    return transliterator_template.format(name=name, internal=int(internal), num_steps=len(steps))


TRANSLITERATION_DATA_FILENAME = 'transliteration_data.c'


def main(out_dir):
    transforms, steps, rules = get_all_transform_rules()

    all_transforms = u''',
    '''.join([u'{{{}}}'.format(u','.join(t)) for t in transforms])

    all_steps = u''',
    '''.join([u'{{{}}}'.format(u','.join(s)) for s in steps])

    all_rules = u''',
    '''.join([u'{{{}}}'.format(u','.join(r)) for r in rules])

    template = transliteration_data_template.format(
        all_transforms=all_transforms,
        all_steps=all_steps,
        all_rules=all_rules
    )

    with open(os.path.join(out_dir, TRANSLITERATION_DATA_FILENAME), 'w') as f:
        f.write(safe_encode(template))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'Usage: python transliteration.py out_dir'
        sys.exit(1)
    main(sys.argv[1])