# -*- coding: utf-8 -*- ''' transliteration.py Automatically builds rules for transforming other scripts (e.g. Cyrillic, Greek, Han, Katakana, Devanagari, etc.) into Latin characters. Uses XML transforms from the CLDR repository. ''' import argparse import codecs import csv import htmlentitydefs import itertools import os import re import requests import sys import time import urlparse import unicodedata from collections import defaultdict, deque from lxml import etree from scanner import Scanner from unicode_properties import * from unicode_paths import CLDR_DIR from geodata.encoding import safe_decode, safe_encode CLDR_TRANSFORMS_DIR = os.path.join(CLDR_DIR, 'common', 'transforms') PRE_TRANSFORM = 1 FORWARD_TRANSFORM = 2 BACKWARD_TRANSFORM = 3 BIDIRECTIONAL_TRANSFORM = 4 PRE_TRANSFORM_OP = '::' BACKWARD_TRANSFORM_OPS = set([u'←', u'<']) FORWARD_TRANSFORM_OPS = set([u'→', u'>']) BIDIRECTIONAL_TRANSFORM_OPS = set([u'↔', u'<>']) ASSIGNMENT_OP = '=' PRE_CONTEXT_INDICATOR = '{' POST_CONTEXT_INDICATOR = '}' REVISIT_INDICATOR = '|' WORD_BOUNDARY_VAR_NAME = 'wordBoundary' WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME) START_OF_HAN_VAR_NAME = 'startOfHanMarker' START_OF_HAN_VAR = '${}'.format(START_OF_HAN_VAR_NAME) start_of_han_regex = re.compile(START_OF_HAN_VAR.replace('$', '\$')) word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$')) WORD_BOUNDARY_CHAR = u'\u0001' EMPTY_TRANSITION = u'\u0004' NAMESPACE_SEPARATOR_CHAR = u"|" WORD_BOUNDARY_CHAR = u"\x01" PRE_CONTEXT_CHAR = u"\x02" POST_CONTEXT_CHAR = u"\x03" EMPTY_TRANSITION_CHAR = u"\x04" REPEAT_CHAR = u"\x05" GROUP_INDICATOR_CHAR = u"\x06" BEGIN_SET_CHAR = u"\x0e" END_SET_CHAR = u"\x0f" BIDIRECTIONAL_TRANSLITERATORS = { 'fullwidth-halfwidth': 'halfwidth-fullwidth' } REVERSE_TRANSLITERATORS = { 'latin-katakana': 'katakana-latin', } EXCLUDE_TRANSLITERATORS = set([ 'hangul-latin', 'jamo-latin', # Don't care about spaced Han because our tokenizer does it already 'han-spacedhan', ]) NFD = 'NFD' NFKD = 'NFKD' NFC = 'NFC' NFKC = 'NFKC' STRIP_MARK = 'STRIP_MARK' LOWER = 'lower' UPPER = 'upper' TITLE = 'title' UNICODE_NORMALIZATION_TRANSFORMS = set([ NFD, NFKD, NFC, NFKC, STRIP_MARK, ]) unicode_category_aliases = { 'letter': 'L', 'lower': 'Ll', 'lowercase': 'Ll', 'lowercaseletter': 'Ll', 'upper': 'Lu', 'uppercase': 'Lu', 'uppercaseletter': 'Lu', 'title': 'Lt', 'nonspacing mark': 'Mn', 'mark': 'M', } unicode_categories = defaultdict(list) unicode_blocks = defaultdict(list) unicode_combining_classes = defaultdict(list) unicode_general_categories = defaultdict(list) unicode_scripts = defaultdict(list) unicode_properties = {} unicode_script_ids = {} unicode_blocks = {} unicode_category_aliases = {} unicode_property_aliases = {} unicode_property_value_aliases = {} unicode_word_breaks = {} COMBINING_CLASS_PROP = 'canonical_combining_class' BLOCK_PROP = 'block' GENERAL_CATEGORY_PROP = 'general_category' SCRIPT_PROP = 'script' WORD_BREAK_PROP = 'word_break' class TransliterationParseError(Exception): pass def init_unicode_categories(): global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases global unicode_property_value_aliases, unicode_scripts, unicode_script_ids, unicode_word_breaks for i in xrange(NUM_CHARS): unicode_categories[unicodedata.category(unichr(i))].append(unichr(i)) unicode_combining_classes[str(unicodedata.combining(unichr(i)))].append(unichr(i)) unicode_categories = dict(unicode_categories) unicode_combining_classes = dict(unicode_combining_classes) for key in unicode_categories.keys(): unicode_general_categories[key[0]].extend(unicode_categories[key]) unicode_general_categories = dict(unicode_general_categories) script_chars = get_chars_by_script() for i, script in enumerate(script_chars): if script: unicode_scripts[script.lower()].append(unichr(i)) unicode_scripts = dict(unicode_scripts) unicode_script_ids.update(build_master_scripts_list(script_chars)) unicode_blocks.update(get_unicode_blocks()) unicode_properties.update(get_unicode_properties()) unicode_property_aliases.update(get_property_aliases()) unicode_word_breaks.update(get_word_break_properties()) for key, value in get_property_value_aliases().iteritems(): key = unicode_property_aliases.get(key, key) if key == GENERAL_CATEGORY_PROP: for k, v in value.iteritems(): k = k.lower() unicode_category_aliases[k] = v if '_' in k: unicode_category_aliases[k.replace('_', '')] = v unicode_property_value_aliases[key] = value RULE = 'RULE' TRANSFORM = 'TRANSFORM' FILTER = 'FILTER' UTF8PROC_TRANSFORMS = { 'Any-NFC': NFC, 'Any-NFD': NFD, 'Any-NFKD': NFKD, 'Any-NFKC': NFKC, 'Any-Lower': LOWER, 'Any-Upper': UPPER, 'Any-Title': TITLE, } CONTEXT_TYPE_NONE = 'CONTEXT_TYPE_NONE' CONTEXT_TYPE_STRING = 'CONTEXT_TYPE_STRING' CONTEXT_TYPE_WORD_BOUNDARY = 'CONTEXT_TYPE_WORD_BOUNDARY' CONTEXT_TYPE_REGEX = 'CONTEXT_TYPE_REGEX' all_transforms = set() pre_transform_full_regex = re.compile('::[\s]*(.*)[\s]*', re.UNICODE) pre_transform_regex = re.compile('[\s]*([^\s\(\)]*)[\s]*(?:\((.*)\)[\s]*)?', re.UNICODE) assignment_regex = re.compile(u"(?:[\s]*(\$[^\s\=]+)[\s]*\=[\s]*(?!=[\s])(.*)(?)|[←<→>↔])(?:[\s]*(?!=[\s])(.*)(? ord(start): # Ranges are inclusive chars.extend([unichr(c) for c in range(ord(start), ord(end) + 1)]) return chars def parse_regex_char_class(c, current_filter=all_chars): chars = [] orig = c if c.startswith('\\p'): c = c.split('{')[-1].split('}')[0] c = c.strip(': ') is_negation = False if c.startswith('^'): is_negation = True c = c.strip('^') if '=' in c: prop, value = c.split('=') prop = unicode_property_aliases.get(prop.lower(), prop) value = unicode_property_value_aliases.get(prop.lower(), {}).get(value, value) if prop == COMBINING_CLASS_PROP: chars = unicode_combining_classes[value] elif prop == GENERAL_CATEGORY_PROP: chars = unicode_categories.get(value, unicode_general_categories[value]) elif prop == BLOCK_PROP: chars = unicode_blocks[value.lower()] elif prop == SCRIPT_PROP: chars = unicode_scripts[value.lower()] elif prop == WORD_BREAK_PROP: chars = unicode_word_breaks[value] else: raise TransliterationParseError(c) else: c = c.replace('-', '_').replace(' ', '_') if c.lower() in unicode_property_aliases: c = unicode_property_aliases[c.lower()] elif c.lower() in unicode_category_aliases: c = unicode_category_aliases[c.lower()] if c in unicode_general_categories: chars = unicode_general_categories[c] elif c in unicode_categories: chars = unicode_categories[c] elif c.lower() in unicode_properties: chars = unicode_properties[c.lower()] elif c.lower() in unicode_scripts: chars = unicode_scripts[c.lower()] elif c.lower() in unicode_properties: chars = unicode_properties[c.lower()] else: raise TransliterationParseError(c) if is_negation: chars = current_filter - set(chars) return sorted((set(chars) & current_filter) - control_chars) def parse_balanced_sets(s): open_brackets = 0 max_nesting = 0 skip = False for i, ch in enumerate(s): if ch == '[': if open_brackets == 0: start = i max_nesting open_brackets += 1 elif ch == ']': open_brackets -= 1 if open_brackets == 0: skip = False yield (s[start:i + 1], CHAR_MULTI_SET) (start, i + 1) elif open_brackets == 0 and not skip: for token, token_class in char_set_scanner.scan(s[i:]): if token_class not in (CHAR_SET, CHAR_MULTI_SET, OPEN_SET, CLOSE_SET): yield token, token_class else: break skip = True def parse_regex_char_set(s, current_filter=all_chars): ''' Given a regex character set, which may look something like: [[:Latin:][:Greek:] & [:Ll:]] [A-Za-z_] [ $lowerVowel $upperVowel ] Parse into a single, flat character set without the unicode properties, ranges, unions/intersections, etc. ''' s = s[1:-1] is_negation = False this_group = set() is_intersection = False is_difference = False is_word_boundary = False real_chars = set() for token, token_class in parse_balanced_sets(s): if token_class == CHAR_RANGE: this_char_set = set(parse_regex_char_range(token)) this_group |= this_char_set elif token_class == ESCAPED_CHARACTER: token = token.strip('\\') this_group.add(token) real_chars.add(token) elif token_class == SINGLE_QUOTE: t = "'" this_group.add(t) real_chars.add(t) elif token_class == QUOTED_STRING: t = token.strip("'") this_group.add(t) real_chars.add(t) elif token_class == NEGATION: is_negation = True elif token_class in (CHAR_CLASS, CHAR_CLASS_PCRE): this_group |= set(parse_regex_char_class(token, current_filter=current_filter)) elif token_class in (CHAR_SET, CHAR_MULTI_SET): # Recursive calls, as performance doesn't matter here and nesting is shallow this_char_set = set(parse_regex_char_set(token, current_filter=current_filter)) if is_intersection: this_group &= this_char_set is_intersection = False elif is_difference: this_group -= this_char_set is_difference = False else: this_group |= this_char_set elif token_class == INTERSECTION: is_intersection = True elif token_class == DIFFERENCE: is_difference = True elif token_class == CHARACTER and token not in control_chars: this_group.add(token) real_chars.add(token) elif token_class == UNICODE_CHARACTER: token = token.decode('unicode-escape') if token not in control_chars: this_group.add(token) real_chars.add(token) elif token_class in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER): continue elif token_class == BRACKETED_CHARACTER: if token.strip('{{}}') not in control_chars: this_group.add(token) real_chars.add(token) elif token_class == WORD_BOUNDARY: is_word_boundary = True if is_negation: this_group = current_filter - this_group return sorted((this_group & (current_filter | real_chars)) - control_chars) + ([WORD_BOUNDARY_CHAR] if is_word_boundary else []) for name, regex_range in unicode_property_regexes: unicode_properties[name] = parse_regex_char_set(regex_range) def get_source_and_target(xml): return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0] def is_internal(xml): return xml.xpath('//transform/@visibility="internal"') def get_raw_rules_and_variables(xml, reverse=False): ''' Parse tRule nodes from the transform XML At this point we only care about lvalue, op and rvalue for parsing forward and two-way transforms. Variables are collected in a dictionary in this pass so they can be substituted later ''' rules = [] variables = {} in_compound_rule = False compound_rule = [] nodes = xml.xpath('*//tRule') if reverse: nodes = reversed(nodes) for rule in nodes: if not rule.text: continue rule = safe_decode(rule.text.rsplit(COMMENT_CHAR)[0].strip()) if rule not in rule_map: rule = literal_space_regex.sub(replace_literal_space, rule) rule = rule.rstrip(END_CHAR).strip() else: rule = rule_map[rule] if rule.strip().endswith('\\'): compound_rule.append(rule.rstrip('\\')) in_compound_rule = True continue elif in_compound_rule: compound_rule.append(rule) rule = u''.join(compound_rule) in_compound_rule = False compound_rule = [] assignment = assignment_regex.match(rule) transform = transform_regex.match(rule) pre_transform = pre_transform_full_regex.match(rule) if pre_transform: rules.append((PRE_TRANSFORM, pre_transform.group(1))) elif assignment: lvalue, rvalue = assignment.groups() var_name = lvalue.strip().lstrip('$') rvalue = rvalue.strip() variables[var_name] = rvalue elif transform: lvalue, op, rvalue = transform.groups() lvalue = lvalue.strip() rvalue = rvalue.strip() if op in FORWARD_TRANSFORM_OPS: rules.append((FORWARD_TRANSFORM, (lvalue, rvalue))) elif op in BIDIRECTIONAL_TRANSFORM_OPS: rules.append((BIDIRECTIONAL_TRANSFORM, (lvalue, rvalue))) elif op in BACKWARD_TRANSFORM_OPS: rules.append((BACKWARD_TRANSFORM, (lvalue, rvalue))) return rules, variables CHAR_CLASSES = set([ ESCAPED_CHARACTER, CHAR_CLASS, QUOTED_STRING, CHARACTER, GROUP_REF, ]) def char_permutations(s, current_filter=all_chars): ''' char_permutations Parses the lvalue or rvalue of a transform rule into a list of character permutations, in addition to keeping track of revisits and regex groups ''' if not s: return [EMPTY_TRANSITION_CHAR], [], [] char_types = [] revisit_char_types = [] in_revisit = False in_group = False last_token_group_start = False start_group = 0 end_group = 0 open_brackets = 0 current_set = [] current_chars = char_types groups = [] for token, token_type in transform_scanner.scan(s): if open_brackets > 0 and token_type not in (OPEN_SET, CLOSE_SET): current_set.append(token) continue if token_type == ESCAPED_CHARACTER: current_chars.append([token.strip('\\')]) elif token_type == OPEN_GROUP: in_group = True last_token_group_start = True elif token_type == CLOSE_GROUP: in_group = False end_group = len([c for c in current_chars if c[0] != REPEAT_CHAR]) groups.append((start_group, end_group)) elif token_type == OPEN_SET: open_brackets += 1 current_set.append(token) elif token_type == CLOSE_SET: open_brackets -= 1 current_set.append(token) if open_brackets == 0: char_set = parse_regex_char_set(u''.join(current_set), current_filter=current_filter) if char_set: current_chars.append(char_set) current_set = [] elif token_type == QUOTED_STRING: token = token.strip("'") for c in token: current_chars.append([c]) elif token_type == GROUP_REF: current_chars.append([token.replace('$', GROUP_INDICATOR_CHAR)]) elif token_type == REVISIT: in_revisit = True current_chars = revisit_char_types elif token_type == REPEAT: current_chars[-1].append(EMPTY_TRANSITION_CHAR) current_chars.append([REPEAT_CHAR]) elif token_type == REPEAT_ONE: current_chars.append([REPEAT_CHAR]) elif token_type == OPTIONAL: current_chars[-1].append(EMPTY_TRANSITION_CHAR) elif token_type == HTML_ENTITY: current_chars.append([replace_html_entity(token)]) elif token_type == CHARACTER: current_chars.append([token]) elif token_type == SINGLE_QUOTE: current_chars.append(["'"]) elif token_type == UNICODE_CHARACTER: token = token.decode('unicode-escape') current_chars.append([token]) elif token_type in (WIDE_CHARACTER, UNICODE_WIDE_CHARACTER): continue if in_group and last_token_group_start: start_group = len(current_chars) last_token_group_start = False return char_types, revisit_char_types, groups string_replacements = { u'[': u'\[', u']': u'\]', u'(': u'\(', u')': u'\)', u'{': u'\{', u'}': u'\{', u'$': u'\$', u'^': u'\^', u'-': u'\-', u'\\': u'\\\\', u'*': u'\*', u'+': u'\+', } escape_sequence_long_regex = re.compile(r'(\\x[0-9a-f]{2})([0-9a-f])', re.I) def replace_long_escape_sequence(s): def replace_match(m): return u'{}""{}'.format(m.group(1), m.group(2)) return escape_sequence_long_regex.sub(replace_match, s) def quote_string(s): return u'"{}"'.format(replace_long_escape_sequence(safe_decode(s).replace('"', '\\"'))) def char_types_string(char_types): ''' Transforms the char_permutations output into a string suitable for simple parsing in C (characters and character sets only, no variables, unicode character properties or unions/intersections) ''' ret = [] for chars in char_types: template = u'{}' if len(chars) == 1 else u'[{}]' norm = [] for c in chars: c = string_replacements.get(c, c) norm.append(c) ret.append(template.format(u''.join(norm))) return u''.join(ret) def format_groups(char_types, groups): group_regex = [] last_end = 0 for start, end in groups: group_regex.append(char_types_string(char_types[last_end:start])) group_regex.append(u'(') group_regex.append(char_types_string(char_types[start:end])) group_regex.append(u')') last_end = end group_regex.append(char_types_string(char_types[last_end:])) return u''.join(group_regex) charset_regex = re.compile(r'(? ae (u'"\\xc3\\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', 'NULL', '0', 'NULL', '0'), # Ä => Ae if followed by lower case Latin letter (u'"\\xc3\\x84"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', '"[{}]"'.format(latin_lower_set), '0', u'"Ae"', '2', 'NULL', '0', 'NULL', '0'), # Ä => AE otherwise (u'"\\xc3\\x84"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"AE"', '2', 'NULL', '0', 'NULL', '0'), # ö => oe (u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', 'NULL', '0', 'NULL', '0'), # Ö => Oe if followed by lower case Latin letter (u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', '"[{}]"'.format(latin_lower_set), '0', u'"Oe"', '2', 'NULL', '0', 'NULL', '0'), # Ö => OE otherwise (u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"OE"', '2', 'NULL', '0', 'NULL', '0'), # Note this is the German form. In Swedish ü => y, # might make sense to split these rules into # language-specific transliterators # ü => ue (u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', 'NULL', '0', 'NULL', '0'), # Ü => Ue if followed by lower case Latin letter (u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', '"[{}]"'.format(latin_lower_set), '0', u'"Ue"', '2', 'NULL', '0', 'NULL', '0'), # Ü => UE otherwise (u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"UE"', '2', 'NULL', '0', 'NULL', '0'), # Swedish transliterations not handled by standard NFD normalization (u'"\\xc3\\xa5"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"aa"', '2', 'NULL', '0', 'NULL', '0'), # Å => Aa if followed by lower case Latin letter (u'"\\xc3\\x85"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', '"[{}]"'.format(latin_lower_set), '0', u'"Aa"', '2', 'NULL', '0', 'NULL', '0'), # Å => AA otherwise (u'"\\xc3\\x85"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"AA"', '2', 'NULL', '0', 'NULL', '0'), ]), (PREPEND_STEP, [(quote_string(name), str(len(name)), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), 'NULL', '0', 'NULL', '0') for name, value in html_escapes.iteritems() ] ), ], } def get_all_transform_rules(): transforms = {} to_latin = set() retain_transforms = set() init_unicode_categories() all_transforms = set([name.split('.xml')[0].lower() for name in get_transforms()]) name_aliases = {} for filename in get_transforms(): name = filename.split('.xml')[0].lower() f = open(os.path.join(CLDR_TRANSFORMS_DIR, filename)) xml = etree.parse(f) source, target = get_source_and_target(xml) name_alias = '-'.join([source.lower(), target.lower()]) if name_alias not in name_aliases: name_aliases[name_alias] = name if name in REVERSE_TRANSLITERATORS: all_transforms.add(REVERSE_TRANSLITERATORS[name]) elif name in BIDIRECTIONAL_TRANSLITERATORS: all_transforms.add(BIDIRECTIONAL_TRANSLITERATORS[name]) dependencies = defaultdict(list) def parse_steps(name, xml, reverse=False): steps = [] rule_set = [] for rule_type, rule in parse_transform_rules(xml, reverse=reverse): if rule_type == RULE: rule = format_rule(rule) rule_set.append(rule) elif rule_type == TRANSFORM: if rule_set: steps.append((STEP_RULESET, rule_set)) rule_set = [] if rule.lower() in all_transforms and rule.lower() not in EXCLUDE_TRANSLITERATORS: dependencies[name].append(rule.lower()) steps.append((STEP_TRANSFORM, rule.lower())) elif rule.lower() in name_aliases and rule.lower() not in EXCLUDE_TRANSLITERATORS: dep = name_aliases[rule.lower()] dependencies[name].append(dep) steps.append((STEP_TRANSFORM, dep)) elif rule.split('-')[0].lower() in all_transforms and rule.split('-')[0].lower() not in EXCLUDE_TRANSLITERATORS: dependencies[name].append(rule.split('-')[0].lower()) steps.append((STEP_TRANSFORM, rule.split('-')[0].lower())) rule = UTF8PROC_TRANSFORMS.get(rule, rule) if rule in UNICODE_NORMALIZATION_TRANSFORMS: steps.append((STEP_UNICODE_NORMALIZATION, rule)) if rule_set: steps.append((STEP_RULESET, rule_set)) return steps for filename in get_transforms(): name = filename.split('.xml')[0].lower() f = open(os.path.join(CLDR_TRANSFORMS_DIR, filename)) xml = etree.parse(f) source, target = get_source_and_target(xml) internal = is_internal(xml) if name in EXCLUDE_TRANSLITERATORS: continue reverse = name in REVERSE_TRANSLITERATORS bidirectional = name in BIDIRECTIONAL_TRANSLITERATORS if target.lower() == 'latin' or name == 'latin-ascii' and not internal: to_latin.add(name) retain_transforms.add(name) elif (reverse and source.lower() == 'latin') and not internal: to_latin.add(REVERSE_TRANSLITERATORS[name]) retain_transforms.add(REVERSE_TRANSLITERATORS[name]) elif (bidirectional and source.lower() == 'latin') and not internal: to_latin.add(BIDIRECTIONAL_TRANSLITERATORS[name]) retain_transforms.add(BIDIRECTIONAL_TRANSLITERATORS[name]) print 'doing', filename if not reverse and not bidirectional: steps = parse_steps(name, xml, reverse=False) transforms[name] = steps elif reverse: name = REVERSE_TRANSLITERATORS[name] steps = parse_steps(name, xml, reverse=True) transforms[name] = steps elif bidirectional: steps = parse_steps(name, xml, reverse=False) transforms[name] = steps name = BIDIRECTIONAL_TRANSLITERATORS[name] all_transforms.add(name) steps = parse_steps(name, xml, reverse=True) transforms[name] = steps dependency_queue = deque(to_latin) retain_transforms |= to_latin seen = set() while dependency_queue: name = dependency_queue.popleft() for dep in dependencies.get(name, []): retain_transforms.add(dep) if dep not in seen: dependency_queue.append(dep) seen.add(dep) all_rules = [] all_steps = [] all_transforms = [] for name, steps in transforms.iteritems(): if name in supplemental_transliterations: for step_type, rules in supplemental_transliterations[name]: if step_type == EXISTING_STEP: steps[-1][1].extend(rules) elif step_type == PREPEND_STEP: steps = [(STEP_RULESET, rules)] + steps else: steps.append((STEP_RULESET, rules)) # Only care if it's a transform to Latin/ASCII or a dependency # for a transform to Latin/ASCII elif name not in retain_transforms: continue step_index = len(all_steps) num_steps = len(steps) for i, (step_type, data) in enumerate(steps): if step_type == STEP_RULESET: rule_index = len(all_rules) num_rules = len(data) step = (STEP_RULESET, str(rule_index), str(num_rules), quote_string(str(i))) all_rules.extend(data) elif step_type == STEP_TRANSFORM: step = (STEP_TRANSFORM, '-1', '-1', quote_string(data)) elif step_type == STEP_UNICODE_NORMALIZATION: step = (STEP_UNICODE_NORMALIZATION, '-1', '-1', quote_string(data)) all_steps.append(step) internal = int(name not in to_latin) transliterator = (quote_string(name.replace('_', '-')), str(internal), str(step_index), str(num_steps)) all_transforms.append(transliterator) return all_transforms, all_steps, all_rules transliteration_data_template = u'''#include transliteration_rule_source_t rules_source[] = {{ {all_rules} }}; transliteration_step_source_t steps_source[] = {{ {all_steps} }}; transliterator_source_t transliterators_source[] = {{ {all_transforms} }}; ''' transliterator_script_data_template = u''' #ifndef TRANSLITERATION_SCRIPTS_H #define TRANSLITERATION_SCRIPTS_H #include #include "unicode_scripts.h" #include "transliterate.h" typedef struct script_transliteration_rule {{ script_type_t script; char *language; uint32_t index; uint32_t len; }} script_transliteration_rule_t; script_transliteration_rule_t script_transliteration_rules[] = {{ {rules} }}; char *script_transliterators[] = {{ {transliterators} }} #endif ''' script_transliterators = { 'arabic': {None: ['arabic-latin', 'arabic-latin-bgn'], 'fa': ['persian-latin-bgn'], 'ps': ['pashto-latin-bgn'], }, 'armenian': {None: ['armenian-latin-bgn']}, 'balinese': None, 'bamum': None, 'batak': None, 'bengali': {None: ['bengali-latin']}, 'bopomofo': None, 'braille': None, 'buginese': None, 'buhid': None, 'canadian_aboriginal': {None: ['canadianaboriginal-latin']}, 'cham': None, 'cherokee': None, 'common': {None: ['latin-ascii']}, 'coptic': None, 'cyrillic': {None: ['cyrillic-latin'], 'be': ['belarusian-latin-bgn'], 'ru': ['russian-latin-bgn'], 'bg': ['bulgarian-latin-bgn'], 'kk': ['kazakh-latin-bgn'], 'ky': ['kirghiz-latin-bgn'], 'mk': ['macedonian-latin-bgn'], 'mn': ['mongolian-latin-bgn'], 'sr': ['serbian-latin-bgn'], 'uk': ['ukrainian-latin-bgn'], 'uz': ['uzbek-latin-bgn'], }, 'devanagari': {None: ['devanagari-latin']}, 'ethiopic': None, 'georgian': {None: ['georgian-latin', 'georgian-latin-bgn']}, 'glagolitic': None, 'greek': {None: ['greek-latin', 'greek-latin-bgn', 'greek-latin-ungegn']}, 'gujarati': {None: ['gujarati-latin']}, 'gurmukhi': {None: ['gurmukhi-latin']}, 'han': {None: ['han-latin']}, 'hangul': {None: ['korean-latin-bgn']}, 'hanunoo': None, 'hebrew': {None: ['hebrew-latin', 'hebrew-latin-bgn']}, 'hiragana': {None: ['hiragana-latin']}, 'inherited': None, 'javanese': None, 'kannada': {None: ['kannada-latin']}, 'katakana': {None: ['katakana-latin', 'katakana-latin-bgn']}, 'kayah_li': None, 'khmer': None, 'lao': None, 'latin': {None: ['latin-ascii']}, 'lepcha': None, 'limbu': None, 'lisu': None, 'malayalam': {None: ['malayam-latin']}, 'mandaic': None, 'meetei_mayek': None, 'mongolian': None, 'myanmar': None, 'new_tai_lue': None, 'nko': None, 'ogham': None, 'ol_chiki': None, 'oriya': {None: ['oriya-latin']}, 'phags_pa': None, 'rejang': None, 'runic': None, 'samaritan': None, 'saurashtra': None, 'sinhala': None, 'sundanese': None, 'syloti_nagri': None, 'syriac': None, 'tagalog': None, 'tagbanwa': None, 'tai_le': None, 'tai_tham': None, 'tai_viet': None, 'tamil': {None: ['tamil-latin']}, 'telugu': {None: ['telugu-latin']}, 'thaana': None, 'thai': {None: ['thai-latin']}, 'tibetan': None, 'tifinagh': None, 'unknown': None, 'vai': None, 'yi': None } def write_transliterator_scripts_file(filename): transliterator_rule_template = '''{{{script_type}, {lang}, {start}, {length}}}''' rules = [] all_transliterators = [] index = 0 for script, i in unicode_script_ids.iteritems(): spec = script_transliterators.get(script.lower()) if not spec: continue script_type = 'SCRIPT_{}'.format(script.upper()) for lang, transliterators in spec.iteritems(): lang = 'NULL' if not lang else quote_string(lang) num_transliterators = len(transliterators) rules.append(transliterator_rule_template.format(script_type=script_type, lang=lang, start=index, length=num_transliterators)) for trans in transliterators: all_transliterators.append(quote_string(trans)) index += num_transliterators template = transliterator_script_data_template.format(rules=''', '''.join(rules), transliterators=''', '''.join(all_transliterators)) f = open(filename, 'w') f.write(safe_encode(template)) def write_transliteration_data_file(filename): transforms, steps, rules = get_all_transform_rules() all_transforms = u''', '''.join([u'{{{}}}'.format(u','.join(t)) for t in transforms]) all_steps = u''', '''.join([u'{{{}}}'.format(u','.join(s)) for s in steps]) for r in rules: try: r = u','.join(r) except Exception: print 'Problem with rule' print r all_rules = u''', '''.join([u'{{{}}}'.format(u','.join(r)) for r in rules]) template = transliteration_data_template.format( all_transforms=all_transforms, all_steps=all_steps, all_rules=all_rules, ) f = open(filename, 'w') f.write(safe_encode(template)) TRANSLITERATION_DATA_FILENAME = 'transliteration_data.c' TRANSLITERATION_SCRIPTS_FILENAME = 'transliteration_scripts.h' def main(out_dir): write_transliteration_data_file(os.path.join(out_dir, TRANSLITERATION_DATA_FILENAME)) write_transliterator_scripts_file(os.path.join(out_dir, TRANSLITERATION_SCRIPTS_FILENAME)) if __name__ == '__main__': if len(sys.argv) < 2: print 'Usage: python transliteration_rules.py out_dir' exit(1) main(sys.argv[1])