# -*- coding: utf-8 -*-
'''
transliteration.py

Automatically builds rules for transforming other scripts
(e.g. Cyrillic, Greek, Han, Katakana, Devanagari, etc.) into Latin characters.

Uses XML transforms from the CLDR repository.

NOTE(review): this is Python 2 code (print statement, xrange/unichr,
htmlentitydefs, urlparse). The file as recovered here has two regions of
extraction damage, marked with NOTE(review) comments below; recover those
regions from version control before running.
'''

import argparse
import codecs
import csv
import htmlentitydefs
import itertools
import os
import re
import requests
import sys
import time
import urlparse
import unicodedata

from collections import defaultdict
from lxml import etree

from scanner import Scanner
from unicode_scripts import get_chars_by_script
from unicode_paths import CLDR_DIR
from geodata.encoding import safe_decode, safe_encode

# Location of the CLDR XML transform definitions consumed by this script
CLDR_TRANSFORMS_DIR = os.path.join(CLDR_DIR, 'common', 'transforms')

# Rule-type tags attached to parsed rules (see get_raw_rules_and_variables)
PRE_TRANSFORM = 1
FORWARD_TRANSFORM = 2
BACKWARD_TRANSFORM = 3
BIDIRECTIONAL_TRANSFORM = 4

# Operators used in CLDR/ICU transform rule syntax
PRE_TRANSFORM_OP = '::'
BACKWARD_TRANSFORM_OP = u'←'
FORWARD_TRANSFORM_OP = u'→'
BIDIRECTIONAL_TRANSFORM_OP = u'↔'

ASSIGNMENT_OP = '='

# { and } delimit pre/post context in ICU rule syntax
PRE_CONTEXT_INDICATOR = '{'
POST_CONTEXT_INDICATOR = '}'

# | marks the revisit (cursor) position in an ICU rule's rvalue
REVISIT_INDICATOR = '|'

WORD_BOUNDARY_VAR = 'wordBoundary'

# Sentinel character used to encode an empty transition for the C consumer
EMPTY_TRANSITION = u'\u007f'

# Transliterators that are deliberately skipped
EXCLUDE_TRANSLITERATORS = set([
    'Hangul-Latin',
    'InterIndic-Latin',
    'Jamo-Latin',
])

NFD = 'NFD'
NFKD = 'NFKD'
NFC = 'NFC'
NFKC = 'NFKC'
LOWER = 'lower'
UPPER = 'upper'
TITLE = 'title'

UNICODE_NORMALIZATION_TRANSFORMS = set([
    NFD,
    NFKD,
    NFC,
    NFKC,
])

# Maps spelled-out CLDR category names to two-letter Unicode general
# category codes (as returned by unicodedata.category)
unicode_category_aliases = {
    'letter': 'L',
    'lower': 'Ll',
    'lowercase': 'Ll',
    'lowercaseletter': 'Ll',
    'upper': 'Lu',
    'uppercase': 'Lu',
    'uppercaseletter': 'Lu',
    'title': 'Lt',
    'nonspacing mark': 'Mn',
    'mark': 'M',
}

# Populated by init_unicode_categories():
# unicode_categories: full category code (e.g. 'Ll') -> list of characters
# unicode_general_categories: first letter (e.g. 'L') -> list of characters
# unicode_scripts: lowercased script name -> list of characters
unicode_categories = defaultdict(list)
unicode_general_categories = defaultdict(list)
unicode_scripts = defaultdict(list)


def init_unicode_categories():
    '''Build the category/script lookup tables for the BMP (code points
    0-65535 only — supplementary planes are not covered).'''
    global unicode_categories, unicode_general_categories, unicode_scripts

    for i in xrange(65536):
        unicode_categories[unicodedata.category(unichr(i))].append(unichr(i))

    for key in unicode_categories.keys():
        # key[0] collapses e.g. 'Ll'/'Lu'/'Lt' into the general category 'L'
        unicode_general_categories[key[0]].extend(unicode_categories[key])

    # presumably get_chars_by_script() returns a code-point-indexed sequence
    # of script names — TODO confirm against unicode_scripts module
    script_chars = get_chars_by_script()
    for i, script in enumerate(script_chars):
        if script:
            unicode_scripts[script.lower()].append(unichr(i))


RULE = 'RULE'
TRANSFORM = 'TRANSFORM'

# Transforms that the C library implements natively via utf8proc
UTF8PROC_TRANSFORMS = {
    'Any-NFC': NFC,
    'Any-NFD': NFD,
    'Any-NFKD': NFKD,
    'Any-NFKC': NFKC,
    'Any-Lower': LOWER,
    'Any-Upper': UPPER,
    'Any-Title': TITLE,
}

CONTEXT_TYPE_NONE = 'CONTEXT_TYPE_NONE'
CONTEXT_TYPE_STRING = 'CONTEXT_TYPE_STRING'
CONTEXT_TYPE_WORD_BOUNDARY = 'CONTEXT_TYPE_WORD_BOUNDARY'
CONTEXT_TYPE_REGEX = 'CONTEXT_TYPE_REGEX'

all_transforms = set()

# Matches a ':: TransformName ;' style pre-transform line, capturing the body
pre_transform_full_regex = re.compile('::[\s]*(.*)[\s]*', re.UNICODE)
pre_transform_regex = re.compile('[\s]*([^\s\(\)]*)[\s]*(?:\(.*\)[\s]*)?', re.UNICODE)

# NOTE(review): extraction damage — the text from here through "return chars"
# below is truncated and fused. transform_regex's pattern literal is cut off
# mid-string, and everything that originally sat between it and the tail of
# parse_regex_char_range is missing: the Scanner token definitions referenced
# later (CHAR_RANGE, ESCAPED_CHARACTER, OPEN_SET/CLOSE_SET, OPEN_GROUP,
# GROUP_REF, REVISIT, REPEAT, PLUS, STAR, OPTIONAL, HTML_ENTITY, CHARACTER,
# WORD_BOUNDARY, INTERSECTION, NEGATION, ...), transform_scanner and
# char_set_scanner, COMMENT_CHAR/END_CHAR, literal_space_regex,
# escaped_unicode_regex and their replacement helpers, replace_html_entity,
# unicode_property_regexes, unicode_properties, all_chars, and the head of
# parse_regex_char_range. Recover this region from version control.
transform_regex = re.compile(u"(?:[\s]*(?!=[\s])(.*)(? ord(start): # Ranges are inclusive chars.extend([unichr(c) for c in range(ord(start), ord(end) + 1)]) return chars


def parse_regex_char_class(c):
    '''Expand a [:...:]-style character class (general category, script name
    or named property) into a flat list of characters, honoring a leading ^
    as negation against all_chars.'''
    chars = []
    orig = c
    c = c.strip(':')
    is_negation = False
    if c.startswith('^'):
        is_negation = True
        c = c.strip('^')
    if '=' in c:
        # Forms like [:script=Latin:] / [:sc=Latin:]
        cat, c = c.split('=')
        if cat.strip() in ('script', 'sc'):
            c = c.strip()
    c = unicode_category_aliases.get(c.lower(), c)

    if c in unicode_general_categories:
        chars = unicode_general_categories[c]
    elif c in unicode_categories:
        chars = unicode_categories.get(c)
    elif c.lower() in unicode_scripts:
        chars = unicode_scripts[c.lower()]
    elif c.lower() in unicode_properties:
        # unicode_properties is defined in the missing region above
        chars = unicode_properties[c.lower()]
    else:
        chars = []

    if is_negation:
        chars = sorted(all_chars - set(chars))

    return chars


def parse_regex_char_set(s):
    '''
    Given a regex character set, which may look something like:

    [[:Latin:][:Greek:] & [:Ll:]]
    [A-Za-z_]
    [ $lowerVowel $upperVowel ]

    Parse into a single, flat character set without the unicode properties,
    ranges, unions/intersections, etc.
    '''
    s = s[1:-1]  # strip the outer [ ]
    is_negation = False
    this_group = set()
    is_intersection = False
    is_word_boundary = False

    for token, token_class in char_set_scanner.scan(s):
        if token_class == CHAR_RANGE:
            this_char_set = set(parse_regex_char_range(token))
            this_group |= this_char_set
        elif token_class == ESCAPED_CHARACTER:
            token = token.strip('\\')
            this_group.add(token)
        elif token_class == SINGLE_QUOTE:
            this_group.add("'")
        elif token_class == QUOTED_STRING:
            this_group.add(token.strip("'"))
        elif token_class == NEGATION:
            is_negation = True
        elif token_class == CHAR_CLASS:
            this_group |= set(parse_regex_char_class(token))
        elif token_class == CHAR_SET:
            # Recursive calls, as performance doesn't matter here and nesting is shallow
            this_char_set = set(parse_regex_char_set(token))
            # Shouldn't be complex set expression logic here
            if is_intersection:
                this_group &= this_char_set
            else:
                this_group |= this_char_set
        elif token_class == INTERSECTION:
            is_intersection = True
        elif token_class == CHARACTER:
            this_group.add(token)
        elif token_class == WORD_BOUNDARY:
            is_word_boundary = True

    if is_negation:
        this_group = all_chars - this_group

    # '$' is appended (not sorted in) so the C side can treat it as a
    # word-boundary marker rather than a literal character
    return sorted(this_group) + (['$'] if is_word_boundary else [])


# Expand the named property regexes (defined in the missing region above)
# into flat character lists
for name, regex_range in unicode_property_regexes:
    unicode_properties[name] = parse_regex_char_set(regex_range)


def get_source_and_target(xml):
    '''Return (source, target) script names from a transform XML root.'''
    return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0]


def get_raw_rules_and_variables(xml):
    '''
    Parse tRule nodes from the transform XML

    At this point we only care about lvalue, op and rvalue
    for parsing forward and two-way transforms.

    Variables are collected in a dictionary in this pass so they
    can be substituted later
    '''
    rules = []
    variables = {}

    for rule in xml.xpath('*//tRule'):
        if not rule.text:
            continue
        # Strip trailing comments, unescape literal spaces and \uXXXX escapes
        # (COMMENT_CHAR/END_CHAR and the two regexes come from the missing
        # region above)
        rule = safe_decode(rule.text.rsplit(COMMENT_CHAR)[0].strip())
        rule = literal_space_regex.sub(replace_literal_space, rule)
        rule = escaped_unicode_regex.sub(unescape_unicode_char, rule)
        rule = rule.rstrip(END_CHAR).strip()

        transform = transform_regex.match(rule)

        if transform:
            lvalue, op, rvalue = transform.groups()
            lvalue = lvalue.strip()
            rvalue = rvalue.strip()

            if op == FORWARD_TRANSFORM_OP:
                rules.append((FORWARD_TRANSFORM, (lvalue, rvalue)))
            elif op == BIDIRECTIONAL_TRANSFORM_OP:
                rules.append((BIDIRECTIONAL_TRANSFORM, (lvalue, rvalue)))
            elif op == BACKWARD_TRANSFORM_OP:
                rules.append((BACKWARD_TRANSFORM, (lvalue, rvalue)))
            elif op == ASSIGNMENT_OP:
                var_name = lvalue.lstrip('$')
                variables[var_name] = rvalue
        else:
            # Not an lvalue-op-rvalue rule; try ':: Something' pre-transform
            pre_transform = pre_transform_full_regex.match(rule)
            if pre_transform:
                rules.append((PRE_TRANSFORM, pre_transform.group(1)))

    return rules, variables


# Token types that consume a character position (used for revisit tracking)
CHAR_CLASSES = set([
    ESCAPED_CHARACTER,
    CHAR_CLASS,
    QUOTED_STRING,
    CHARACTER,
    GROUP_REF,
])


def char_permutations(s):
    '''
    char_permutations

    Parses the lvalue or rvalue of a transform rule into
    a list of character permutations, in addition to keeping
    track of revisits and regex groups

    Returns (char_types, move, groups) where char_types is a list of
    per-position alternative-character lists, move is the revisit count,
    and groups is a list of (start, end) index pairs into char_types.
    '''
    char_types = []
    move = 0
    in_revisit = False

    in_group = False
    last_token_group_start = False
    start_group = 0
    end_group = 0

    open_brackets = 0
    current_set = []

    groups = []

    for token, token_type in transform_scanner.scan(s):
        # While inside a [...] set, accumulate raw tokens for
        # parse_regex_char_set rather than interpreting them here
        if open_brackets > 0 and token_type not in (OPEN_SET, CLOSE_SET):
            current_set.append(token)
            continue

        if token_type == ESCAPED_CHARACTER:
            char_types.append([token.strip('\\')])
        elif token_type == OPEN_GROUP:
            in_group = True
            last_token_group_start = True
        elif token_type == CLOSE_GROUP:
            in_group = False
            end_group = len(char_types)
            groups.append((start_group, end_group))
        elif token_type == OPEN_SET:
            open_brackets += 1
            current_set.append(token)
        elif token_type == CLOSE_SET:
            open_brackets -= 1
            current_set.append(token)
            if open_brackets == 0:
                # Outermost set closed: expand it to a flat character list
                char_types.append(parse_regex_char_set(u''.join(current_set)))
                current_set = []
        elif token_type == QUOTED_STRING:
            token = token.strip("'")
            for c in token:
                char_types.append([c])
        elif token_type == GROUP_REF:
            char_types.append([token])
        elif token_type == REVISIT:
            in_revisit = True
        elif token_type == REPEAT:
            char_types.append([STAR])
        elif token_type == PLUS:
            char_types.append([PLUS])
        elif token_type == OPTIONAL:
            # Optional previous char: add the empty alternative
            char_types[-1].append('')
        # NOTE(review): duplicate branch — REVISIT is already handled above,
        # so this arm is unreachable dead code.
        elif token_type == REVISIT:
            in_revisit = True
        elif token_type == HTML_ENTITY:
            char_types.append([replace_html_entity(token)])
        elif token_type == CHARACTER:
            char_types.append([token])

        if in_group and last_token_group_start:
            start_group = len(char_types)
            last_token_group_start = False

        if in_revisit and token_type in CHAR_CLASSES:
            move += 1

    return char_types, move, groups
    # NOTE(review): unreachable — leftover from an earlier version that
    # returned expanded permutations. Should be deleted.
    return list(itertools.product(char_types)), move


def quote_string(s):
    '''Wrap s in double quotes, escaping embedded double quotes, for
    emission into the generated C source.'''
    return u'"{}"'.format(s.replace('"', '\\"'))


def char_types_string(char_types):
    '''
    Transforms the char_permutations output into a string
    suitable for simple parsing in C (characters and character
    sets only, no variables, unicode character properties or
    unions/intersections)
    '''
    ret = []
    for chars in char_types:
        # Single alternative -> bare char; multiple -> [..] set
        template = u'{}' if len(chars) == 1 else u'[{}]'
        norm = []
        for c in chars:
            # Escape characters that are special to the C-side parser;
            # translate the PLUS/STAR sentinels back into literal +/*
            if c == '[':
                c = '\['
            elif c == '':
                c = EMPTY_TRANSITION
            elif c == '*':
                c = '\*'
            elif c == '+':
                c = '\+'
            elif c == PLUS:
                c = '+'
            elif c == STAR:
                c = '*'
            norm.append(c)
        ret.append(template.format(u''.join(norm)))

    return u''.join(ret)


def format_groups(char_types, groups):
    '''Render char_types as a regex-like string with (...) capture groups
    inserted at the (start, end) index pairs in groups.'''
    group_regex = []
    last_end = 0
    for start, end in groups:
        group_regex.append(char_types_string(char_types[last_end:start]))
        group_regex.append(u'(')
        group_regex.append(char_types_string(char_types[start:end + 1]))
        group_regex.append(u')')
        last_end = end
    group_regex.append(char_types_string(char_types[last_end + 1:]))
    return u''.join(group_regex)


# NOTE(review): extraction damage — charset_regex's pattern literal is cut off
# mid-string and a large region is missing between here and the tail of the C
# template below: the rest of charset_regex, the rule/step/transliterator
# formatting helpers, get_all_transform_rules() (called by main),
# transliterator_template (used by create_transliterator), and the head of the
# transliteration_data_template assignment whose closing quotes survive below.
# Recover from version control.
charset_regex = re.compile(r'(? #include #include "transliteration_rule.h" transliteration_rule_source_t rules_source[] = {{ {all_rules} }}; transliteration_step_source_t steps_source[] = {{ {all_steps} }}; transliterator_source_t transliterators_source[] = {{ {all_transforms} }}; '''


def create_transliterator(name, internal, steps):
    '''Format one transliterator_source_t initializer. Relies on
    transliterator_template from the missing region above.'''
    return transliterator_template.format(name=name, internal=int(internal), num_steps=len(steps))


# Name of the generated C source file
TRANSLITERATION_DATA_FILENAME = 'transliteration_data.c'


def main(out_dir):
    '''Generate transliteration_data.c in out_dir from the CLDR transforms.

    NOTE(review): the output file handle is never closed or flushed —
    should use a `with` block (or f.close()).
    '''
    f = open(os.path.join(out_dir, TRANSLITERATION_DATA_FILENAME), 'w')
    transforms, steps, rules = get_all_transform_rules()

    # Join each record's fields with ',' inside {...}, records separated by
    # a ',' + newline-ish separator embedded in the triple-quoted string
    all_transforms = u''', '''.join([u'{{{}}}'.format(u','.join(t)) for t in transforms])

    all_steps = u''', '''.join([u'{{{}}}'.format(u','.join(s)) for s in steps])

    all_rules = u''', '''.join([u'{{{}}}'.format(u','.join(r)) for r in rules])

    template = transliteration_data_template.format(
        all_transforms=all_transforms,
        all_steps=all_steps,
        all_rules=all_rules
    )

    f.write(safe_encode(template))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'Usage: python transliteration_rules.py out_dir'
        exit(1)
    main(sys.argv[1])