From be29874f1307dace7bbc036101dc1bcb57b1999a Mon Sep 17 00:00:00 2001
From: Al
Date: Sat, 25 Apr 2015 15:42:21 -0400
Subject: [PATCH] [transliteration] Parser for CLDR transforms to generate
 (simple) C transform rules

---
 scripts/geodata/i18n/transliteration_rules.py | 833 ++++++++++++++++++
 1 file changed, 833 insertions(+)
 create mode 100644 scripts/geodata/i18n/transliteration_rules.py

diff --git a/scripts/geodata/i18n/transliteration_rules.py b/scripts/geodata/i18n/transliteration_rules.py
new file mode 100644
index 00000000..39e1481e
--- /dev/null
+++ b/scripts/geodata/i18n/transliteration_rules.py
@@ -0,0 +1,833 @@
+# -*- coding: utf-8 -*-
+'''
+transliteration_rules.py
+
+Automatically builds rules for transforming other scripts (e.g. Cyrillic, Greek,
+Han, Katakana, Devanagari, etc.) into Latin characters.
+
+Uses XML transforms from the CLDR repository.
+'''
+
+import argparse
+import codecs
+import csv
+import htmlentitydefs
+import itertools
+import os
+import re
+import requests
+import sys
+import time
+import urlparse
+import unicodedata
+
+from collections import defaultdict
+
+from lxml import etree
+
+from scanner import Scanner
+from unicode_scripts import get_chars_by_script
+from unicode_paths import CLDR_DIR
+from geodata.encoding import safe_decode, safe_encode
+
+CLDR_TRANSFORMS_DIR = os.path.join(CLDR_DIR, 'common', 'transforms')
+
+PRE_TRANSFORM = 1
+FORWARD_TRANSFORM = 2
+BACKWARD_TRANSFORM = 3
+BIDIRECTIONAL_TRANSFORM = 4
+
+PRE_TRANSFORM_OP = '::'
+BACKWARD_TRANSFORM_OP = u'←'
+FORWARD_TRANSFORM_OP = u'→'
+BIDIRECTIONAL_TRANSFORM_OP = u'↔'
+
+ASSIGNMENT_OP = '='
+
+PRE_CONTEXT_INDICATOR = '{'
+POST_CONTEXT_INDICATOR = '}'
+
+REVISIT_INDICATOR = '|'
+
+WORD_BOUNDARY_VAR = 'wordBoundary'
+
+EMPTY_TRANSITION = u'\u007f'
+
+EXCLUDE_TRANSLITERATORS = set([
+    'Hangul-Latin',
+    'InterIndic-Latin',
+    'Jamo-Latin',
+])
+
+NFD = 'NFD'
+NFKD = 'NFKD'
+NFC = 'NFC'
+NFKC = 'NFKC'
+
+LOWER = 'lower'
+UPPER = 'upper'
+TITLE = 'title'
+
+UNICODE_NORMALIZATION_TRANSFORMS = set([
+    NFD,
+    NFKD,
+    NFC,
+    NFKC,
+])
+
+unicode_category_aliases = {
+    'letter': 'L',
+    'lower': 'Ll',
+    'lowercase': 'Ll',
+    'lowercaseletter': 'Ll',
+    'upper': 'Lu',
+    'uppercase': 'Lu',
+    'uppercaseletter': 'Lu',
+    'title': 'Lt',
+    'nonspacing mark': 'Mn',
+    'mark': 'M',
+}
+
+unicode_categories = defaultdict(list)
+unicode_general_categories = defaultdict(list)
+unicode_scripts = defaultdict(list)
+
+
+def init_unicode_categories():
+    global unicode_categories, unicode_general_categories, unicode_scripts
+
+    # Index the Basic Multilingual Plane by two-letter Unicode category (Ll, Lu, Mn, ...)
+    for i in xrange(65536):
+        unicode_categories[unicodedata.category(unichr(i))].append(unichr(i))
+
+    # General categories are the first letter of the two-letter category: L, M, N, ...
+    for key in unicode_categories.keys():
+        unicode_general_categories[key[0]].extend(unicode_categories[key])
+
+    script_chars = get_chars_by_script()
+    for i, script in enumerate(script_chars):
+        if script:
+            unicode_scripts[script.lower()].append(unichr(i))
+
+
+RULE = 'RULE'
+TRANSFORM = 'TRANSFORM'
+
+UTF8PROC_TRANSFORMS = {
+    'Any-NFC': NFC,
+    'Any-NFD': NFD,
+    'Any-NFKD': NFKD,
+    'Any-NFKC': NFKC,
+    'Any-Lower': LOWER,
+    'Any-Upper': UPPER,
+    'Any-Title': TITLE,
+}
+
+
+CONTEXT_TYPE_NONE = 'CONTEXT_TYPE_NONE'
+CONTEXT_TYPE_STRING = 'CONTEXT_TYPE_STRING'
+CONTEXT_TYPE_WORD_BOUNDARY = 'CONTEXT_TYPE_WORD_BOUNDARY'
+CONTEXT_TYPE_REGEX = 'CONTEXT_TYPE_REGEX'
+
+all_transforms = set()
+
+pre_transform_full_regex = re.compile('::[\s]*(.*)[\s]*', re.UNICODE)
+pre_transform_regex = re.compile('[\s]*([^\s\(\)]*)[\s]*(?:\(.*\)[\s]*)?', re.UNICODE)
+transform_regex = re.compile(u"(?:[\s]*(?!=[\s])(.*)(?
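+# For reference, the tRule lines being parsed look roughly like the following.
+# These are hypothetical Greek-Latin-style rules shown only to illustrate the
+# operators defined above; the real rules live in the CLDR transform XML files:
+#
+#   :: NFD ;                      pre-transform / filter (PRE_TRANSFORM_OP)
+#   $gammaLike = [ΓΚΞΧγκξχ] ;     variable assignment (ASSIGNMENT_OP)
+#   γ } $gammaLike → n ;          forward rule with a post-context
+#   β ↔ v ;                       bidirectional rule
+#   φ ← ph ;                      backward rule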
+def parse_regex_char_range(token):
+    # token is a range like u'A-Z'; its endpoints are the first and last characters
+    start, end = token[0], token[-1]
+    chars = []
+
+    if ord(end) > ord(start):
+        # Ranges are inclusive
+        chars.extend([unichr(c) for c in range(ord(start), ord(end) + 1)])
+
+    return chars
+
+
+def parse_regex_char_class(c):
+    chars = []
+    orig = c
+    c = c.strip(':')
+    is_negation = False
+    if c.startswith('^'):
+        is_negation = True
+        c = c.strip('^')
+
+    if '=' in c:
+        cat, c = c.split('=')
+        if cat.strip() in ('script', 'sc'):
+            c = c.strip()
+
+    c = unicode_category_aliases.get(c.lower(), c)
+
+    if c in unicode_general_categories:
+        chars = unicode_general_categories[c]
+    elif c in unicode_categories:
+        chars = unicode_categories.get(c)
+    elif c.lower() in unicode_scripts:
+        chars = unicode_scripts[c.lower()]
+    elif c.lower() in unicode_properties:
+        chars = unicode_properties[c.lower()]
+    else:
+        chars = []
+
+    if is_negation:
+        chars = sorted(all_chars - set(chars))
+
+    return chars
+
+
+def parse_regex_char_set(s):
+    '''
+    Given a regex character set, which may look something like:
+
+    [[:Latin:][:Greek:] & [:Ll:]]
+    [A-Za-z_]
+    [ $lowerVowel $upperVowel ]
+
+    Parse into a single, flat character set without the unicode properties,
+    ranges, unions/intersections, etc.
+    '''
+    s = s[1:-1]
+    is_negation = False
+    this_group = set()
+    is_intersection = False
+    is_word_boundary = False
+
+    for token, token_class in char_set_scanner.scan(s):
+        if token_class == CHAR_RANGE:
+            this_char_set = set(parse_regex_char_range(token))
+            this_group |= this_char_set
+        elif token_class == ESCAPED_CHARACTER:
+            token = token.strip('\\')
+            this_group.add(token)
+        elif token_class == SINGLE_QUOTE:
+            this_group.add("'")
+        elif token_class == QUOTED_STRING:
+            this_group.add(token.strip("'"))
+        elif token_class == NEGATION:
+            is_negation = True
+        elif token_class == CHAR_CLASS:
+            this_group |= set(parse_regex_char_class(token))
+        elif token_class == CHAR_SET:
+            # Recursive calls, as performance doesn't matter here and nesting is shallow
+            this_char_set = set(parse_regex_char_set(token))
+            # Shouldn't be complex set expression logic here
+            if is_intersection:
+                this_group &= this_char_set
+            else:
+                this_group |= this_char_set
+        elif token_class == INTERSECTION:
+            is_intersection = True
+        elif token_class == CHARACTER:
+            this_group.add(token)
+        elif token_class == WORD_BOUNDARY:
+            is_word_boundary = True
+
+    if is_negation:
+        this_group = all_chars - this_group
+
+    return sorted(this_group) + (['$'] if is_word_boundary else [])
+
+
+for name, regex_range in unicode_property_regexes:
+    unicode_properties[name] = parse_regex_char_set(regex_range)
+
+
+def get_source_and_target(xml):
+    return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0]
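+# A worked example of the flattening above (illustrative values, assuming the
+# scanner tokenizes ranges, nested sets and '&' as the branches expect):
+#
+#   parse_regex_char_set(u'[A-Ea]')           => [u'A', u'B', u'C', u'D', u'E', u'a']
+#   parse_regex_char_set(u'[[A-E] & [D-H]]')  => [u'D', u'E']
+#   parse_regex_char_set(u'[^[:L:]]')         => every indexed char that is not a letter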
+def get_raw_rules_and_variables(xml):
+    '''
+    Parse tRule nodes from the transform XML
+
+    At this point we only care about lvalue, op and rvalue
+    for parsing forward and two-way transforms.
+
+    Variables are collected in a dictionary in this pass so they can be
+    substituted later
+    '''
+    rules = []
+    variables = {}
+
+    for rule in xml.xpath('*//tRule'):
+        if not rule.text:
+            continue
+        # Strip comments, unescape literal spaces and \u escapes, and drop the
+        # terminating character
+        rule = safe_decode(rule.text.rsplit(COMMENT_CHAR)[0].strip())
+        rule = literal_space_regex.sub(replace_literal_space, rule)
+        rule = escaped_unicode_regex.sub(unescape_unicode_char, rule)
+        rule = rule.rstrip(END_CHAR).strip()
+
+        transform = transform_regex.match(rule)
+        if transform:
+            lvalue, op, rvalue = transform.groups()
+            lvalue = lvalue.strip()
+            rvalue = rvalue.strip()
+
+            if op == FORWARD_TRANSFORM_OP:
+                rules.append((FORWARD_TRANSFORM, (lvalue, rvalue)))
+            elif op == BIDIRECTIONAL_TRANSFORM_OP:
+                rules.append((BIDIRECTIONAL_TRANSFORM, (lvalue, rvalue)))
+            elif op == BACKWARD_TRANSFORM_OP:
+                rules.append((BACKWARD_TRANSFORM, (lvalue, rvalue)))
+            elif op == ASSIGNMENT_OP:
+                var_name = lvalue.lstrip('$')
+                variables[var_name] = rvalue
+        else:
+            pre_transform = pre_transform_full_regex.match(rule)
+            if pre_transform:
+                rules.append((PRE_TRANSFORM, pre_transform.group(1)))
+
+    return rules, variables
+
+
+CHAR_CLASSES = set([
+    ESCAPED_CHARACTER,
+    CHAR_CLASS,
+    QUOTED_STRING,
+    CHARACTER,
+    GROUP_REF,
+])
+
+
+def char_permutations(s):
+    '''
+    char_permutations
+
+    Parses the lvalue or rvalue of a transform rule into
+    a list of character permutations, in addition to keeping
+    track of revisits and regex groups
+    '''
+    char_types = []
+    move = 0
+    in_revisit = False
+
+    in_group = False
+    last_token_group_start = False
+
+    start_group = 0
+    end_group = 0
+
+    open_brackets = 0
+    current_set = []
+
+    groups = []
+
+    for token, token_type in transform_scanner.scan(s):
+        # While inside a character set, accumulate tokens verbatim and parse
+        # the whole set once the outermost bracket closes
+        if open_brackets > 0 and token_type not in (OPEN_SET, CLOSE_SET):
+            current_set.append(token)
+            continue
+
+        if token_type == ESCAPED_CHARACTER:
+            char_types.append([token.strip('\\')])
+        elif token_type == OPEN_GROUP:
+            in_group = True
+            last_token_group_start = True
+        elif token_type == CLOSE_GROUP:
+            in_group = False
+            end_group = len(char_types)
+            groups.append((start_group, end_group))
+        elif token_type == OPEN_SET:
+            open_brackets += 1
+            current_set.append(token)
+        elif token_type == CLOSE_SET:
+            open_brackets -= 1
+            current_set.append(token)
+            if open_brackets == 0:
+                char_types.append(parse_regex_char_set(u''.join(current_set)))
+                current_set = []
+        elif token_type == QUOTED_STRING:
+            token = token.strip("'")
+            for c in token:
+                char_types.append([c])
+        elif token_type == GROUP_REF:
+            char_types.append([token])
+        elif token_type == REVISIT:
+            in_revisit = True
+        elif token_type == REPEAT:
+            char_types.append([STAR])
+        elif token_type == PLUS:
+            char_types.append([PLUS])
+        elif token_type == OPTIONAL:
+            char_types[-1].append('')
+        elif token_type == HTML_ENTITY:
+            char_types.append([replace_html_entity(token)])
+        elif token_type == CHARACTER:
+            char_types.append([token])
+
+        if in_group and last_token_group_start:
+            start_group = len(char_types)
+            last_token_group_start = False
+
+        if in_revisit and token_type in CHAR_CLASSES:
+            move += 1
+
+    return char_types, move, groups
+
+
+def quote_string(s):
+    return u'"{}"'.format(s.replace('"', '\\"'))
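+# Worked example (illustrative, assuming the scanner emits CHARACTER and
+# OPEN_SET/CLOSE_SET tokens as the branches above expect):
+#
+#   char_permutations(u'ab[cd]') => ([[u'a'], [u'b'], [u'c', u'd']], 0, [])
+#
+# Each position holds the characters permitted there; taking the product of
+# the inner lists enumerates the concrete strings "abc" and "abd".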
+def char_types_string(char_types):
+    '''
+    Transforms the char_permutations output into a string
+    suitable for simple parsing in C (characters and character sets only,
+    no variables, unicode character properties or unions/intersections)
+    '''
+    ret = []
+
+    for chars in char_types:
+        template = u'{}' if len(chars) == 1 else u'[{}]'
+        norm = []
+        for c in chars:
+            # Escape literal regex metacharacters and translate the parser's
+            # sentinel values back into regex operators
+            if c == '[':
+                c = '\['
+            elif c == '':
+                c = EMPTY_TRANSITION
+            elif c == '*':
+                c = '\*'
+            elif c == '+':
+                c = '\+'
+            elif c == PLUS:
+                c = '+'
+            elif c == STAR:
+                c = '*'
+
+            norm.append(c)
+
+        ret.append(template.format(u''.join(norm)))
+
+    return u''.join(ret)
+
+
+def format_groups(char_types, groups):
+    group_regex = []
+    last_end = 0
+    for start, end in groups:
+        group_regex.append(char_types_string(char_types[last_end:start]))
+        group_regex.append(u'(')
+        group_regex.append(char_types_string(char_types[start:end + 1]))
+        group_regex.append(u')')
+        last_end = end
+    group_regex.append(char_types_string(char_types[last_end + 1:]))
+    return u''.join(group_regex)
+
+
+charset_regex = re.compile(r'(?
+
+transliteration_data_template = u'''#include
+
+#include "transliteration_rule.h"
+
+transliteration_rule_source_t rules_source[] = {{
+    {all_rules}
+}};
+
+transliteration_step_source_t steps_source[] = {{
+    {all_steps}
+}};
+
+transliterator_source_t transliterators_source[] = {{
+    {all_transforms}
+}};
+
+'''
+
+
+def create_transliterator(name, internal, steps):
+    return transliterator_template.format(name=name, internal=int(internal), num_steps=len(steps))
+
+
+TRANSLITERATION_DATA_FILENAME = 'transliteration_data.c'
+
+
+def main(out_dir):
+    f = open(os.path.join(out_dir, TRANSLITERATION_DATA_FILENAME), 'w')
+    transforms, steps, rules = get_all_transform_rules()
+
+    all_transforms = u''',
+    '''.join([u'{{{}}}'.format(u','.join(t)) for t in transforms])
+
+    all_steps = u''',
+    '''.join([u'{{{}}}'.format(u','.join(s)) for s in steps])
+
+    all_rules = u''',
+    '''.join([u'{{{}}}'.format(u','.join(r)) for r in rules])
+
+    template = transliteration_data_template.format(
+        all_transforms=all_transforms,
+        all_steps=all_steps,
+        all_rules=all_rules
+    )
+
+    f.write(safe_encode(template))
+    f.close()
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print 'Usage: python transliteration_rules.py out_dir'
+        exit(1)
+    main(sys.argv[1])
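+# Example usage, assuming the CLDR transform XML files are available under
+# CLDR_TRANSFORMS_DIR:
+#
+#   python transliteration_rules.py src/
+#
+# writes src/transliteration_data.c, whose rules_source, steps_source and
+# transliterators_source arrays are compiled into the C transliterator.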