From 3814af52ec0c8a52c21f2a18f61dea6c7eb63cec Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 12 May 2015 12:10:15 -0400 Subject: [PATCH] [transliteration] Python script now implements the full TR-35 spec, including filter rules, which cuts down significantly on the size of the data file and complexity of generating the trie --- scripts/geodata/i18n/transliteration_rules.py | 216 +++++++++++++----- 1 file changed, 162 insertions(+), 54 deletions(-) diff --git a/scripts/geodata/i18n/transliteration_rules.py b/scripts/geodata/i18n/transliteration_rules.py index 8b4a1ebb..a20571b0 100644 --- a/scripts/geodata/i18n/transliteration_rules.py +++ b/scripts/geodata/i18n/transliteration_rules.py @@ -27,7 +27,7 @@ from collections import defaultdict from lxml import etree from scanner import Scanner -from unicode_scripts import get_chars_by_script +from unicode_scripts import * from unicode_paths import CLDR_DIR from geodata.encoding import safe_decode, safe_encode @@ -39,9 +39,9 @@ BACKWARD_TRANSFORM = 3 BIDIRECTIONAL_TRANSFORM = 4 PRE_TRANSFORM_OP = '::' -BACKWARD_TRANSFORM_OP = u'←' -FORWARD_TRANSFORM_OP = u'→' -BIDIRECTIONAL_TRANSFORM_OP = u'↔' +BACKWARD_TRANSFORM_OPS = set([u'←', u'<']) +FORWARD_TRANSFORM_OPS = set([u'→', u'>']) +BIDIRECTIONAL_TRANSFORM_OPS = set([u'↔', u'<>']) ASSIGNMENT_OP = '=' @@ -58,6 +58,18 @@ word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$')) WORD_BOUNDARY_CHAR = u'\u0001' EMPTY_TRANSITION = u'\u0004' +NAMESPACE_SEPARATOR_CHAR = u"|" + +WORD_BOUNDARY_CHAR = u"\x01" +PRE_CONTEXT_CHAR = u"\x02" +POST_CONTEXT_CHAR = u"\x03" +EMPTY_TRANSITION_CHAR = u"\x04" +REPEAT_ZERO_CHAR = u"\x05" +REPEAT_ONE_CHAR = u"\x06" +BEGIN_SET_CHAR = u"\x07" +END_SET_CHAR = u"\x08" +GROUP_INDICATOR_CHAR = u"\x09" + EXCLUDE_TRANSLITERATORS = set([ 'Hangul-Latin', 'InterIndic-Latin', @@ -94,27 +106,74 @@ unicode_category_aliases = { } unicode_categories = defaultdict(list) +unicode_blocks = defaultdict(list) +unicode_combining_classes = defaultdict(list) unicode_general_categories = defaultdict(list) unicode_scripts = defaultdict(list) +unicode_properties = {} + +unicode_blocks = {} +unicode_category_aliases = {} +unicode_property_aliases = {} +unicode_property_value_aliases = {} +unicode_word_breaks = {} + +COMBINING_CLASS_PROP = 'canonical_combining_class' +BLOCK_PROP = 'block' +GENERAL_CATEGORY_PROP = 'general_category' +SCRIPT_PROP = 'script' +WORD_BREAK_PROP = 'word_break' + + +class TransliterationParseError(Exception): + pass def init_unicode_categories(): - global unicode_categories, unicode_general_categories, unicode_scripts + global unicode_categories, unicode_general_categories, unicode_scripts, unicode_category_aliases + global unicode_blocks, unicode_combining_classes, unicode_properties, unicode_property_aliases + global unicode_property_value_aliases, unicode_scripts, unicode_word_breaks for i in xrange(65536): unicode_categories[unicodedata.category(unichr(i))].append(unichr(i)) + unicode_combining_classes[str(unicodedata.combining(unichr(i)))].append(unichr(i)) + + unicode_categories = dict(unicode_categories) + unicode_combining_classes = dict(unicode_combining_classes) for key in unicode_categories.keys(): unicode_general_categories[key[0]].extend(unicode_categories[key]) + unicode_general_categories = dict(unicode_general_categories) + script_chars = get_chars_by_script() for i, script in enumerate(script_chars): if script: unicode_scripts[script.lower()].append(unichr(i)) + unicode_scripts = dict(unicode_scripts) + + unicode_blocks.update(get_unicode_blocks()) + unicode_properties.update(get_unicode_properties()) + unicode_property_aliases.update(get_property_aliases()) + + unicode_word_breaks.update(get_word_break_properties()) + + for key, value in get_property_value_aliases().iteritems(): + key = unicode_property_aliases.get(key, key) + if key == GENERAL_CATEGORY_PROP: + for k, v in value.iteritems(): + k = k.lower() + unicode_category_aliases[k] = v + if '_' in k: + unicode_category_aliases[k.replace('_', '')] = v + + unicode_property_value_aliases[key] = value + RULE = 'RULE' TRANSFORM = 'TRANSFORM' +FILTER = 'FILTER' UTF8PROC_TRANSFORMS = { 'Any-NFC': NFC, @@ -136,7 +195,7 @@ all_transforms = set() pre_transform_full_regex = re.compile('::[\s]*(.*)[\s]*', re.UNICODE) pre_transform_regex = re.compile('[\s]*([^\s\(\)]*)[\s]*(?:\(.*\)[\s]*)?', re.UNICODE) -transform_regex = re.compile(u"(?:[\s]*(?!=[\s])(.*)(?)|[←<→>↔=])(?:[\s]*(?!=[\s])(.*)(? ord(start): # Ranges are inclusive chars.extend([unichr(c) for c in range(ord(start), ord(end) + 1)]) @@ -295,37 +357,62 @@ def parse_regex_char_range(regex): return chars -def parse_regex_char_class(c): +def parse_regex_char_class(c, current_filter=all_chars): chars = [] orig = c - c = c.strip(':') + if c.startswith('\\p'): + c = c.split('{')[-1].split('}')[0] + + c = c.strip(': ') is_negation = False if c.startswith('^'): is_negation = True c = c.strip('^') if '=' in c: - cat, c = c.split('=') - if cat.strip() in ('script', 'sc'): - c = c.strip() + prop, value = c.split('=') + prop = unicode_property_aliases.get(prop.lower(), prop) - c = unicode_category_aliases.get(c.lower(), c) + value = unicode_property_value_aliases.get(prop.lower(), {}).get(value, value) - if c in unicode_general_categories: - chars = unicode_general_categories[c] - elif c in unicode_categories: - chars = unicode_categories.get(c) - elif c.lower() in unicode_scripts: - chars = unicode_scripts[c.lower()] - elif c.lower() in unicode_properties: - chars = unicode_properties[c.lower()] + if prop == COMBINING_CLASS_PROP: + chars = unicode_combining_classes[value] + elif prop == GENERAL_CATEGORY_PROP: + chars = unicode_categories.get(value, unicode_general_categories[value]) + elif prop == BLOCK_PROP: + chars = unicode_blocks[value.lower()] + elif prop == SCRIPT_PROP: + chars = unicode_scripts[value.lower()] + elif prop == WORD_BREAK_PROP: + chars = unicode_word_breaks[value] + else: + raise TransliterationParseError(c) else: - chars = [] + c = c.replace('-', '_').replace(' ', '_') + + if c.lower() in unicode_property_aliases: + c = unicode_property_aliases[c.lower()] + elif c.lower() in unicode_category_aliases: + c = unicode_category_aliases[c.lower()] + + if c in unicode_general_categories: + chars = unicode_general_categories[c] + elif c in unicode_categories: + chars = unicode_categories[c] + elif c.lower() in unicode_properties: + chars = unicode_properties[c.lower()] + + elif c.lower() in unicode_scripts: + chars = unicode_scripts[c.lower()] + elif c.lower() in unicode_properties: + chars = unicode_properties[c.lower()] + else: + raise TransliterationParseError(c) if is_negation: - chars = all_chars - set(chars) + chars = current_filter - set(chars) - return sorted(set(chars) - control_chars) + return sorted((set(chars) & current_filter) - control_chars) def parse_balanced_sets(s): @@ -355,7 +442,7 @@ def parse_balanced_sets(s): skip = True -def parse_regex_char_set(s): +def parse_regex_char_set(s, current_filter=all_chars): ''' Given a regex character set, which may look something like: @@ -389,11 +476,11 @@ def parse_regex_char_set(s): this_group.add(token.strip("'")) elif token_class == NEGATION: is_negation = True - elif token_class == CHAR_CLASS: - this_group |= set(parse_regex_char_class(token)) + elif token_class in (CHAR_CLASS, CHAR_CLASS_PCRE): + this_group |= set(parse_regex_char_class(token, current_filter=current_filter)) elif token_class in (CHAR_SET, CHAR_MULTI_SET): # Recursive calls, as performance doesn't matter here and nesting is shallow - this_char_set = set(parse_regex_char_set(token)) + this_char_set = set(parse_regex_char_set(token, current_filter=current_filter)) if is_intersection: this_group &= this_char_set is_intersection = False @@ -412,9 +499,9 @@ def parse_regex_char_set(s): is_word_boundary = True if is_negation: - this_group = all_chars - this_group + this_group = current_filter - this_group - return sorted(this_group - control_chars) + (['$'] if is_word_boundary else []) + return sorted((this_group & current_filter) - control_chars) + ([WORD_BOUNDARY_CHAR] if is_word_boundary else []) for name, regex_range in unicode_property_regexes: @@ -437,33 +524,48 @@ def get_raw_rules_and_variables(xml): rules = [] variables = {} + in_compound_rule = False + compound_rule = [] + for rule in xml.xpath('*//tRule'): if not rule.text: continue + rule = safe_decode(rule.text.rsplit(COMMENT_CHAR)[0].strip()) rule = literal_space_regex.sub(replace_literal_space, rule) rule = escaped_unicode_regex.sub(unescape_unicode_char, rule) rule = rule.rstrip(END_CHAR).strip() + if rule.strip().endswith('\\'): + compound_rule.append(rule.rstrip('\\')) + continue + elif in_compound_rule: + compound_rule.append(rule) + rule = u''.join(compound_rule) + in_compound_rule = False + transform = transform_regex.match(rule) - if transform: + pre_transform = pre_transform_full_regex.match(rule) + + if pre_transform: + rules.append((PRE_TRANSFORM, pre_transform.group(1))) + + elif transform: lvalue, op, rvalue = transform.groups() lvalue = lvalue.strip() rvalue = rvalue.strip() - if op == FORWARD_TRANSFORM_OP: + if op in FORWARD_TRANSFORM_OPS: rules.append((FORWARD_TRANSFORM, (lvalue, rvalue))) - elif op == BIDIRECTIONAL_TRANSFORM_OP: + elif op in BIDIRECTIONAL_TRANSFORM_OPS: rules.append((BIDIRECTIONAL_TRANSFORM, (lvalue, rvalue))) - elif op == BACKWARD_TRANSFORM_OP: + elif op in BACKWARD_TRANSFORM_OPS: rules.append((BACKWARD_TRANSFORM, (lvalue, rvalue))) elif op == ASSIGNMENT_OP: var_name = lvalue.lstrip('$') variables[var_name] = rvalue else: - pre_transform = pre_transform_full_regex.match(rule) - if pre_transform: - rules.append((PRE_TRANSFORM, pre_transform.group(1))) + print 'non-rule', rule, get_source_and_target(xml) return rules, variables @@ -476,7 +578,7 @@ CHAR_CLASSES = set([ ]) -def char_permutations(s): +def char_permutations(s, current_filter=all_chars): ''' char_permutations @@ -520,22 +622,22 @@ def char_permutations(s): open_brackets -= 1 current_set.append(token) if open_brackets == 0: - char_types.append(parse_regex_char_set(u''.join(current_set))) + char_types.append(parse_regex_char_set(u''.join(current_set), current_filter=current_filter)) current_set = [] elif token_type == QUOTED_STRING: token = token.strip("'") for c in token: char_types.append([c]) elif token_type == GROUP_REF: - char_types.append([token]) + char_types.append([token.replace('$', GROUP_INDICATOR_CHAR)]) elif token_type == REVISIT: in_revisit = True elif token_type == REPEAT: - char_types.append([STAR]) + char_types.append([REPEAT_ZERO_CHAR]) elif token_type == PLUS: - char_types.append([PLUS]) + char_types.append([REPEAT_ONE_CHAR]) elif token_type == OPTIONAL: - char_types[-1].append('') + char_types[-1].append(EMPTY_TRANSITION_CHAR) elif token_type == REVISIT: in_revisit = True elif token_type == HTML_ENTITY: @@ -554,7 +656,6 @@ def char_permutations(s): return list(itertools.product(char_types)), move - string_replacements = { u'[': u'\[', u']': u'\]', @@ -564,13 +665,10 @@ string_replacements = { u'}': u'\{', u'$': u'\$', u'^': u'\^', + u'-': u'\-', u'\\': u'\\\\', - u'\u0000': '', - u'': EMPTY_TRANSITION, u'*': u'\*', u'+': u'\+', - PLUS: u'+', - STAR: u'*', } escape_sequence_long_regex = re.compile(r'(\\x[0-9a-f]{2})([0-9a-f])', re.I) @@ -722,6 +820,8 @@ def parse_transform_rules(xml): variables[WORD_BOUNDARY_VAR_NAME] = WORD_BOUNDARY_VAR + current_filter = all_chars + for rule_type, rule in rules: if rule_type in (BIDIRECTIONAL_TRANSFORM, FORWARD_TRANSFORM): left, right = rule @@ -746,16 +846,17 @@ def parse_transform_rules(xml): left_pre_context = None left_pre_context_type = CONTEXT_TYPE_WORD_BOUNDARY else: - left_pre_context, _, _ = char_permutations(left_pre_context.strip()) + left_pre_context, _, _ = char_permutations(left_pre_context.strip(), current_filter=current_filter) left_pre_context_max_len = len(left_pre_context or []) left_pre_context = char_types_string(left_pre_context) + if charset_regex.search(left_pre_context): left_pre_context_type = CONTEXT_TYPE_REGEX else: left_pre_context_type = CONTEXT_TYPE_STRING if left: - left, _, left_groups = char_permutations(left.strip()) + left, _, left_groups = char_permutations(left.strip(), current_filter=current_filter) if left_groups: left_groups = format_groups(left, left_groups) else: @@ -767,7 +868,7 @@ def parse_transform_rules(xml): left_post_context = None left_post_context_type = CONTEXT_TYPE_WORD_BOUNDARY else: - left_post_context, _, _ = char_permutations(left_post_context.strip()) + left_post_context, _, _ = char_permutations(left_post_context.strip(), current_filter=current_filter) left_post_context_max_len = len(left_post_context or []) left_post_context = char_types_string(left_post_context) if charset_regex.search(left_post_context): @@ -776,14 +877,18 @@ def parse_transform_rules(xml): left_post_context_type = CONTEXT_TYPE_STRING if right: - right, move, right_groups = char_permutations(right.strip()) + right, move, right_groups = char_permutations(right.strip(), current_filter=current_filter) right = char_types_string(right) yield RULE, (left, left_pre_context_type, left_pre_context, left_pre_context_max_len, left_post_context_type, left_post_context, left_post_context_max_len, left_groups, right, move) - - elif rule_type == PRE_TRANSFORM and '[' in rule and ']' in rule: + elif rule_type == PRE_TRANSFORM and rule.strip(': ').startswith('('): continue + elif rule_type == PRE_TRANSFORM and '[' in rule and ']' in rule: + filter_rule = regex_char_set_greedy.search(rule) + current_filter = set(parse_regex_char_set(filter_rule.group(0))) + if 92 in current_filter: + raise TransliterationParseError(rule) elif rule_type == PRE_TRANSFORM: pre_transform = pre_transform_regex.match(rule) if pre_transform: @@ -831,8 +936,11 @@ def get_all_transform_rules(): to_latin.add(name) retain_transforms.add(name) + print 'doing', filename + steps = [] rule_set = [] + for rule_type, rule in parse_transform_rules(xml): if rule_type == RULE: rule = format_rule(rule)