[transliteration] Adding language-specific transliterators for handling umlauts in German + special transliterations in the Nordic languages. It may still result in some wrong transliterations if the language classifier is wrong, but generally it's accurate enough that its predictions can be relied upon. Also adding a Latin-ASCII-Simple transform which only does the punctuation portion of Latin-ASCII so it won't change anything substantial about the input string.

This commit is contained in:
Al
2016-08-20 18:17:35 -04:00
parent 85ae5d4a05
commit cb4408fea8

View File

@@ -17,6 +17,7 @@ import itertools
import os import os
import re import re
import requests import requests
import six
import sys import sys
import time import time
import urlparse import urlparse
@@ -92,6 +93,10 @@ EXCLUDE_TRANSLITERATORS = set([
'korean-latin-bgn', 'korean-latin-bgn',
]) ])
TRANSLITERATOR_ALIASES = {
'greek_latin_ungegn': 'greek-latin-ungegn'
}
NFD = 'NFD' NFD = 'NFD'
NFKD = 'NFKD' NFKD = 'NFKD'
NFC = 'NFC' NFC = 'NFC'
@@ -308,7 +313,7 @@ char_set_scanner = Scanner([
NUM_CODEPOINTS_16 = 65536 NUM_CODEPOINTS_16 = 65536
all_chars = set([unichr(i) for i in xrange(NUM_CODEPOINTS_16)]) all_chars = set([unichr(i) for i in six.moves.xrange(NUM_CODEPOINTS_16)])
control_chars = set([c for c in all_chars if unicodedata.category(c) in ('Cc', 'Cn', 'Cs')]) control_chars = set([c for c in all_chars if unicodedata.category(c) in ('Cc', 'Cn', 'Cs')])
@@ -317,11 +322,16 @@ def get_transforms(d=CLDR_TRANSFORMS_DIR):
return [f for f in os.listdir(d) if f.endswith('.xml')] return [f for f in os.listdir(d) if f.endswith('.xml')]
def parse_transform_file(filename, d=CLDR_TRANSFORMS_DIR):
f = open(os.path.join(d, filename))
xml = etree.parse(f)
return xml
def parse_transforms(d=CLDR_TRANSFORMS_DIR): def parse_transforms(d=CLDR_TRANSFORMS_DIR):
for filename in get_transforms(d=d): for filename in get_transforms(d=d):
name = filename.split('.xml')[0].lower() name = filename.split('.xml')[0].lower()
f = open(os.path.join(d, filename)) xml = parse_transform_file(filename)
xml = etree.parse(f)
yield filename, name, xml yield filename, name, xml
@@ -346,6 +356,10 @@ def parse_regex_char_range(regex):
return chars return chars
chars = get_chars_by_script()
all_scripts = build_master_scripts_list(chars)
script_codes = {k.lower(): v.lower() for k, v in six.iteritems(get_script_codes(all_scripts))}
def parse_regex_char_class(c, current_filter=all_chars): def parse_regex_char_class(c, current_filter=all_chars):
chars = [] chars = []
@@ -372,7 +386,10 @@ def parse_regex_char_class(c, current_filter=all_chars):
elif prop == BLOCK_PROP: elif prop == BLOCK_PROP:
chars = unicode_blocks[value.lower()] chars = unicode_blocks[value.lower()]
elif prop == SCRIPT_PROP: elif prop == SCRIPT_PROP:
chars = unicode_scripts[value.lower()] if value.lower() in unicode_scripts:
chars = unicode_scripts[value.lower()]
elif value.lower() in script_codes:
chars = unicode_scripts[script_codes[value.lower()]]
elif prop == WORD_BREAK_PROP: elif prop == WORD_BREAK_PROP:
chars = unicode_word_breaks[value] chars = unicode_word_breaks[value]
else: else:
@@ -394,6 +411,8 @@ def parse_regex_char_class(c, current_filter=all_chars):
elif c.lower() in unicode_scripts: elif c.lower() in unicode_scripts:
chars = unicode_scripts[c.lower()] chars = unicode_scripts[c.lower()]
elif c.lower() in script_codes:
chars = unicode_scripts[script_codes[c.lower()]]
elif c.lower() in unicode_properties: elif c.lower() in unicode_properties:
chars = unicode_properties[c.lower()] chars = unicode_properties[c.lower()]
else: else:
@@ -518,6 +537,7 @@ for name, regex_range in unicode_property_regexes:
init_unicode_categories() init_unicode_categories()
hangul_jamo_latin_filter = set(parse_regex_char_set("[['ᄀ-하-ᅵᆨ-ᇂ가-힣ㄱ-ㄿㅁ-ㅃㅅ-ㅣ㈀-㈜㉠-㉻가-힣'ᄀ-ᆵᄆ-ᄈᄉ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-][:Latin:]]")) hangul_jamo_latin_filter = set(parse_regex_char_set("[['ᄀ-하-ᅵᆨ-ᇂ가-힣ㄱ-ㄿㅁ-ㅃㅅ-ㅣ㈀-㈜㉠-㉻가-힣'ᄀ-ᆵᄆ-ᄈᄉ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-][:Latin:]]"))
custom_filters = { custom_filters = {
@@ -525,9 +545,12 @@ custom_filters = {
} }
def get_source_and_target(xml): def get_source_and_target(name):
return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0] name = TRANSLITERATOR_ALIASES.get(name.lower(), name.lower())
components = name.split('-')[:2]
if len(components) < 2:
raise Exception(name)
return components
def is_internal(xml): def is_internal(xml):
return xml.xpath('//transform/@visibility="internal"') return xml.xpath('//transform/@visibility="internal"')
@@ -1130,11 +1153,11 @@ PREPEND_STEP = 'PREPEND_STEP'
html_escapes = {'&{};'.format(name): safe_encode(wide_unichr(value)) html_escapes = {'&{};'.format(name): safe_encode(wide_unichr(value))
for name, value in htmlentitydefs.name2codepoint.iteritems() for name, value in six.iteritems(htmlentitydefs.name2codepoint)
} }
html_escapes.update({'&#{};'.format(i): safe_encode(wide_unichr(i)) html_escapes.update({'&#{};'.format(i): safe_encode(wide_unichr(i))
for i in xrange(NUM_CODEPOINTS_16) for i in six.moves.xrange(NUM_CODEPOINTS_16)
}) })
# [[:Latin] & [:Ll:]] # [[:Latin] & [:Ll:]]
@@ -1144,15 +1167,16 @@ latin_lower_rule = '[{}]'.format(latin_lower_set)
latin_lower_rule_len = len(latin_lower_rule.decode('string-escape')) latin_lower_rule_len = len(latin_lower_rule.decode('string-escape'))
latin_lower_rule = quote_string(latin_lower_rule) latin_lower_rule = quote_string(latin_lower_rule)
# Extra rules defined here html_escape_step = [(quote_string(name), str(len(name)), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(escape_string(value)), str(len(value)), 'NULL', '0', 'NULL', '0')
supplemental_transliterations = { for name, value in six.iteritems(html_escapes)
'latin-ascii': [ ]
# Prepend transformations get applied in the reverse order of their appearance here
(PREPEND_STEP, [(quote_string(name), str(len(name)), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(escape_string(value)), str(len(value)), 'NULL', '0', 'NULL', '0') extra_transforms = {
for name, value in html_escapes.iteritems() 'html-escape': [
] (STEP_RULESET, html_escape_step)
), ],
(PREPEND_STEP, [ 'german-ascii': [
(STEP_RULESET, [
# German transliterations not handled by standard NFD normalization # German transliterations not handled by standard NFD normalization
# ä => ae # ä => ae
(u'"\\xc3\\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', 'NULL', '0', 'NULL', '0'), (u'"\\xc3\\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', 'NULL', '0', 'NULL', '0'),
@@ -1177,6 +1201,21 @@ supplemental_transliterations = {
(u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', latin_lower_rule, str(latin_lower_rule_len), u'"Ue"', '2', 'NULL', '0', 'NULL', '0'), (u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', latin_lower_rule, str(latin_lower_rule_len), u'"Ue"', '2', 'NULL', '0', 'NULL', '0'),
# Ü => UE otherwise # Ü => UE otherwise (NOTE(review): rule bytes \xc3\x96 encode Ö, but Ü is \xc3\x9c — comment and rule disagree, verify which is intended)
(u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"UE"', '2', 'NULL', '0', 'NULL', '0'), (u'"\\xc3\\x96"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"UE"', '2', 'NULL', '0', 'NULL', '0'),
]),
(STEP_TRANSFORM, 'latin-ascii'),
],
'scandinavian-ascii': [
(STEP_RULESET, [
# Danish/Norwegian transliterations (ø/Ø) not handled by standard NFD normalization
(u'"\\xc3\\xb8"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', 'NULL', '0', 'NULL', '0'),
# Ø => Oe if followed by lower case Latin letter
(u'"\\xc3\\x98"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_REGEX, '1', latin_lower_rule, str(latin_lower_rule_len), u'"Oe"', '2', 'NULL', '0', 'NULL', '0'),
# Ø => OE otherwise
(u'"\\xc3\\x98"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"OE"', '2', 'NULL', '0', 'NULL', '0'),
# Swedish transliterations not handled by standard NFD normalization # Swedish transliterations not handled by standard NFD normalization
(u'"\\xc3\\xa5"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"aa"', '2', 'NULL', '0', 'NULL', '0'), (u'"\\xc3\\xa5"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"aa"', '2', 'NULL', '0', 'NULL', '0'),
@@ -1186,14 +1225,55 @@ supplemental_transliterations = {
# Å => AA otherwise # Å => AA otherwise
(u'"\\xc3\\x85"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"AA"', '2', 'NULL', '0', 'NULL', '0'), (u'"\\xc3\\x85"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"AA"', '2', 'NULL', '0', 'NULL', '0'),
]), ]),
(STEP_TRANSFORM, 'latin-ascii'),
],
}
# Extra rules defined here
supplemental_transliterations = {
'latin-ascii': [
# Prepend transformations get applied in the reverse order of their appearance here
(PREPEND_STEP, [(STEP_TRANSFORM, 'html-escape')]),
], ],
} }
def simple_latin_ruleset():
xml = parse_transform_file('Latin-ASCII.xml')
category_chars = get_unicode_categories()
cats = [None] * 0x10ffff
for cat, chars in six.iteritems(category_chars):
for c in chars:
cats[wide_ord(c)] = cat
ruleset = [(STEP_TRANSFORM, 'html-escape')]
simple_rules = []
for rule_type, rule in parse_transform_rules('latin-ascii', xml):
if rule_type == RULE:
key = safe_decode(rule[0])
pre_context_type = rule[1]
post_context_type = rule[4]
value = safe_decode(rule[8])
if len(key) == 1 and len(value) == 1 and pre_context_type == CONTEXT_TYPE_NONE and post_context_type == CONTEXT_TYPE_NONE:
cat = cats[wide_ord(key)]
# Only use punctuation rules, not numeric
if not cat.startswith('L') and not cat.startswith('N'):
simple_rules.append(format_rule(rule))
ruleset.append((STEP_RULESET, simple_rules))
return ruleset
extra_transforms['latin-ascii-simple'] = simple_latin_ruleset()
def get_all_transform_rules(): def get_all_transform_rules():
transforms = {} transforms = {}
to_latin = set() to_latin = set()
@@ -1215,7 +1295,7 @@ def get_all_transform_rules():
all_transforms.add(BIDIRECTIONAL_TRANSLITERATORS[name]) all_transforms.add(BIDIRECTIONAL_TRANSLITERATORS[name])
for filename, name, xml in parse_transforms(): for filename, name, xml in parse_transforms():
source, target = get_source_and_target(xml) source, target = get_source_and_target(name)
name_alias = '-'.join([source.lower(), target.lower()]) name_alias = '-'.join([source.lower(), target.lower()])
if name_alias not in name_aliases and name_alias != name: if name_alias not in name_aliases and name_alias != name:
name_aliases[name_alias] = name name_aliases[name_alias] = name
@@ -1277,7 +1357,7 @@ def get_all_transform_rules():
return steps return steps
for filename, name, xml in parse_transforms(): for filename, name, xml in parse_transforms():
source, target = get_source_and_target(xml) source, target = get_source_and_target(name)
internal = is_internal(xml) internal = is_internal(xml)
if name in EXCLUDE_TRANSLITERATORS: if name in EXCLUDE_TRANSLITERATORS:
@@ -1301,6 +1381,8 @@ def get_all_transform_rules():
dependency_queue = deque(to_latin) dependency_queue = deque(to_latin)
retain_transforms |= to_latin retain_transforms |= to_latin
print retain_transforms
seen = set() seen = set()
while dependency_queue: while dependency_queue:
@@ -1334,10 +1416,10 @@ def get_all_transform_rules():
# Only care if it's a transform to Latin/ASCII or a dependency # Only care if it's a transform to Latin/ASCII or a dependency
# for a transform to Latin/ASCII # for a transform to Latin/ASCII
if name not in retain_transforms and normalized_name not in retain_transforms: if name not in retain_transforms and normalized_name not in retain_transforms:
print 'skipping', filename print('skipping {}'.format(filename))
continue continue
print 'doing', filename print('doing {}'.format(filename))
if not reverse and not bidirectional: if not reverse and not bidirectional:
steps = parse_steps(name, xml, reverse=False) steps = parse_steps(name, xml, reverse=False)
@@ -1353,13 +1435,15 @@ def get_all_transform_rules():
steps = parse_steps(name, xml, reverse=True) steps = parse_steps(name, xml, reverse=True)
transforms[name] = steps transforms[name] = steps
transforms.update(extra_transforms)
for name, steps in transforms.iteritems(): for name, steps in transforms.iteritems():
if name in supplemental_transliterations: if name in supplemental_transliterations:
for step_type, rules in supplemental_transliterations[name]: for step_type, rules in supplemental_transliterations[name]:
if step_type == EXISTING_STEP: if step_type == EXISTING_STEP:
steps[-1][1].extend(rules) steps[-1][1].extend(rules)
elif step_type == PREPEND_STEP: elif step_type == PREPEND_STEP:
steps = [(STEP_RULESET, rules)] + steps steps = rules + steps
else: else:
steps.append((STEP_RULESET, rules)) steps.append((STEP_RULESET, rules))
step_index = len(steps_data) step_index = len(steps_data)
@@ -1436,7 +1520,12 @@ script_transliterators = {
'canadian_aboriginal': {None: ['canadianaboriginal-latin']}, 'canadian_aboriginal': {None: ['canadianaboriginal-latin']},
'cham': None, 'cham': None,
'cherokee': None, 'cherokee': None,
'common': {None: ['latin-ascii']}, 'common': {None: ['latin-ascii'],
'de': ['german-ascii'],
'da': ['scandinavian-ascii', 'latin-ascii'],
'nb': ['scandinavian-ascii', 'latin-ascii'],
'sv': ['scandinavian-ascii', 'latin-ascii'],
},
'coptic': None, 'coptic': None,
'cyrillic': {None: ['cyrillic-latin'], 'cyrillic': {None: ['cyrillic-latin'],
'be': ['belarusian-latin-bgn'], 'be': ['belarusian-latin-bgn'],
@@ -1469,7 +1558,12 @@ script_transliterators = {
'kayah_li': None, 'kayah_li': None,
'khmer': None, 'khmer': None,
'lao': None, 'lao': None,
'latin': {None: ['latin-ascii']}, 'latin': {None: ['latin-ascii'],
'de': ['german-ascii'],
'da': ['scandinavian-ascii', 'latin-ascii'],
'nb': ['scandinavian-ascii', 'latin-ascii'],
'sv': ['scandinavian-ascii', 'latin-ascii'],
},
'lepcha': None, 'lepcha': None,
'limbu': None, 'limbu': None,
'lisu': None, 'lisu': None,
@@ -1550,8 +1644,8 @@ def write_transliteration_data_file(filename):
try: try:
r = u','.join(r) r = u','.join(r)
except Exception: except Exception:
print 'Problem with rule' print('Exception in rule')
print r print(r)
all_rules = u''', all_rules = u''',
'''.join([u'{{{}}}'.format(u','.join(r)) for r in rules]) '''.join([u'{{{}}}'.format(u','.join(r)) for r in rules])