[transliteration] Adding HTML entity escapes (e.g. &amp;) as a prepended first step in the Latin-ASCII transformation

This commit is contained in:
Al
2015-05-20 14:44:55 -04:00
parent 1fee0a3e35
commit d65f7747f0

View File

@@ -1066,21 +1066,33 @@ STEP_UNICODE_NORMALIZATION = 'STEP_UNICODE_NORMALIZATION'
NEW_STEP = 'NEW_STEP' NEW_STEP = 'NEW_STEP'
EXISTING_STEP = 'EXISTING_STEP' EXISTING_STEP = 'EXISTING_STEP'
PREPEND_STEP = 'PREPEND_STEP'
html_escapes = {'&{};'.format(name): escape_string(safe_encode(unichr(value)))
for name, value in htmlentitydefs.name2codepoint.iteritems()
}
# Extra rules defined here # Extra rules defined here
supplemental_transliterations = { supplemental_transliterations = {
'latin-ascii': (EXISTING_STEP, [ 'latin-ascii': [
# German transliterations not handled by standard NFD normalization (PREPEND_STEP, [(quote_string(name), str(len(name) + 2), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), '0', 'NULL', '0')
# ä => ae for name, value in html_escapes.iteritems()
(u'"\\xc3\\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'), ]
# ö => oe ),
(u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'), (EXISTING_STEP, [
# ü => ue # German transliterations not handled by standard NFD normalization
(u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'), # ä => ae
# ß => ss (u'"\\xc3\\xa4"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ae"', '2', '0', 'NULL', '0'),
(u'"\\xc3\\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'), # ö => oe
(u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', '0', 'NULL', '0'),
# ü => ue
(u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', '0', 'NULL', '0'),
# ß => ss
(u'"\\xc3\\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', '0', 'NULL', '0'),
]), ]),
],
} }
@@ -1175,11 +1187,13 @@ def get_all_transform_rules():
for name, steps in transforms.iteritems(): for name, steps in transforms.iteritems():
if name in supplemental_transliterations: if name in supplemental_transliterations:
step_type, rules = supplemental_transliterations[name] for step_type, rules in supplemental_transliterations[name]:
if step_type == EXISTING_STEP: if step_type == EXISTING_STEP:
steps[-1][1].extend(rules) steps[-1][1].extend(rules)
else: elif step_type == PREPEND_STEP:
steps[-1].append((STEP_RULESET, rules)) steps = [(STEP_RULESET, rules)] + steps
else:
steps.append((STEP_RULESET, rules))
# Only care if it's a transform to Latin/ASCII or a dependency # Only care if it's a transform to Latin/ASCII or a dependency
# for a transform to Latin/ASCII # for a transform to Latin/ASCII
elif name not in retain_transforms: elif name not in retain_transforms: