[fix] InterIndic-Latin is an internal transliterator, but it is needed for most of the Indic languages. Also fix the string lengths for HTML entity replacements.

This commit is contained in:
Al
2015-05-29 19:47:49 -04:00
parent 8b56d63fde
commit 9547c93a38

View File

@@ -77,7 +77,6 @@ END_SET_CHAR = u"\x0f"
EXCLUDE_TRANSLITERATORS = set([
'hangul-latin',
'interindic-latin',
'jamo-latin',
# Don't care about spaced Han because our tokenizer does it already
'han-spacedhan',
@@ -570,6 +569,10 @@ def get_source_and_target(xml):
return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0]
def is_internal(xml):
    '''
    Check the transform's visibility attribute against "internal"

    Evaluates an XPath comparison on the parsed transform document and
    hands back whatever the xpath call produces for that expression.
    '''
    visibility_query = '//transform/@visibility="internal"'
    return xml.xpath(visibility_query)
def get_raw_rules_and_variables(xml):
'''
Parse tRule nodes from the transform XML
@@ -1099,11 +1102,8 @@ supplemental_transliterations = {
(u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', 'NULL', '0', 'NULL', '0'),
# ü => ue
(u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', 'NULL', '0', 'NULL', '0'),
# ß => ss
(u'"\\xc3\\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', 'NULL', '0', 'NULL', '0'),
]),
(PREPEND_STEP, [(quote_string(name), str(len(name) + 2), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), 'NULL', '0', 'NULL', '0')
(PREPEND_STEP, [(quote_string(name), str(len(name)), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), 'NULL', '0', 'NULL', '0')
for name, value in html_escapes.iteritems()
]
),
@@ -1141,11 +1141,12 @@ def get_all_transform_rules():
f = open(os.path.join(CLDR_TRANSFORMS_DIR, filename))
xml = etree.parse(f)
source, target = get_source_and_target(xml)
internal = is_internal(xml)
if name in EXCLUDE_TRANSLITERATORS:
continue
if (target.lower() == 'latin' or name == 'latin-ascii'):
if (target.lower() == 'latin' or name == 'latin-ascii') and not internal:
to_latin.add(name)
retain_transforms.add(name)