[fix] InterIndic-Latin is an internal transliterator, but it is needed for most of the Indic languages, so it is no longer excluded. Also fixes the string lengths for HTML entity replacements.
This commit is contained in:
@@ -77,7 +77,6 @@ END_SET_CHAR = u"\x0f"
|
|||||||
|
|
||||||
EXCLUDE_TRANSLITERATORS = set([
|
EXCLUDE_TRANSLITERATORS = set([
|
||||||
'hangul-latin',
|
'hangul-latin',
|
||||||
'interindic-latin',
|
|
||||||
'jamo-latin',
|
'jamo-latin',
|
||||||
# Don't care about spaced Han because our tokenizer does it already
|
# Don't care about spaced Han because our tokenizer does it already
|
||||||
'han-spacedhan',
|
'han-spacedhan',
|
||||||
@@ -570,6 +569,10 @@ def get_source_and_target(xml):
|
|||||||
return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0]
|
return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0]
|
||||||
|
|
||||||
|
|
||||||
|
def is_internal(xml):
# Report whether the parsed transform XML is marked as internal.
# The XPath below is a comparison expression rather than a node selection:
# it is true when a <transform> element's visibility attribute equals
# "internal", so the result is used as a flag, not as a list of nodes.
# NOTE(review): assumes `xml` is an lxml tree/element (it must provide
# .xpath()) and that lxml returns a Python bool for boolean XPath results -
# confirm against the lxml docs.
|
||||||
|
return xml.xpath('//transform/@visibility="internal"')
|
||||||
|
|
||||||
|
|
||||||
def get_raw_rules_and_variables(xml):
|
def get_raw_rules_and_variables(xml):
|
||||||
'''
|
'''
|
||||||
Parse tRule nodes from the transform XML
|
Parse tRule nodes from the transform XML
|
||||||
@@ -1099,11 +1102,8 @@ supplemental_transliterations = {
|
|||||||
(u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', 'NULL', '0', 'NULL', '0'),
|
(u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', 'NULL', '0', 'NULL', '0'),
|
||||||
# ü => ue
|
# ü => ue
|
||||||
(u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', 'NULL', '0', 'NULL', '0'),
|
(u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', 'NULL', '0', 'NULL', '0'),
|
||||||
# ß => ss
|
|
||||||
(u'"\\xc3\\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', 'NULL', '0', 'NULL', '0'),
|
|
||||||
|
|
||||||
]),
|
]),
|
||||||
(PREPEND_STEP, [(quote_string(name), str(len(name) + 2), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), 'NULL', '0', 'NULL', '0')
|
(PREPEND_STEP, [(quote_string(name), str(len(name)), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), 'NULL', '0', 'NULL', '0')
|
||||||
for name, value in html_escapes.iteritems()
|
for name, value in html_escapes.iteritems()
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
@@ -1141,11 +1141,12 @@ def get_all_transform_rules():
|
|||||||
f = open(os.path.join(CLDR_TRANSFORMS_DIR, filename))
|
f = open(os.path.join(CLDR_TRANSFORMS_DIR, filename))
|
||||||
xml = etree.parse(f)
|
xml = etree.parse(f)
|
||||||
source, target = get_source_and_target(xml)
|
source, target = get_source_and_target(xml)
|
||||||
|
internal = is_internal(xml)
|
||||||
|
|
||||||
if name in EXCLUDE_TRANSLITERATORS:
|
if name in EXCLUDE_TRANSLITERATORS:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (target.lower() == 'latin' or name == 'latin-ascii'):
|
if (target.lower() == 'latin' or name == 'latin-ascii') and not internal:
|
||||||
to_latin.add(name)
|
to_latin.add(name)
|
||||||
retain_transforms.add(name)
|
retain_transforms.add(name)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user