From 9547c93a3873b68fd810937b7e66a20a9b80ec45 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 May 2015 19:47:49 -0400 Subject: [PATCH] [fix] InterIndic-Latin is an internal transliterator, but needed for most of the Indic languages. Also fixing the string lengths for HTML entity replacements --- scripts/geodata/i18n/transliteration_rules.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/geodata/i18n/transliteration_rules.py b/scripts/geodata/i18n/transliteration_rules.py index 67ba0e16..74497ce1 100644 --- a/scripts/geodata/i18n/transliteration_rules.py +++ b/scripts/geodata/i18n/transliteration_rules.py @@ -77,7 +77,6 @@ END_SET_CHAR = u"\x0f" EXCLUDE_TRANSLITERATORS = set([ 'hangul-latin', - 'interindic-latin', 'jamo-latin', # Don't care about spaced Han because our tokenizer does it already 'han-spacedhan', @@ -570,6 +569,10 @@ def get_source_and_target(xml): return xml.xpath('//transform/@source')[0], xml.xpath('//transform/@target')[0] +def is_internal(xml): + return xml.xpath('//transform/@visibility="internal"') + + def get_raw_rules_and_variables(xml): ''' Parse tRule nodes from the transform XML @@ -1099,11 +1102,8 @@ supplemental_transliterations = { (u'"\\xc3\\xb6"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"oe"', '2', 'NULL', '0', 'NULL', '0'), # ü => ue (u'"\\xc3\\xbc"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ue"', '2', 'NULL', '0', 'NULL', '0'), - # ß => ss - (u'"\\xc3\\x9f"', '2', CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', u'"ss"', '2', 'NULL', '0', 'NULL', '0'), - ]), - (PREPEND_STEP, [(quote_string(name), str(len(name) + 2), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), 'NULL', '0', 'NULL', '0') + (PREPEND_STEP, [(quote_string(name), str(len(name)), CONTEXT_TYPE_NONE, '0', 'NULL', '0', CONTEXT_TYPE_NONE, '0', 'NULL', '0', quote_string(value), str(len(value)), 'NULL', '0', 'NULL', '0') for name, value in html_escapes.iteritems() ] ), @@ -1141,11 +1141,12 @@ def get_all_transform_rules(): f = open(os.path.join(CLDR_TRANSFORMS_DIR, filename)) xml = etree.parse(f) source, target = get_source_and_target(xml) + internal = is_internal(xml) if name in EXCLUDE_TRANSLITERATORS: continue - if (target.lower() == 'latin' or name == 'latin-ascii'): + if (target.lower() == 'latin' or name == 'latin-ascii') and not internal: to_latin.add(name) retain_transforms.add(name)