diff --git a/scripts/geodata/i18n/transliteration_rules.py b/scripts/geodata/i18n/transliteration_rules.py index 0f50cc61..3534a319 100644 --- a/scripts/geodata/i18n/transliteration_rules.py +++ b/scripts/geodata/i18n/transliteration_rules.py @@ -53,6 +53,11 @@ REVISIT_INDICATOR = '|' WORD_BOUNDARY_VAR_NAME = 'wordBoundary' WORD_BOUNDARY_VAR = '${}'.format(WORD_BOUNDARY_VAR_NAME) +START_OF_HAN_VAR_NAME = 'startOfHanMarker' +START_OF_HAN_VAR = '${}'.format(START_OF_HAN_VAR_NAME) + +start_of_han_regex = re.compile(START_OF_HAN_VAR.replace('$', '\$')) + word_boundary_var_regex = re.compile(WORD_BOUNDARY_VAR.replace('$', '\$')) WORD_BOUNDARY_CHAR = u'\u0001' @@ -70,11 +75,12 @@ BEGIN_SET_CHAR = u"\x07" END_SET_CHAR = u"\x08" GROUP_INDICATOR_CHAR = u"\x09" + EXCLUDE_TRANSLITERATORS = set([ 'hangul-latin', 'interindic-latin', 'jamo-latin', - # Don't care about spaced Han because + # Don't care about spaced Han because our tokenizer does it already 'han-spacedhan', ]) @@ -197,7 +203,8 @@ all_transforms = set() pre_transform_full_regex = re.compile('::[\s]*(.*)[\s]*', re.UNICODE) pre_transform_regex = re.compile('[\s]*([^\s\(\)]*)[\s]*(?:\(.*\)[\s]*)?', re.UNICODE) -transform_regex = re.compile(u"(?:[\s]*(?!=[\s])(.*)(?)|[←<→>↔=])(?:[\s]*(?!=[\s])(.*)(?)|[←<→>↔])(?:[\s]*(?!=[\s])(.*)(?