From 1f3ac0c3f94128d11a3abcf4ccc99efd42db7b2e Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 14 May 2015 16:34:03 -0400 Subject: [PATCH] [transliteration] using a proper lexer on the entire rule to correct some parses, allowing bracketed multiple characters in sets, fixing optionals --- scripts/geodata/i18n/transliteration_rules.py | 145 +++++++++++++++--- 1 file changed, 120 insertions(+), 25 deletions(-) diff --git a/scripts/geodata/i18n/transliteration_rules.py b/scripts/geodata/i18n/transliteration_rules.py index 3534a319..b7bc5a4d 100644 --- a/scripts/geodata/i18n/transliteration_rules.py +++ b/scripts/geodata/i18n/transliteration_rules.py @@ -71,9 +71,9 @@ POST_CONTEXT_CHAR = u"\x03" EMPTY_TRANSITION_CHAR = u"\x04" REPEAT_ZERO_CHAR = u"\x05" REPEAT_ONE_CHAR = u"\x06" -BEGIN_SET_CHAR = u"\x07" -END_SET_CHAR = u"\x08" -GROUP_INDICATOR_CHAR = u"\x09" +BEGIN_SET_CHAR = u"\x0e" +END_SET_CHAR = u"\x0f" +GROUP_INDICATOR_CHAR = u"\x10" EXCLUDE_TRANSLITERATORS = set([ @@ -246,7 +246,7 @@ char_class_regex_str = '\[(?:[^\[\]]*\[[^\[\]]*\][^\[\]]*)*[^\[\]]*\]' nested_char_class_regex = re.compile('\[(?:[^\[\]]*\[[^\[\]]*\][^\[\]]*)+[^\[\]]*\]', re.UNICODE) range_regex = re.compile(r'[\\]?([^\\])\-[\\]?([^\\])', re.UNICODE) -var_regex = re.compile('\$([A-Za-z_\-]+)') +var_regex = re.compile('[\s]*\$([A-Za-z_\-]+)[\s]*') context_regex = re.compile(u'(?:[\s]*(?!=[\s])(.*?)(?