From c00ecf6ea8bb8099f8442dc5eb65a7630c737fad Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 22 May 2015 18:11:54 -0400 Subject: [PATCH] [fix] minimizing c* into (c|'')+, using empty transition instead of zero-length string --- scripts/geodata/i18n/transliteration_rules.py | 19 +++-- src/transliteration_data.c | 84 +++++++++---------- 2 files changed, 54 insertions(+), 49 deletions(-) diff --git a/scripts/geodata/i18n/transliteration_rules.py b/scripts/geodata/i18n/transliteration_rules.py index 41d56a55..46966c8e 100644 --- a/scripts/geodata/i18n/transliteration_rules.py +++ b/scripts/geodata/i18n/transliteration_rules.py @@ -69,11 +69,10 @@ WORD_BOUNDARY_CHAR = u"\x01" PRE_CONTEXT_CHAR = u"\x02" POST_CONTEXT_CHAR = u"\x03" EMPTY_TRANSITION_CHAR = u"\x04" -REPEAT_ZERO_CHAR = u"\x05" -REPEAT_ONE_CHAR = u"\x06" +REPEAT_CHAR = u"\x05" +GROUP_INDICATOR_CHAR = u"\x06" BEGIN_SET_CHAR = u"\x0e" END_SET_CHAR = u"\x0f" -GROUP_INDICATOR_CHAR = u"\x10" EXCLUDE_TRANSLITERATORS = set([ @@ -276,6 +275,7 @@ CHARACTER = 'CHARACTER' WIDE_CHARACTER = 'WIDE_CHARACTER' REVISIT = 'REVISIT' REPEAT = 'REPEAT' +REPEAT_ONE = 'REPEAT_ONE' LPAREN = 'LPAREN' RPAREN = 'RPAREN' WHITESPACE = 'WHITESPACE' @@ -324,7 +324,7 @@ transform_scanner = Scanner([ (r'\|', REVISIT), (r'&.*?;', HTML_ENTITY), (r'(?