[fix] minimize c* into (c|'')+, using an empty transition instead of a zero-length string

This commit is contained in:
Al
2015-05-22 18:11:54 -04:00
parent b2d15b29cf
commit c00ecf6ea8
2 changed files with 54 additions and 49 deletions

View File

@@ -69,11 +69,10 @@ WORD_BOUNDARY_CHAR = u"\x01"
PRE_CONTEXT_CHAR = u"\x02"
POST_CONTEXT_CHAR = u"\x03"
EMPTY_TRANSITION_CHAR = u"\x04"
REPEAT_ZERO_CHAR = u"\x05"
REPEAT_ONE_CHAR = u"\x06"
REPEAT_CHAR = u"\x05"
GROUP_INDICATOR_CHAR = u"\x06"
BEGIN_SET_CHAR = u"\x0e"
END_SET_CHAR = u"\x0f"
GROUP_INDICATOR_CHAR = u"\x10"
EXCLUDE_TRANSLITERATORS = set([
@@ -276,6 +275,7 @@ CHARACTER = 'CHARACTER'
WIDE_CHARACTER = 'WIDE_CHARACTER'
REVISIT = 'REVISIT'
REPEAT = 'REPEAT'
REPEAT_ONE = 'REPEAT_ONE'
LPAREN = 'LPAREN'
RPAREN = 'RPAREN'
WHITESPACE = 'WHITESPACE'
@@ -324,7 +324,7 @@ transform_scanner = Scanner([
(r'\|', REVISIT),
(r'&.*?;', HTML_ENTITY),
(r'(?<![\\])\*', REPEAT),
(r'(?<![\\])\+', PLUS),
(r'(?<![\\])\+', REPEAT_ONE),
('(?<=[^\s])\?', OPTIONAL),
('\(', LPAREN),
('\)', RPAREN),
@@ -648,6 +648,10 @@ def char_permutations(s, current_filter=all_chars):
a list of character permutations, in addition to keeping
track of revisits and regex groups
'''
if not s:
return [EMPTY_TRANSITION_CHAR], 0, []
char_types = []
move = 0
in_revisit = False
@@ -697,9 +701,10 @@ def char_permutations(s, current_filter=all_chars):
elif token_type == REVISIT:
in_revisit = True
elif token_type == REPEAT:
char_types.append([REPEAT_ZERO_CHAR])
elif token_type == PLUS:
char_types.append([REPEAT_ONE_CHAR])
char_types[-1].append(EMPTY_TRANSITION_CHAR)
char_types.append([REPEAT_CHAR])
elif token_type == REPEAT_ONE:
char_types.append([REPEAT_CHAR])
elif token_type == OPTIONAL:
char_types[-1].append(EMPTY_TRANSITION_CHAR)
elif token_type == REVISIT: