[fix] merging repeat codepoints in trie builder
This commit is contained in:
@@ -92,12 +92,12 @@ typedef struct transliteration_table {
|
||||
#define EMPTY_TRANSITION_CHAR "\x04"
|
||||
#define EMPTY_TRANSITION_CODEPOINT 4
|
||||
#define EMPTY_TRANSITION_CHAR_LEN strlen(EMPTY_TRANSITION_CHAR)
|
||||
#define REPEAT_ZERO_CHAR "\x05"
|
||||
#define REPEAT_ZERO_CODEPOINT 5
|
||||
#define REPEAT_ZERO_CHAR_LEN strlen(REPEAT_ZERO_CHAR)
|
||||
#define REPEAT_ONE_CHAR "\x06"
|
||||
#define REPEAT_ONE_CODEPOINT 6
|
||||
#define REPEAT_ONE_CHAR_LEN strlen(REPEAT_ONE_CHAR)
|
||||
#define REPEAT_CHAR "\x05"
|
||||
#define REPEAT_CODEPOINT 5
|
||||
/* Byte length of REPEAT_CHAR (evaluated with strlen at each use). */
#define REPEAT_CHAR_LEN strlen(REPEAT_CHAR)
|
||||
#define GROUP_INDICATOR_CHAR "\x06"
|
||||
#define GROUP_INDICATOR_CODEPOINT 6
|
||||
#define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
|
||||
#define BEGIN_SET_CHAR "\x0f"
|
||||
#define BEGIN_SET_CODEPOINT 15
|
||||
#define BEGIN_SET_CHAR_LEN strlen(BEGIN_SET_CHAR)
|
||||
@@ -105,9 +105,6 @@ typedef struct transliteration_table {
|
||||
#define END_SET_CODEPOINT 14
|
||||
#define END_SET_CHAR_LEN strlen(END_SET_CHAR)
|
||||
|
||||
#define GROUP_INDICATOR_CHAR "\x10"
|
||||
#define GROUP_INDICATOR_CODEPOINT 16
|
||||
#define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
|
||||
|
||||
#define DOLLAR_CODEPOINT 36
|
||||
|
||||
|
||||
@@ -96,21 +96,6 @@ string_tree_t *regex_string_tree(char *regex, size_t len) {
|
||||
} else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) {
|
||||
log_debug("group\n");
|
||||
add_to_index = false;
|
||||
} else if (codepoint == STAR_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
|
||||
log_debug("star\n");
|
||||
// For *, we add an optional transition to the empty
|
||||
codepoint = REPEAT_ZERO_CODEPOINT;
|
||||
} else if (codepoint == PLUS_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
|
||||
log_debug("plus\n");
|
||||
codepoint = REPEAT_ONE_CODEPOINT;
|
||||
} else if (codepoint == DOLLAR_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
|
||||
log_debug("dollar\n");
|
||||
codepoint = WORD_BOUNDARY_CODEPOINT;
|
||||
|
||||
if (in_set) {
|
||||
uint32_array_push(char_set, codepoint);
|
||||
add_to_index = false;
|
||||
}
|
||||
} else if (in_set) {
|
||||
log_debug("in set\n");
|
||||
// Queue node, we'll add them to the trie
|
||||
|
||||
Reference in New Issue
Block a user