[fix] merging repeat codepoints in trie builder
This commit is contained in:
@@ -92,12 +92,12 @@ typedef struct transliteration_table {
|
|||||||
#define EMPTY_TRANSITION_CHAR "\x04"
|
#define EMPTY_TRANSITION_CHAR "\x04"
|
||||||
#define EMPTY_TRANSITION_CODEPOINT 4
|
#define EMPTY_TRANSITION_CODEPOINT 4
|
||||||
#define EMPTY_TRANSITION_CHAR_LEN strlen(EMPTY_TRANSITION_CHAR)
|
#define EMPTY_TRANSITION_CHAR_LEN strlen(EMPTY_TRANSITION_CHAR)
|
||||||
#define REPEAT_ZERO_CHAR "\x05"
|
#define REPEAT_CHAR "\x05"
|
||||||
#define REPEAT_ZERO_CODEPOINT 5
|
#define REPEAT_CODEPOINT 5
|
||||||
#define REPEAT_ZERO_CHAR_LEN strlen(REPEAT_ZERO_CHAR)
|
/* Byte length of REPEAT_CHAR, excluding the terminating NUL. */
#define REPEAT_CHAR_LEN strlen(REPEAT_CHAR)
|
||||||
#define REPEAT_ONE_CHAR "\x06"
|
#define GROUP_INDICATOR_CHAR "\x06"
|
||||||
#define REPEAT_ONE_CODEPOINT 6
|
#define GROUP_INDICATOR_CODEPOINT 6
|
||||||
#define REPEAT_ONE_CHAR_LEN strlen(REPEAT_ONE_CHAR)
|
#define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
|
||||||
#define BEGIN_SET_CHAR "\x0f"
|
#define BEGIN_SET_CHAR "\x0f"
|
||||||
#define BEGIN_SET_CODEPOINT 15
|
#define BEGIN_SET_CODEPOINT 15
|
||||||
#define BEGIN_SET_CHAR_LEN strlen(BEGIN_SET_CHAR)
|
#define BEGIN_SET_CHAR_LEN strlen(BEGIN_SET_CHAR)
|
||||||
@@ -105,9 +105,6 @@ typedef struct transliteration_table {
|
|||||||
#define END_SET_CODEPOINT 14
|
#define END_SET_CODEPOINT 14
|
||||||
#define END_SET_CHAR_LEN strlen(END_SET_CHAR)
|
#define END_SET_CHAR_LEN strlen(END_SET_CHAR)
|
||||||
|
|
||||||
#define GROUP_INDICATOR_CHAR "\x10"
|
|
||||||
#define GROUP_INDICATOR_CODEPOINT 16
|
|
||||||
#define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
|
|
||||||
|
|
||||||
#define DOLLAR_CODEPOINT 36
|
#define DOLLAR_CODEPOINT 36
|
||||||
|
|
||||||
|
|||||||
@@ -96,21 +96,6 @@ string_tree_t *regex_string_tree(char *regex, size_t len) {
|
|||||||
} else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) {
|
} else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) {
|
||||||
log_debug("group\n");
|
log_debug("group\n");
|
||||||
add_to_index = false;
|
add_to_index = false;
|
||||||
} else if (codepoint == STAR_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
|
|
||||||
log_debug("star\n");
|
|
||||||
// For *, we add an optional transition to the empty
|
|
||||||
codepoint = REPEAT_ZERO_CODEPOINT;
|
|
||||||
} else if (codepoint == PLUS_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
|
|
||||||
log_debug("plus\n");
|
|
||||||
codepoint = REPEAT_ONE_CODEPOINT;
|
|
||||||
} else if (codepoint == DOLLAR_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
|
|
||||||
log_debug("dollar\n");
|
|
||||||
codepoint = WORD_BOUNDARY_CODEPOINT;
|
|
||||||
|
|
||||||
if (in_set) {
|
|
||||||
uint32_array_push(char_set, codepoint);
|
|
||||||
add_to_index = false;
|
|
||||||
}
|
|
||||||
} else if (in_set) {
|
} else if (in_set) {
|
||||||
log_debug("in set\n");
|
log_debug("in set\n");
|
||||||
// Queue node, we'll add them to the trie
|
// Queue node, we'll add them to the trie
|
||||||
|
|||||||
Reference in New Issue
Block a user