[tokenization/trie] Simplify the URL regex to reduce the scanner file size, account for a few more variations in word tokens, and make trie suffix search iterate backwards over the input instead of malloc'ing a reversed copy of the string
This commit is contained in:
389113
src/scanner.c
389113
src/scanner.c
File diff suppressed because it is too large
Load Diff
@@ -115,7 +115,7 @@ katakana = {katakana_chars}+;
|
|||||||
// WB13a and WB13b
|
// WB13a and WB13b
|
||||||
word_extend_num_letter = ({letter}|{numeric_chars}|{katakana}|{extend_num_letter_chars})+{extend_num_letter_chars}({letter}|{numeric_chars}|{katakana}|{extend_num_letter_chars})*;
|
word_extend_num_letter = ({letter}|{numeric_chars}|{katakana}|{extend_num_letter_chars})+{extend_num_letter_chars}({letter}|{numeric_chars}|{katakana}|{extend_num_letter_chars})*;
|
||||||
|
|
||||||
possible_word_char = {letter}|{mark_spacing_combining_chars}|{mark_enclosing_chars}|{mark_nonspacing_chars}|{punct_connector_chars}|{punct_dash_chars}|{currency_symbol_chars}|{symbol_modifier_chars}|{symbol_math_chars}|{symbol_other_chars};
|
possible_word_char = {letter}|{mark_spacing_combining_chars}|{mark_enclosing_chars}|{mark_nonspacing_chars}|{punct_connector_chars}|{punct_dash_chars}|{currency_symbol_chars}|{symbol_modifier_chars}|{symbol_math_chars}|{symbol_other_chars}|{digit};
|
||||||
//possible_word_char = [^\+\(\)\[\]}{\/\\\,:;!"&\? 0-9\t\u00A0\u2000-\u200A\u3000\r\n\u2028\u2029\u000B\u000C\u0085\u0096\u0097\u2013\u2014\u2015\u0000];
|
//possible_word_char = [^\+\(\)\[\]}{\/\\\,:;!"&\? 0-9\t\u00A0\u2000-\u200A\u3000\r\n\u2028\u2029\u000B\u000C\u0085\u0096\u0097\u2013\u2014\u2015\u0000];
|
||||||
any_word = ({possible_word_char}*{letter}+{possible_word_char}*);
|
any_word = ({possible_word_char}*{letter}+{possible_word_char}*);
|
||||||
|
|
||||||
@@ -136,12 +136,7 @@ abbreviation = ({word})"\.";
|
|||||||
us_phone_number = ("\+"?"1"[\-\. ]?)?"\("?([2-9][0-8][0-9])"\)"?[\-\. ]?([2-9][0-9]{2})[\-\. ]?([0-9]{4});
|
us_phone_number = ("\+"?"1"[\-\. ]?)?"\("?([2-9][0-8][0-9])"\)"?[\-\. ]?([2-9][0-9]{2})[\-\. ]?([0-9]{4});
|
||||||
international_phone_number = "\+"("9"[976][0-9]|"8"[987530][0-9]|"6"[987][0-9]|"5"[90][0-9]|"420-9"|"3"[875][0-9]|"2"[98654321][0-9]|"9"[8543210]|"8"[6421]|"6"[6543210]|"5"[87654321]|"4"[987654310]|"3"[9643210]|"2"[70]|"7"|"1"){space}*(([()\.\-/ ]{0,1}[0-9]){9}[0-9]{1,2});
|
international_phone_number = "\+"("9"[976][0-9]|"8"[987530][0-9]|"6"[987][0-9]|"5"[90][0-9]|"420-9"|"3"[875][0-9]|"2"[98654321][0-9]|"9"[8543210]|"8"[6421]|"6"[6543210]|"5"[87654321]|"4"[987654310]|"3"[9643210]|"2"[70]|"7"|"1"){space}*(([()\.\-/ ]{0,1}[0-9]){9}[0-9]{1,2});
|
||||||
|
|
||||||
// Paste from a list of top-level domains
|
url = ('http''s'?":"("/"{1,3}|[A-Za-z0-9%]))([^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f<>{}\[\]]+);
|
||||||
|
|
||||||
tlds = ('com'|'net'|'org'|'edu'|'gov'|'mil'|'aero'|'asia'|'biz'|'cat'|'coop'|'info'|'int'|'jobs'|'mobi'|'museum'|'name'|'post'|'pro'|'tel'|'travel'|'xxx'|'ac'|'ad'|'ae'|'af'|'ag'|'ai'|'al'|'am'|'an'|'ao'|'aq'|'ar'|'as'|'at'|'au'|'aw'|'ax'|'az'|'ba'|'bb'|'bd'|'be'|'bf'|'bg'|'bh'|'bi'|'bj'|'bm'|'bn'|'bo'|'br'|'bs'|'bt'|'bv'|'bw'|'by'|'bz'|'ca'|'cc'|'cd'|'cf'|'cg'|'ch'|'ci'|'ck'|'cl'|'cm'|'cn'|'co'|'cr'|'cs'|'cu'|'cv'|'cx'|'cy'|'cz'|'dd'|'de'|'dj'|'dk'|'dm'|'do'|'dz'|'ec'|'ee'|'eg'|'eh'|'er'|'es'|'et'|'eu'|'fi'|'fj'|'fk'|'fm'|'fo'|'fr'|'ga'|'gb'|'gd'|'ge'|'gf'|'gg'|'gh'|'gi'|'gl'|'gm'|'gn'|'gp'|'gq'|'gr'|'gs'|'gt'|'gu'|'gw'|'gy'|'hk'|'hm'|'hn'|'hr'|'ht'|'hu'|'id'|'ie'|'il'|'im'|'in'|'io'|'iq'|'ir'|'is'|'it'|'je'|'jm'|'jo'|'jp'|'ke'|'kg'|'kh'|'ki'|'km'|'kn'|'kp'|'kr'|'kw'|'ky'|'kz'|'la'|'lb'|'lc'|'li'|'lk'|'lr'|'ls'|'lt'|'lu'|'lv'|'ly'|'ma'|'mc'|'md'|'me'|'mg'|'mh'|'mk'|'ml'|'mm'|'mn'|'mo'|'mp'|'mq'|'mr'|'ms'|'mt'|'mu'|'mv'|'mw'|'mx'|'my'|'mz'|'na'|'nc'|'ne'|'nf'|'ng'|'ni'|'nl'|'no'|'np'|'nr'|'nu'|'nz'|'om'|'pa'|'pe'|'pf'|'pg'|'ph'|'pk'|'pl'|'pm'|'pn'|'pr'|'ps'|'pt'|'pw'|'py'|'qa'|'re'|'ro'|'rs'|'ru'|'rw'|'sa'|'sb'|'sc'|'sd'|'se'|'sg'|'sh'|'si'|'sj'|'Ja'|'sk'|'sl'|'sm'|'sn'|'so'|'sr'|'ss'|'st'|'su'|'sv'|'sx'|'sy'|'sz'|'tc'|'td'|'tf'|'tg'|'th'|'tj'|'tk'|'tl'|'tm'|'tn'|'to'|'tp'|'tr'|'tt'|'tv'|'tw'|'tz'|'ua'|'ug'|'uk'|'us'|'uy'|'uz'|'va'|'vc'|'ve'|'vg'|'vi'|'vn'|'vu'|'wf'|'ws'|'ye'|'yt'|'yu'|'za'|'zm'|'zw');
|
|
||||||
|
|
||||||
// Gruber's liberal url regex: https://gist.github.com/gruber/8891611
|
|
||||||
url = (('http''s'?":"("/"{1,3}|[A-Za-z0-9%])|[A-Za-z0-9.\-]+[.]{tlds}"/")([^\u0000() \t\u00A0\u2000-\u200A\u3000\r\n\f<>{}\[\]]+|"\("[^\u0000() \t\u00A0\u2000-\u200A\u3000\r\n\f]*?"\("[^\u0000() \t\u00A0\u2000-\u200A\u3000\r\n\f]+"\)"[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f()]*?"\)"|"\("[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f]+?"\)")+("\("[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f()]*?"\("[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f()]+"\)"[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f()]*?"\)"|"\("[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f]+?"\)"|[^\u0000 \t\u00A0\u2000-\u200A\u3000\r\n\f`!()\[\]{};:\'\"\.,<>?«»“”‘’])|([A-Za-z0-9]+([.\-][A-Za-z0-9]+)*[.]{tlds}"/"?));
|
|
||||||
|
|
||||||
email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3});
|
email = ([a-zA-Z0-9\._%+\-]+"@"([a-zA-Z0-9]+[\.])+[a-zA-Z0-9]{2,3});
|
||||||
|
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ phrase_array *trie_search(trie_t *self, char *text) {
|
|||||||
|
|
||||||
// If we're in the middle of a word and the first letter was not a match, skip the word
|
// If we're in the middle of a word and the first letter was not a match, skip the word
|
||||||
if (is_letter && state == SEARCH_STATE_NO_MATCH) {
|
if (is_letter && state == SEARCH_STATE_NO_MATCH) {
|
||||||
log_debug("skipping\n");
|
log_debug("skipping\n", NULL);
|
||||||
ptr += len;
|
ptr += len;
|
||||||
index += len;
|
index += len;
|
||||||
last_state = state;
|
last_state = state;
|
||||||
@@ -51,7 +51,7 @@ phrase_array *trie_search(trie_t *self, char *text) {
|
|||||||
|
|
||||||
// Match in the middle of a word
|
// Match in the middle of a word
|
||||||
if (is_letter && last_state == SEARCH_STATE_MATCH) {
|
if (is_letter && last_state == SEARCH_STATE_MATCH) {
|
||||||
log_debug("last_state == SEARCH_STATE_MATCH && is_letter\n");
|
log_debug("last_state == SEARCH_STATE_MATCH && is_letter\n", NULL);
|
||||||
// Only set match to false so we don't callback
|
// Only set match to false so we don't callback
|
||||||
match = false;
|
match = false;
|
||||||
}
|
}
|
||||||
@@ -66,7 +66,7 @@ phrase_array *trie_search(trie_t *self, char *text) {
|
|||||||
if (node.check != node_id) {
|
if (node.check != node_id) {
|
||||||
state = is_letter ? SEARCH_STATE_NO_MATCH : SEARCH_STATE_BEGIN;
|
state = is_letter ? SEARCH_STATE_NO_MATCH : SEARCH_STATE_BEGIN;
|
||||||
if (match) {
|
if (match) {
|
||||||
log_debug("match is true and state==SEARCH_STATE_NO_MATCH\n");
|
log_debug("match is true and state==SEARCH_STATE_NO_MATCH\n", NULL);
|
||||||
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data});
|
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data});
|
||||||
index = phrase_start + phrase_len;
|
index = phrase_start + phrase_len;
|
||||||
advance_index = false;
|
advance_index = false;
|
||||||
@@ -84,7 +84,7 @@ phrase_array *trie_search(trie_t *self, char *text) {
|
|||||||
match = false;
|
match = false;
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
log_debug("node.check == node_id\n");
|
log_debug("node.check == node_id\n", NULL);
|
||||||
state = SEARCH_STATE_PARTIAL_MATCH;
|
state = SEARCH_STATE_PARTIAL_MATCH;
|
||||||
if (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN) {
|
if (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN) {
|
||||||
log_debug("phrase_start=%llu\n", index);
|
log_debug("phrase_start=%llu\n", index);
|
||||||
@@ -105,7 +105,7 @@ phrase_array *trie_search(trie_t *self, char *text) {
|
|||||||
|
|
||||||
if (tail_len <= query_tail_len && strncmp((char *)current_tail, query_tail, tail_len) == 0) {
|
if (tail_len <= query_tail_len && strncmp((char *)current_tail, query_tail, tail_len) == 0) {
|
||||||
state = SEARCH_STATE_MATCH;
|
state = SEARCH_STATE_MATCH;
|
||||||
log_debug("Tail matches\n");
|
log_debug("Tail matches\n", NULL);
|
||||||
last_state = state;
|
last_state = state;
|
||||||
data = data_node.data;
|
data = data_node.data;
|
||||||
log_debug("%llu, %d, %zu\n", index, phrase_len, tail_len);
|
log_debug("%llu, %d, %zu\n", index, phrase_len, tail_len);
|
||||||
@@ -115,7 +115,7 @@ phrase_array *trie_search(trie_t *self, char *text) {
|
|||||||
phrase_len = index + len - phrase_start;
|
phrase_len = index + len - phrase_start;
|
||||||
match = true;
|
match = true;
|
||||||
} else if (match) {
|
} else if (match) {
|
||||||
log_debug("match is true and longer phrase tail did not match\n");
|
log_debug("match is true and longer phrase tail did not match\n", NULL);
|
||||||
log_debug("phrase_start=%d, phrase_len=%d\n", phrase_start, phrase_len);
|
log_debug("phrase_start=%d, phrase_len=%d\n", phrase_start, phrase_len);
|
||||||
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data});
|
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data});
|
||||||
ptr = fail_ptr;
|
ptr = fail_ptr;
|
||||||
@@ -129,7 +129,7 @@ phrase_array *trie_search(trie_t *self, char *text) {
|
|||||||
if (ch != '\0') {
|
if (ch != '\0') {
|
||||||
trie_node_t terminal_node = trie_get_transition(self, node, '\0');
|
trie_node_t terminal_node = trie_get_transition(self, node, '\0');
|
||||||
if (terminal_node.check == next_id) {
|
if (terminal_node.check == next_id) {
|
||||||
log_debug("Transition to NUL byte matched\n");
|
log_debug("Transition to NUL byte matched\n", NULL);
|
||||||
state = SEARCH_STATE_MATCH;
|
state = SEARCH_STATE_MATCH;
|
||||||
match = true;
|
match = true;
|
||||||
phrase_len = index + len - phrase_start;
|
phrase_len = index + len - phrase_start;
|
||||||
@@ -148,7 +148,7 @@ phrase_array *trie_search(trie_t *self, char *text) {
|
|||||||
|
|
||||||
if (unich == 0) {
|
if (unich == 0) {
|
||||||
if (last_state == SEARCH_STATE_MATCH) {
|
if (last_state == SEARCH_STATE_MATCH) {
|
||||||
log_debug("Found match at the end\n");
|
log_debug("Found match at the end\n", NULL);
|
||||||
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data});
|
phrase_array_push(phrases, (phrase_t){phrase_start, phrase_len, data});
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -178,7 +178,7 @@ int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, tokenized_strin
|
|||||||
int token_len = token.len;
|
int token_len = token.len;
|
||||||
|
|
||||||
if (!(*tail_ptr)) {
|
if (!(*tail_ptr)) {
|
||||||
log_debug("tail matches!\n");
|
log_debug("tail matches!\n", NULL);
|
||||||
return i-1;
|
return i-1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,12 +212,10 @@ phrase_array *trie_search_tokens(trie_t *self, tokenized_string_t *response) {
|
|||||||
|
|
||||||
int phrase_len = 0, phrase_start = 0, last_match_index = -1;
|
int phrase_len = 0, phrase_start = 0, last_match_index = -1;
|
||||||
|
|
||||||
bool advance_index = true;
|
|
||||||
|
|
||||||
trie_search_state_t state = SEARCH_STATE_BEGIN, last_state = SEARCH_STATE_BEGIN;
|
trie_search_state_t state = SEARCH_STATE_BEGIN, last_state = SEARCH_STATE_BEGIN;
|
||||||
|
|
||||||
log_debug("num_tokens: %zu\n", tokens->n);
|
log_debug("num_tokens: %zu\n", tokens->n);
|
||||||
for (int i = 0; i < tokens->n; advance_index && i++, advance_index = true, last_state = state) {
|
for (int i = 0; i < tokens->n; i++, last_state = state) {
|
||||||
char *ptr = tokenized_string_get_token(response, i);
|
char *ptr = tokenized_string_get_token(response, i);
|
||||||
log_debug("On %d, token=%s\n", i, ptr);
|
log_debug("On %d, token=%s\n", i, ptr);
|
||||||
|
|
||||||
@@ -252,7 +250,7 @@ phrase_array *trie_search_tokens(trie_t *self, tokenized_string_t *response) {
|
|||||||
phrase_start = i;
|
phrase_start = i;
|
||||||
}
|
}
|
||||||
if (strncmp((char *)current_tail, ptr + 1, ptr_len) == 0) {
|
if (strncmp((char *)current_tail, ptr + 1, ptr_len) == 0) {
|
||||||
log_debug("node tail matches first token\n");
|
log_debug("node tail matches first token\n", NULL);
|
||||||
int tail_search_result = trie_node_search_tail_tokens(self, node, response, ptr_len, i+1);
|
int tail_search_result = trie_node_search_tail_tokens(self, node, response, ptr_len, i+1);
|
||||||
if (tail_search_result == -1) {
|
if (tail_search_result == -1) {
|
||||||
node = trie_get_root(self);
|
node = trie_get_root(self);
|
||||||
@@ -284,7 +282,7 @@ phrase_array *trie_search_tokens(trie_t *self, tokenized_string_t *response) {
|
|||||||
phrase_start = 0;
|
phrase_start = 0;
|
||||||
continue;
|
continue;
|
||||||
} else if (last_state == SEARCH_STATE_PARTIAL_MATCH) {
|
} else if (last_state == SEARCH_STATE_PARTIAL_MATCH) {
|
||||||
log_debug("last_state == SEARCH_STATE_PARTIAL_MATCH\n");
|
log_debug("last_state == SEARCH_STATE_PARTIAL_MATCH\n", NULL);
|
||||||
i = phrase_start;
|
i = phrase_start;
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
@@ -365,38 +363,81 @@ phrase_t trie_search_suffixes(trie_t *self, char *word) {
|
|||||||
|
|
||||||
uint32_t value = 0, phrase_start = 0, phrase_len = 0;
|
uint32_t value = 0, phrase_start = 0, phrase_len = 0;
|
||||||
|
|
||||||
char *reversed = utf8_reversed_string(word);
|
ssize_t len;
|
||||||
char *ptr = reversed;
|
|
||||||
|
|
||||||
for (; *ptr; ptr++, phrase_len++, last_node = node, last_node_id = node_id) {
|
int32_t unich = 0;
|
||||||
log_debug("Getting transition index for %d, (%d, %d)\n", node_id, node.base, node.check);
|
|
||||||
node_id = trie_get_transition_index(self, node, *ptr);
|
const uint8_t *start = (const uint8_t *)word;
|
||||||
|
const uint8_t *ptr = (const uint8_t *)word + strlen(word);
|
||||||
|
const uint8_t *char_ptr;
|
||||||
|
|
||||||
|
bool done = false;
|
||||||
|
bool in_tail = false;
|
||||||
|
unsigned char *current_tail = {0};
|
||||||
|
size_t tail_remaining = 0;
|
||||||
|
|
||||||
|
uint32_t tail_value = 0;
|
||||||
|
|
||||||
|
while(1) {
|
||||||
|
len = utf8proc_iterate_reversed(ptr, start, &unich);
|
||||||
|
|
||||||
|
if (len <= 0) break;
|
||||||
|
if (!(utf8proc_codepoint_valid(unich))) break;
|
||||||
|
|
||||||
|
ptr -= len;
|
||||||
|
char_ptr = ptr;
|
||||||
|
|
||||||
|
for (int i=0; i < len; i++, char_ptr++, last_node = node, last_node_id = node_id) {
|
||||||
|
log_debug("char=%c\n", (unsigned char)*char_ptr);
|
||||||
|
|
||||||
|
if (in_tail && *current_tail && *current_tail == *char_ptr) {
|
||||||
|
tail_remaining--;
|
||||||
|
current_tail++;
|
||||||
|
if (i == len - 1) {
|
||||||
|
phrase_len += len;
|
||||||
|
phrase_start = ptr - start;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else if (in_tail && tail_remaining == 0 && i == len - 1) {
|
||||||
|
log_debug("tail match!\n", NULL);
|
||||||
|
phrase_start = ptr - start;
|
||||||
|
phrase_len = strlen((char *)ptr);
|
||||||
|
value = tail_value;
|
||||||
|
done = true;
|
||||||
|
break;
|
||||||
|
} else if (in_tail) {
|
||||||
|
done = true;
|
||||||
|
log_debug("Done with tail\n", NULL);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
node_id = trie_get_transition_index(self, node, *char_ptr);
|
||||||
node = trie_get_node(self, node_id);
|
node = trie_get_node(self, node_id);
|
||||||
log_debug("Doing %c, got node_id=%d\n", *ptr, node_id);
|
|
||||||
if (node.check != last_node_id) {
|
if (node.check != last_node_id) {
|
||||||
log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);
|
log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);
|
||||||
|
done = true;
|
||||||
break;
|
break;
|
||||||
} else if (node.base < 0) {
|
} else if (node.base < 0) {
|
||||||
log_debug("Searching tail\n");
|
log_debug("Searching tail\n", NULL);
|
||||||
|
|
||||||
uint32_t data_index = -1*node.base;
|
uint32_t data_index = -1*node.base;
|
||||||
trie_data_node_t data_node = self->data->a[data_index];
|
trie_data_node_t data_node = self->data->a[data_index];
|
||||||
uint32_t current_tail_pos = data_node.tail;
|
uint32_t current_tail_pos = data_node.tail;
|
||||||
|
|
||||||
unsigned char *current_tail = self->tail->a + current_tail_pos;
|
tail_value = data_node.data;
|
||||||
|
|
||||||
log_debug("comparing tail: %s vs %s\n", current_tail, ptr + 1);
|
current_tail = self->tail->a + current_tail_pos;
|
||||||
size_t current_tail_len = strlen((char *)current_tail);
|
|
||||||
if (strncmp((char *)current_tail, ptr + 1, current_tail_len) == 0) {
|
tail_remaining = strlen((char *)current_tail);
|
||||||
phrase_len += current_tail_len + 1;
|
in_tail = true;
|
||||||
log_debug("tail match!\n");
|
|
||||||
value = data_node.data;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (done) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (phrase_len > 0) {
|
||||||
trie_node_t terminal_node = trie_get_transition(self, node, '\0');
|
trie_node_t terminal_node = trie_get_transition(self, node, '\0');
|
||||||
if (terminal_node.check == node_id) {
|
if (terminal_node.check == node_id) {
|
||||||
int32_t data_index = -1*terminal_node.base;
|
int32_t data_index = -1*terminal_node.base;
|
||||||
@@ -404,8 +445,7 @@ phrase_t trie_search_suffixes(trie_t *self, char *word) {
|
|||||||
value = data_node.data;
|
value = data_node.data;
|
||||||
log_debug("value = %d\n", value);
|
log_debug("value = %d\n", value);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
free(reversed);
|
|
||||||
|
|
||||||
return (phrase_t) {phrase_start, phrase_len, value};
|
return (phrase_t) {phrase_start, phrase_len, value};
|
||||||
}
|
}
|
||||||
@@ -427,7 +467,7 @@ phrase_t trie_search_prefixes(trie_t *self, char *word) {
|
|||||||
log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);
|
log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);
|
||||||
break;
|
break;
|
||||||
} else if (node.base < 0) {
|
} else if (node.base < 0) {
|
||||||
log_debug("Searching tail\n");
|
log_debug("Searching tail\n", NULL);
|
||||||
|
|
||||||
uint32_t data_index = -1*node.base;
|
uint32_t data_index = -1*node.base;
|
||||||
trie_data_node_t data_node = self->data->a[data_index];
|
trie_data_node_t data_node = self->data->a[data_index];
|
||||||
@@ -439,7 +479,7 @@ phrase_t trie_search_prefixes(trie_t *self, char *word) {
|
|||||||
size_t current_tail_len = strlen((char *)current_tail);
|
size_t current_tail_len = strlen((char *)current_tail);
|
||||||
if (strncmp((char *)current_tail, ptr + 1, current_tail_len) == 0) {
|
if (strncmp((char *)current_tail, ptr + 1, current_tail_len) == 0) {
|
||||||
phrase_len += current_tail_len + 1;
|
phrase_len += current_tail_len + 1;
|
||||||
log_debug("tail match!\n");
|
log_debug("tail match!\n", NULL);
|
||||||
value = data_node.data;
|
value = data_node.data;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ extern "C" {
|
|||||||
#include "collections.h"
|
#include "collections.h"
|
||||||
#include "klib/kvec.h"
|
#include "klib/kvec.h"
|
||||||
#include "log/log.h"
|
#include "log/log.h"
|
||||||
|
#include "string_utils.h"
|
||||||
#include "tokens.h"
|
#include "tokens.h"
|
||||||
#include "vector.h"
|
#include "vector.h"
|
||||||
#include "utf8proc/utf8proc.h"
|
#include "utf8proc/utf8proc.h"
|
||||||
|
|||||||
Reference in New Issue
Block a user