[phrases] Changing prefix/suffix chars so both are control characters and neither is the NUL-byte. Modifying transliteration special characters accordingly.
This commit is contained in:
@@ -100,11 +100,11 @@ typedef struct transliteration_table {
|
|||||||
#define WORD_BOUNDARY_CHAR "\x01"
|
#define WORD_BOUNDARY_CHAR "\x01"
|
||||||
#define WORD_BOUNDARY_CODEPOINT 1
|
#define WORD_BOUNDARY_CODEPOINT 1
|
||||||
#define WORD_BOUNDARY_CHAR_LEN strlen(WORD_BOUNDARY_CHAR)
|
#define WORD_BOUNDARY_CHAR_LEN strlen(WORD_BOUNDARY_CHAR)
|
||||||
#define PRE_CONTEXT_CHAR "\x02"
|
#define PRE_CONTEXT_CHAR "\x86"
|
||||||
#define PRE_CONTEXT_CODEPOINT 2
|
#define PRE_CONTEXT_CODEPOINT 134
|
||||||
#define PRE_CONTEXT_CHAR_LEN strlen(PRE_CONTEXT_CHAR)
|
#define PRE_CONTEXT_CHAR_LEN strlen(PRE_CONTEXT_CHAR)
|
||||||
#define POST_CONTEXT_CHAR "\x03"
|
#define POST_CONTEXT_CHAR "\x87"
|
||||||
#define POST_CONTEXT_CODEPOINT 3
|
#define POST_CONTEXT_CODEPOINT 135
|
||||||
#define POST_CONTEXT_CHAR_LEN strlen(POST_CONTEXT_CHAR)
|
#define POST_CONTEXT_CHAR_LEN strlen(POST_CONTEXT_CHAR)
|
||||||
#define EMPTY_TRANSITION_CHAR "\x04"
|
#define EMPTY_TRANSITION_CHAR "\x04"
|
||||||
#define EMPTY_TRANSITION_CODEPOINT 4
|
#define EMPTY_TRANSITION_CODEPOINT 4
|
||||||
@@ -112,8 +112,8 @@ typedef struct transliteration_table {
|
|||||||
#define REPEAT_CHAR "\x05"
|
#define REPEAT_CHAR "\x05"
|
||||||
#define REPEAT_CODEPOINT 5
|
#define REPEAT_CODEPOINT 5
|
||||||
#define REPEAT_CHAR_LEN strlen(REPEAT_CHAR)
|
#define REPEAT_CHAR_LEN strlen(REPEAT_CHAR)
|
||||||
#define GROUP_INDICATOR_CHAR "\x06"
|
#define GROUP_INDICATOR_CHAR "\x1d"
|
||||||
#define GROUP_INDICATOR_CODEPOINT 6
|
#define GROUP_INDICATOR_CODEPOINT 29
|
||||||
#define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
|
#define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
|
||||||
#define BEGIN_SET_CHAR "\x0f"
|
#define BEGIN_SET_CHAR "\x0f"
|
||||||
#define BEGIN_SET_CODEPOINT 15
|
#define BEGIN_SET_CODEPOINT 15
|
||||||
|
|||||||
16
src/trie.c
16
src/trie.c
@@ -617,6 +617,10 @@ void trie_print(trie_t *self) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data) {
|
bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data) {
|
||||||
|
if (key[0] == TRIE_SUFFIX_CHAR[0] || key[0] == TRIE_PREFIX_CHAR[0]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
unsigned char *ptr = (unsigned char *)key;
|
unsigned char *ptr = (unsigned char *)key;
|
||||||
uint32_t last_node_id = node_id;
|
uint32_t last_node_id = node_id;
|
||||||
trie_node_t last_node = trie_get_node(self, node_id);
|
trie_node_t last_node = trie_get_node(self, node_id);
|
||||||
@@ -673,10 +677,12 @@ bool trie_add_prefix_at_index(trie_t *self, char *key, uint32_t start_node_id, u
|
|||||||
|
|
||||||
trie_node_t start_node = trie_get_node(self, start_node_id);
|
trie_node_t start_node = trie_get_node(self, start_node_id);
|
||||||
|
|
||||||
uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_PREFIX_CHAR);
|
unsigned char prefix_char = TRIE_PREFIX_CHAR[0];
|
||||||
|
|
||||||
|
uint32_t node_id = trie_get_transition_index(self, start_node, prefix_char);
|
||||||
trie_node_t node = trie_get_node(self, node_id);
|
trie_node_t node = trie_get_node(self, node_id);
|
||||||
if (node.check != start_node_id) {
|
if (node.check != start_node_id) {
|
||||||
node_id = trie_add_transition(self, start_node_id, TRIE_PREFIX_CHAR);
|
node_id = trie_add_transition(self, start_node_id, prefix_char);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool success = trie_add_at_index(self, node_id, key, len, data);
|
bool success = trie_add_at_index(self, node_id, key, len, data);
|
||||||
@@ -694,10 +700,12 @@ bool trie_add_suffix_at_index(trie_t *self, char *key, uint32_t start_node_id, u
|
|||||||
|
|
||||||
trie_node_t start_node = trie_get_node(self, start_node_id);
|
trie_node_t start_node = trie_get_node(self, start_node_id);
|
||||||
|
|
||||||
uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_SUFFIX_CHAR);
|
unsigned char suffix_char = TRIE_SUFFIX_CHAR[0];
|
||||||
|
|
||||||
|
uint32_t node_id = trie_get_transition_index(self, start_node, suffix_char);
|
||||||
trie_node_t node = trie_get_node(self, node_id);
|
trie_node_t node = trie_get_node(self, node_id);
|
||||||
if (node.check != start_node_id) {
|
if (node.check != start_node_id) {
|
||||||
node_id = trie_add_transition(self, start_node_id, TRIE_SUFFIX_CHAR);
|
node_id = trie_add_transition(self, start_node_id, suffix_char);
|
||||||
}
|
}
|
||||||
|
|
||||||
char *suffix = utf8_reversed_string(key);
|
char *suffix = utf8_reversed_string(key);
|
||||||
|
|||||||
19
src/trie.h
19
src/trie.h
@@ -5,11 +5,16 @@
|
|||||||
* so given an index into that array, we can treat the array as a C string
|
* so given an index into that array, we can treat the array as a C string
|
||||||
* starting at that index. It also makes serialization dead simple. We
|
* starting at that index. It also makes serialization dead simple. We
|
||||||
* implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix
|
* implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix
|
||||||
* is defined as the reversed UTF-8 suffix string prefixed by the NUL-byte.
|
* is defined as the reversed UTF-8 suffix string prefixed by TRIE_SUFFIX_CHAR.
|
||||||
* Since we do not allow zero-length strings, the transition from the root node
|
* Similarly, a prefix is defined as being prefixed by TRIE_PREFIX_CHAR.
|
||||||
* to a NUL-byte always denotes a suffix (i.e. we should be iterating
|
* trie_search defines several methods for searching strings, tokenized strings,
|
||||||
* backward through the query string/token). For more information on double-array
|
* prefixes and suffixes. Note that the single characters TRIE_SUFFIX_CHAR
|
||||||
* tries generally, see: http://linux.thai.net/~thep/datrie/datrie.html
|
* and TRIE_PREFIX_CHAR may not begin a key (both are defined as control
|
||||||
|
* characters, so are unlikely to affect natural language applications).
|
||||||
|
* This trie implementation also has several *_from_index methods which allow
|
||||||
|
* for effective namespacing e.g. adding the keys "en|blvd" and "fr|blvd"
|
||||||
|
* and searching by language. For more information on double-array tries
|
||||||
|
* generally, see: http://linux.thai.net/~thep/datrie/datrie.html
|
||||||
******************************************************************************/
|
******************************************************************************/
|
||||||
|
|
||||||
#ifndef TRIE_H
|
#ifndef TRIE_H
|
||||||
@@ -39,8 +44,8 @@
|
|||||||
#define TRIE_INDEX_ERROR 0
|
#define TRIE_INDEX_ERROR 0
|
||||||
#define TRIE_MAX_INDEX 0x7fffffff
|
#define TRIE_MAX_INDEX 0x7fffffff
|
||||||
|
|
||||||
#define TRIE_PREFIX_CHAR '\xff'
|
#define TRIE_PREFIX_CHAR "\x02"
|
||||||
#define TRIE_SUFFIX_CHAR '\x00'
|
#define TRIE_SUFFIX_CHAR "\x03"
|
||||||
|
|
||||||
// Using 256 characters can fit all UTF-8 encoded strings
|
// Using 256 characters can fit all UTF-8 encoded strings
|
||||||
#define NUM_CHARS 256
|
#define NUM_CHARS 256
|
||||||
|
|||||||
Reference in New Issue
Block a user