From 51572d65757efce01de5a8700cf3c23dc65ba9da Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 10 Aug 2015 16:01:22 -0400 Subject: [PATCH] [phrases] Changing prefix/suffix chars so both are control characters and neither is the NUL-byte. Modifying transliteration special characters accordingly --- src/transliterate.h | 12 ++++++------ src/trie.c | 16 ++++++++++++---- src/trie.h | 19 ++++++++++++------- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/src/transliterate.h b/src/transliterate.h index e89f72c0..693d33cb 100644 --- a/src/transliterate.h +++ b/src/transliterate.h @@ -100,11 +100,11 @@ typedef struct transliteration_table { #define WORD_BOUNDARY_CHAR "\x01" #define WORD_BOUNDARY_CODEPOINT 1 #define WORD_BOUNDARY_CHAR_LEN strlen(WORD_BOUNDARY_CHAR) -#define PRE_CONTEXT_CHAR "\x02" -#define PRE_CONTEXT_CODEPOINT 2 +#define PRE_CONTEXT_CHAR "\x86" +#define PRE_CONTEXT_CODEPOINT 134 #define PRE_CONTEXT_CHAR_LEN strlen(PRE_CONTEXT_CHAR) -#define POST_CONTEXT_CHAR "\x03" -#define POST_CONTEXT_CODEPOINT 3 +#define POST_CONTEXT_CHAR "\x87" +#define POST_CONTEXT_CODEPOINT 135 #define POST_CONTEXT_CHAR_LEN strlen(POST_CONTEXT_CHAR) #define EMPTY_TRANSITION_CHAR "\x04" #define EMPTY_TRANSITION_CODEPOINT 4 @@ -112,8 +112,8 @@ typedef struct transliteration_table { #define REPEAT_CHAR "\x05" #define REPEAT_CODEPOINT 5 #define REPEAT_CHAR_LEN strlen(REPEAT_CHAR) -#define GROUP_INDICATOR_CHAR "\x06" -#define GROUP_INDICATOR_CODEPOINT 6 +#define GROUP_INDICATOR_CHAR "\x1d" +#define GROUP_INDICATOR_CODEPOINT 29 #define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR) #define BEGIN_SET_CHAR "\x0f" #define BEGIN_SET_CODEPOINT 15 diff --git a/src/trie.c b/src/trie.c index 3c89ebbc..e7bb2d81 100644 --- a/src/trie.c +++ b/src/trie.c @@ -617,6 +617,10 @@ void trie_print(trie_t *self) { } bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data) { + if (key[0] == TRIE_SUFFIX_CHAR[0] || key[0] == TRIE_PREFIX_CHAR[0]) { + return false; + } + unsigned char *ptr = (unsigned char *)key; uint32_t last_node_id = node_id; trie_node_t last_node = trie_get_node(self, node_id); @@ -673,10 +677,12 @@ bool trie_add_prefix_at_index(trie_t *self, char *key, uint32_t start_node_id, u trie_node_t start_node = trie_get_node(self, start_node_id); - uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_PREFIX_CHAR); + unsigned char prefix_char = TRIE_PREFIX_CHAR[0]; + + uint32_t node_id = trie_get_transition_index(self, start_node, prefix_char); trie_node_t node = trie_get_node(self, node_id); if (node.check != start_node_id) { - node_id = trie_add_transition(self, start_node_id, TRIE_PREFIX_CHAR); + node_id = trie_add_transition(self, start_node_id, prefix_char); } bool success = trie_add_at_index(self, node_id, key, len, data); @@ -694,10 +700,12 @@ bool trie_add_suffix_at_index(trie_t *self, char *key, uint32_t start_node_id, u trie_node_t start_node = trie_get_node(self, start_node_id); - uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_SUFFIX_CHAR); + unsigned char suffix_char = TRIE_SUFFIX_CHAR[0]; + + uint32_t node_id = trie_get_transition_index(self, start_node, suffix_char); trie_node_t node = trie_get_node(self, node_id); if (node.check != start_node_id) { - node_id = trie_add_transition(self, start_node_id, TRIE_SUFFIX_CHAR); + node_id = trie_add_transition(self, start_node_id, suffix_char); } char *suffix = utf8_reversed_string(key); diff --git a/src/trie.h b/src/trie.h index d48f7658..b9a3cd18 100644 --- a/src/trie.h +++ b/src/trie.h @@ -5,11 +5,16 @@ * so given an index into that array, we can treat the array as a C string * starting at that index. It also makes serialization dead simple. We * implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix -* is defined as the reversed UTF-8 suffix string prefixed by the NUL-byte. -* Since we do not allow zero-length strings, the transition from the root node -* to a NUL-byte always denotes a suffix (i.e. we should be iterating -* backward through the query string/token). For more information on double-array -* tries generally, see: http://linux.thai.net/~thep/datrie/datrie.html +* is defined as the reversed UTF-8 suffix string prefixed by TRIE_SUFFIX_CHAR. +* Similarly, a prefix is defined as being prefixed by TRIE_PREFIX_CHAR. +* trie_search defines several methods for searching strings, tokenized strings, +* prefixes and suffixes. Note that the single characters TRIE_SUFFIX_CHAR +* and TRIE_PREFIX_CHAR are not allowed as keys (both are defined as control +* characters, so are unlikely to affect natural language applications). +* This trie implementation also has several *_from_index methods which allow +* for effective namespacing e.g. adding the keys "en|blvd" and "fr|blvd" +* and searching by language. For more information on double-array tries +* generally, see: http://linux.thai.net/~thep/datrie/datrie.html ******************************************************************************/ #ifndef TRIE_H @@ -39,8 +44,8 @@ #define TRIE_INDEX_ERROR 0 #define TRIE_MAX_INDEX 0x7fffffff -#define TRIE_PREFIX_CHAR '\xff' -#define TRIE_SUFFIX_CHAR '\x00' +#define TRIE_PREFIX_CHAR "\x02" +#define TRIE_SUFFIX_CHAR "\x03" // Using 256 characters can fit all UTF-8 encoded strings #define NUM_CHARS 256