From 51572d65757efce01de5a8700cf3c23dc65ba9da Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Mon, 10 Aug 2015 16:01:22 -0400
Subject: [PATCH] [phrases] Changing prefix/suffix chars so both are control
 characters and neither is the NUL-byte. Modifying transliteration special
 characters accordingly

---
 src/transliterate.h | 12 ++++++------
 src/trie.c          | 16 ++++++++++++----
 src/trie.h          | 19 ++++++++++++-------
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/src/transliterate.h b/src/transliterate.h
index e89f72c0..693d33cb 100644
--- a/src/transliterate.h
+++ b/src/transliterate.h
@@ -100,11 +100,11 @@ typedef struct transliteration_table {
 #define WORD_BOUNDARY_CHAR "\x01"
 #define WORD_BOUNDARY_CODEPOINT 1
 #define WORD_BOUNDARY_CHAR_LEN strlen(WORD_BOUNDARY_CHAR)
-#define PRE_CONTEXT_CHAR "\x02"
-#define PRE_CONTEXT_CODEPOINT 2
+#define PRE_CONTEXT_CHAR "\x86"
+#define PRE_CONTEXT_CODEPOINT 134
 #define PRE_CONTEXT_CHAR_LEN strlen(PRE_CONTEXT_CHAR)
-#define POST_CONTEXT_CHAR "\x03"
-#define POST_CONTEXT_CODEPOINT 3
+#define POST_CONTEXT_CHAR "\x87"
+#define POST_CONTEXT_CODEPOINT 135
 #define POST_CONTEXT_CHAR_LEN strlen(POST_CONTEXT_CHAR)
 #define EMPTY_TRANSITION_CHAR "\x04"
 #define EMPTY_TRANSITION_CODEPOINT 4
@@ -112,8 +112,8 @@ typedef struct transliteration_table {
 #define REPEAT_CHAR "\x05"
 #define REPEAT_CODEPOINT 5
 #define REPEAT_CHAR_LEN strlen(REPEAT_CHAR)
-#define GROUP_INDICATOR_CHAR "\x06"
-#define GROUP_INDICATOR_CODEPOINT 6
+#define GROUP_INDICATOR_CHAR "\x1d"
+#define GROUP_INDICATOR_CODEPOINT 29
 #define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
 #define BEGIN_SET_CHAR "\x0f"
 #define BEGIN_SET_CODEPOINT 15
diff --git a/src/trie.c b/src/trie.c
index 3c89ebbc..e7bb2d81 100644
--- a/src/trie.c
+++ b/src/trie.c
@@ -617,6 +617,10 @@ void trie_print(trie_t *self) {
 }
 
 bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data) {
+    if (key[0] == TRIE_SUFFIX_CHAR[0] || key[0] == TRIE_PREFIX_CHAR[0]) {
+        return false;
+    }
+
     unsigned char *ptr = (unsigned char *)key; 
     uint32_t last_node_id = node_id;
     trie_node_t last_node = trie_get_node(self, node_id);
@@ -673,10 +677,12 @@ bool trie_add_prefix_at_index(trie_t *self, char *key, uint32_t start_node_id, u
 
     trie_node_t start_node = trie_get_node(self, start_node_id);
 
-    uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_PREFIX_CHAR);
+    unsigned char prefix_char = TRIE_PREFIX_CHAR[0];
+
+    uint32_t node_id = trie_get_transition_index(self, start_node, prefix_char);
     trie_node_t node = trie_get_node(self, node_id);
     if (node.check != start_node_id) {
-        node_id = trie_add_transition(self, start_node_id, TRIE_PREFIX_CHAR);
+        node_id = trie_add_transition(self, start_node_id, prefix_char);
     }
 
     bool success = trie_add_at_index(self, node_id, key, len, data);
@@ -694,10 +700,12 @@ bool trie_add_suffix_at_index(trie_t *self, char *key, uint32_t start_node_id, u
 
     trie_node_t start_node = trie_get_node(self, start_node_id);
 
-    uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_SUFFIX_CHAR);
+    unsigned char suffix_char = TRIE_SUFFIX_CHAR[0];
+
+    uint32_t node_id = trie_get_transition_index(self, start_node, suffix_char);
     trie_node_t node = trie_get_node(self, node_id);
     if (node.check != start_node_id) {
-        node_id = trie_add_transition(self, start_node_id, TRIE_SUFFIX_CHAR);
+        node_id = trie_add_transition(self, start_node_id, suffix_char);
     }
 
     char *suffix = utf8_reversed_string(key);
diff --git a/src/trie.h b/src/trie.h
index d48f7658..b9a3cd18 100644
--- a/src/trie.h
+++ b/src/trie.h
@@ -5,11 +5,16 @@
 * so given an index into that array, we can treat the array as a C string
 * starting at that index. It also makes serialization dead simple. We
 * implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix
-* is defined as the reversed UTF-8 suffix string prefixed by the NUL-byte. 
-* Since we do not allow zero-length strings, the transition from the root node
-* to a NUL-byte always denotes a suffix (i.e. we should be iterating 
-* backward through the query string/token). For more information on double-array
-* tries generally, see: http://linux.thai.net/~thep/datrie/datrie.html
+* is defined as the reversed UTF-8 suffix string prefixed by TRIE_SUFFIX_CHAR.
+* Similarly, a prefix is defined as being prefixed by TRIE_PREFIX_CHAR. 
+* trie_search defines several methods for searching strings, tokenized strings,
+* prefixes and suffixes. Note that keys beginning with TRIE_SUFFIX_CHAR
+* or TRIE_PREFIX_CHAR are rejected on insertion (both are defined as control
+* characters, so this is unlikely to affect natural language applications).
+* This trie implementation also has several *_from_index methods which allow 
+* for effective namespacing e.g. adding the keys "en|blvd" and "fr|blvd"
+* and searching by language. For more information on double-array tries
+* generally, see: http://linux.thai.net/~thep/datrie/datrie.html
 ******************************************************************************/
 
 #ifndef TRIE_H
@@ -39,8 +44,8 @@
 #define TRIE_INDEX_ERROR  0
 #define TRIE_MAX_INDEX 0x7fffffff
 
-#define TRIE_PREFIX_CHAR '\xff'
-#define TRIE_SUFFIX_CHAR '\x00'
+#define TRIE_PREFIX_CHAR "\x02"
+#define TRIE_SUFFIX_CHAR "\x03"
 
 // Using 256 characters can fit all UTF-8 encoded strings
 #define NUM_CHARS 256