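[phrases] Changing the trie prefix/suffix chars so that both are control characters and neither is the NUL-byte. Modifying the transliteration special characters accordingly (PRE_CONTEXT_CHAR and POST_CONTEXT_CHAR previously used \x02 and \x03, which are now TRIE_PREFIX_CHAR and TRIE_SUFFIX_CHAR).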

2015-08-10 16:01:22 -04:00
parent 11a9881988
commit 51572d6575
3 changed files with 30 additions and 17 deletions
--- a/src/transliterate.h
+++ b/src/transliterate.h
@@ -100,11 +100,11 @@ typedef struct transliteration_table {
 #define WORD_BOUNDARY_CHAR "\x01"
 #define WORD_BOUNDARY_CODEPOINT 1
 #define WORD_BOUNDARY_CHAR_LEN strlen(WORD_BOUNDARY_CHAR)
-#define PRE_CONTEXT_CHAR "\x02"
+#define PRE_CONTEXT_CHAR "\x86"
-#define PRE_CONTEXT_CODEPOINT 2
+#define PRE_CONTEXT_CODEPOINT 134
 #define PRE_CONTEXT_CHAR_LEN strlen(PRE_CONTEXT_CHAR)
-#define POST_CONTEXT_CHAR "\x03"
+#define POST_CONTEXT_CHAR "\x87"
-#define POST_CONTEXT_CODEPOINT 3
+#define POST_CONTEXT_CODEPOINT 135
 #define POST_CONTEXT_CHAR_LEN strlen(POST_CONTEXT_CHAR)
 #define EMPTY_TRANSITION_CHAR "\x04"
 #define EMPTY_TRANSITION_CODEPOINT 4
@@ -112,8 +112,8 @@ typedef struct transliteration_table {
 #define REPEAT_CHAR "\x05"
 #define REPEAT_CODEPOINT 5
 #define REPEAT_CHAR_LEN strlen(REPEAT_CHAR)
-#define GROUP_INDICATOR_CHAR "\x06"
+#define GROUP_INDICATOR_CHAR "\x1d"
-#define GROUP_INDICATOR_CODEPOINT 6
+#define GROUP_INDICATOR_CODEPOINT 29
 #define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
 #define BEGIN_SET_CHAR "\x0f"
 #define BEGIN_SET_CODEPOINT 15
--- a/src/trie.c
+++ b/src/trie.c
@@ -617,6 +617,10 @@ void trie_print(trie_t *self) {
 }
 bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data) {
    if (key[0] == TRIE_SUFFIX_CHAR[0] || key[0] == TRIE_PREFIX_CHAR[0]) {
        return false;
    }
    unsigned char *ptr = (unsigned char *)key; 
    uint32_t last_node_id = node_id;
    trie_node_t last_node = trie_get_node(self, node_id);
@@ -673,10 +677,12 @@ bool trie_add_prefix_at_index(trie_t *self, char *key, uint32_t start_node_id, u
    trie_node_t start_node = trie_get_node(self, start_node_id);
-    uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_PREFIX_CHAR);
+    unsigned char prefix_char = TRIE_PREFIX_CHAR[0];
    uint32_t node_id = trie_get_transition_index(self, start_node, prefix_char);
    trie_node_t node = trie_get_node(self, node_id);
    if (node.check != start_node_id) {
-        node_id = trie_add_transition(self, start_node_id, TRIE_PREFIX_CHAR);
+        node_id = trie_add_transition(self, start_node_id, prefix_char);
    }
    bool success = trie_add_at_index(self, node_id, key, len, data);
@@ -694,10 +700,12 @@ bool trie_add_suffix_at_index(trie_t *self, char *key, uint32_t start_node_id, u
    trie_node_t start_node = trie_get_node(self, start_node_id);
-    uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_SUFFIX_CHAR);
+    unsigned char suffix_char = TRIE_SUFFIX_CHAR[0];
    uint32_t node_id = trie_get_transition_index(self, start_node, suffix_char);
    trie_node_t node = trie_get_node(self, node_id);
    if (node.check != start_node_id) {
-        node_id = trie_add_transition(self, start_node_id, TRIE_SUFFIX_CHAR);
+        node_id = trie_add_transition(self, start_node_id, suffix_char);
    }
    char *suffix = utf8_reversed_string(key);
--- a/src/trie.h
+++ b/src/trie.h
@@ -5,11 +5,16 @@
 * so given an index into that array, we can treat the array as a C string
 * starting at that index. It also makes serialization dead simple. We
 * implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix
-* is defined as the reversed UTF-8 suffix string prefixed by the NUL-byte. 
+* is defined as the reversed UTF-8 suffix string prefixed by TRIE_SUFFIX_CHAR.
-* Since we do not allow zero-length strings, the transition from the root node
+* Similarly, a prefix is defined as the key string prefixed by TRIE_PREFIX_CHAR.
-* to a NUL-byte always denotes a suffix (i.e. we should be iterating 
+* trie_search defines several methods for searching strings, tokenized strings,
-* backward through the query string/token). For more information on double-array
+* prefixes and suffixes. Note that keys beginning with TRIE_SUFFIX_CHAR 
-* tries generally, see: http://linux.thai.net/~thep/datrie/datrie.html
+* or TRIE_PREFIX_CHAR are rejected when added (both are defined as control 
 * characters, so are unlikely to affect natural language applications).
 * This trie implementation also has several *_from_index methods which allow 
 * for effective namespacing e.g. adding the keys "en|blvd" and "fr|blvd"
 * and searching by language. For more information on double-array tries
 * generally, see: http://linux.thai.net/~thep/datrie/datrie.html
 ******************************************************************************/
 #ifndef TRIE_H
@@ -39,8 +44,8 @@
 #define TRIE_INDEX_ERROR  0
 #define TRIE_MAX_INDEX 0x7fffffff
-#define TRIE_PREFIX_CHAR '\xff'
+#define TRIE_PREFIX_CHAR "\x02"
-#define TRIE_SUFFIX_CHAR '\x00'
+#define TRIE_SUFFIX_CHAR "\x03"
 // Using 256 characters can fit all UTF-8 encoded strings
 #define NUM_CHARS 256