[phrases] Changing prefix/suffix chars so both are control characters and neither is the NUL-byte. Modifying transliteration special characters accordingly.

This commit is contained in:
Al
2015-08-10 16:01:22 -04:00
parent 11a9881988
commit 51572d6575
3 changed files with 30 additions and 17 deletions

View File

@@ -100,11 +100,11 @@ typedef struct transliteration_table {
#define WORD_BOUNDARY_CHAR "\x01" #define WORD_BOUNDARY_CHAR "\x01"
#define WORD_BOUNDARY_CODEPOINT 1 #define WORD_BOUNDARY_CODEPOINT 1
#define WORD_BOUNDARY_CHAR_LEN strlen(WORD_BOUNDARY_CHAR) #define WORD_BOUNDARY_CHAR_LEN strlen(WORD_BOUNDARY_CHAR)
#define PRE_CONTEXT_CHAR "\x02" #define PRE_CONTEXT_CHAR "\x86"
#define PRE_CONTEXT_CODEPOINT 2 #define PRE_CONTEXT_CODEPOINT 134
#define PRE_CONTEXT_CHAR_LEN strlen(PRE_CONTEXT_CHAR) #define PRE_CONTEXT_CHAR_LEN strlen(PRE_CONTEXT_CHAR)
#define POST_CONTEXT_CHAR "\x03" #define POST_CONTEXT_CHAR "\x87"
#define POST_CONTEXT_CODEPOINT 3 #define POST_CONTEXT_CODEPOINT 135
#define POST_CONTEXT_CHAR_LEN strlen(POST_CONTEXT_CHAR) #define POST_CONTEXT_CHAR_LEN strlen(POST_CONTEXT_CHAR)
#define EMPTY_TRANSITION_CHAR "\x04" #define EMPTY_TRANSITION_CHAR "\x04"
#define EMPTY_TRANSITION_CODEPOINT 4 #define EMPTY_TRANSITION_CODEPOINT 4
@@ -112,8 +112,8 @@ typedef struct transliteration_table {
#define REPEAT_CHAR "\x05" #define REPEAT_CHAR "\x05"
#define REPEAT_CODEPOINT 5 #define REPEAT_CODEPOINT 5
#define REPEAT_CHAR_LEN strlen(REPEAT_CHAR) #define REPEAT_CHAR_LEN strlen(REPEAT_CHAR)
#define GROUP_INDICATOR_CHAR "\x06" #define GROUP_INDICATOR_CHAR "\x1d"
#define GROUP_INDICATOR_CODEPOINT 6 #define GROUP_INDICATOR_CODEPOINT 29
#define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR) #define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR)
#define BEGIN_SET_CHAR "\x0f" #define BEGIN_SET_CHAR "\x0f"
#define BEGIN_SET_CODEPOINT 15 #define BEGIN_SET_CODEPOINT 15

View File

@@ -617,6 +617,10 @@ void trie_print(trie_t *self) {
} }
bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data) { bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data) {
if (key[0] == TRIE_SUFFIX_CHAR[0] || key[0] == TRIE_PREFIX_CHAR[0]) {
return false;
}
unsigned char *ptr = (unsigned char *)key; unsigned char *ptr = (unsigned char *)key;
uint32_t last_node_id = node_id; uint32_t last_node_id = node_id;
trie_node_t last_node = trie_get_node(self, node_id); trie_node_t last_node = trie_get_node(self, node_id);
@@ -673,10 +677,12 @@ bool trie_add_prefix_at_index(trie_t *self, char *key, uint32_t start_node_id, u
trie_node_t start_node = trie_get_node(self, start_node_id); trie_node_t start_node = trie_get_node(self, start_node_id);
uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_PREFIX_CHAR); unsigned char prefix_char = TRIE_PREFIX_CHAR[0];
uint32_t node_id = trie_get_transition_index(self, start_node, prefix_char);
trie_node_t node = trie_get_node(self, node_id); trie_node_t node = trie_get_node(self, node_id);
if (node.check != start_node_id) { if (node.check != start_node_id) {
node_id = trie_add_transition(self, start_node_id, TRIE_PREFIX_CHAR); node_id = trie_add_transition(self, start_node_id, prefix_char);
} }
bool success = trie_add_at_index(self, node_id, key, len, data); bool success = trie_add_at_index(self, node_id, key, len, data);
@@ -694,10 +700,12 @@ bool trie_add_suffix_at_index(trie_t *self, char *key, uint32_t start_node_id, u
trie_node_t start_node = trie_get_node(self, start_node_id); trie_node_t start_node = trie_get_node(self, start_node_id);
uint32_t node_id = trie_get_transition_index(self, start_node, TRIE_SUFFIX_CHAR); unsigned char suffix_char = TRIE_SUFFIX_CHAR[0];
uint32_t node_id = trie_get_transition_index(self, start_node, suffix_char);
trie_node_t node = trie_get_node(self, node_id); trie_node_t node = trie_get_node(self, node_id);
if (node.check != start_node_id) { if (node.check != start_node_id) {
node_id = trie_add_transition(self, start_node_id, TRIE_SUFFIX_CHAR); node_id = trie_add_transition(self, start_node_id, suffix_char);
} }
char *suffix = utf8_reversed_string(key); char *suffix = utf8_reversed_string(key);

View File

@@ -5,11 +5,16 @@
* so given an index into that array, we can treat the array as a C string * so given an index into that array, we can treat the array as a C string
* starting at that index. It also makes serialization dead simple. We * starting at that index. It also makes serialization dead simple. We
* implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix * implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix
* is defined as the reversed UTF-8 suffix string prefixed by the NUL-byte. * is defined as the reversed UTF-8 suffix string prefixed by TRIE_SUFFIX_CHAR.
* Since we do not allow zero-length strings, the transition from the root node * Similarly, a prefix is defined as being prefixed by TRIE_PREFIX_CHAR.
* to a NUL-byte always denotes a suffix (i.e. we should be iterating * trie_search defines several methods for searching strings, tokenized strings,
* backward through the query string/token). For more information on double-array * prefixes and suffixes. Note that the single characters TRIE_SUFFIX_CHAR
* tries generally, see: http://linux.thai.net/~thep/datrie/datrie.html * and TRIE_PREFIX_CHAR are not allowed as keys (both are defined as control
* characters, so are unlikely to affect natural language applications).
* This trie implementation also has several *_from_index methods which allow
* for effective namespacing e.g. adding the keys "en|blvd" and "fr|blvd"
* and searching by language. For more information on double-array tries
* generally, see: http://linux.thai.net/~thep/datrie/datrie.html
******************************************************************************/ ******************************************************************************/
#ifndef TRIE_H #ifndef TRIE_H
@@ -39,8 +44,8 @@
#define TRIE_INDEX_ERROR 0 #define TRIE_INDEX_ERROR 0
#define TRIE_MAX_INDEX 0x7fffffff #define TRIE_MAX_INDEX 0x7fffffff
#define TRIE_PREFIX_CHAR '\xff' #define TRIE_PREFIX_CHAR "\x02"
#define TRIE_SUFFIX_CHAR '\x00' #define TRIE_SUFFIX_CHAR "\x03"
// Using 256 characters can fit all UTF-8 encoded strings // Using 256 characters can fit all UTF-8 encoded strings
#define NUM_CHARS 256 #define NUM_CHARS 256