[phrases] The default constructor for a trie now uses a default alphabet derived from Wikipedia character frequencies, for convenience. In practice, the alphabet size/ordering matters only for very small tries or specialized alphabets, so most callers should just use trie_new().

This commit is contained in:
Al
2015-03-05 13:31:25 -05:00
parent 939c3af293
commit 38ec03bf2b
2 changed files with 38 additions and 2 deletions

View File

@@ -1,6 +1,35 @@
#include "trie.h"
#include <math.h>
/*
* Default alphabet: every byte value 0-255 appears exactly once, so any
* byte string (including UTF-8 encoded text) can be used as a key.
*
* Entries are listed in descending order of how often the byte occurs in
* Wikipedia titles: DEFAULT_ALPHABET[0] is the most frequent byte (32, the
* space character), followed by 'a', 'e', 'i', 'o', 'n', 'r', ...
* Byte 0 is ranked 8th.
* NOTE(review): presumably NUL ranks that high because it terminates keys;
* confirm against the code that consumes the alphabet.
*
* In practice the order of the chars shouldn't matter for larger key sets
* but may save space for a small number of keys.
*/
uint8_t DEFAULT_ALPHABET[] = {
32, 97, 101, 105, 111, 110, 114, 0, 116, 108, 115, 117, 104, 99, 100, 109,
103, 121, 83, 112, 67, 98, 107, 77, 65, 102, 118, 66, 80, 84, 41, 40,
119, 82, 72, 68, 76, 71, 70, 87, 49, 44, 78, 75, 69, 74, 73, 48,
195, 122, 45, 50, 57, 79, 86, 46, 120, 85, 106, 39, 56, 51, 52, 89,
128, 226, 147, 55, 53, 54, 197, 113, 196, 90, 169, 161, 81, 179, 58, 88,
173, 188, 141, 182, 153, 177, 38, 130, 135, 164, 159, 47, 168, 33, 186, 167,
129, 200, 131, 162, 155, 184, 163, 171, 160, 137, 132, 190, 133, 34, 225, 187,
165, 189, 176, 63, 201, 140, 154, 180, 151, 170, 145, 175, 43, 152, 150, 166,
158, 194, 198, 178, 144, 181, 148, 134, 136, 42, 185, 174, 156, 143, 172, 191,
142, 96, 59, 202, 139, 183, 64, 206, 157, 61, 146, 36, 37, 199, 149, 126,
229, 230, 204, 233, 231, 207, 138, 208, 232, 92, 227, 228, 209, 94, 224, 239,
217, 205, 221, 218, 211, 4, 8, 12, 16, 20, 24, 28, 60, 203, 215, 219,
223, 235, 243, 247, 251, 124, 254, 3, 7, 11, 15, 19, 23, 27, 31, 35,
192, 212, 216, 91, 220, 95, 236, 240, 244, 248, 123, 252, 127, 2, 6, 10,
14, 18, 22, 26, 30, 62, 193, 213, 237, 241, 245, 249, 253, 1, 5, 9,
13, 17, 21, 25, 29, 210, 214, 93, 222, 234, 238, 242, 246, 250, 125, 255
};
/*
* Number of entries in DEFAULT_ALPHABET. Computed as an element count (not a
* byte count) and fully parenthesised so the macro stays correct inside any
* larger expression and if the element type ever changes.
*/
#define DEFAULT_ALPHABET_SIZE (sizeof(DEFAULT_ALPHABET) / sizeof(DEFAULT_ALPHABET[0]))
/*
Constructors
*/
@@ -49,7 +78,7 @@ exit_no_malloc:
return NULL;
}
trie_t *trie_new(uint8_t *alphabet, uint32_t alphabet_size) {
trie_t *trie_new_alphabet(uint8_t *alphabet, uint32_t alphabet_size) {
trie_t *self = trie_new_empty(alphabet, alphabet_size);
if (!self)
return NULL;
@@ -67,6 +96,10 @@ trie_t *trie_new(uint8_t *alphabet, uint32_t alphabet_size) {
return self;
}
/*
* Convenience constructor: builds a trie over the built-in DEFAULT_ALPHABET.
* The result of trie_new_alphabet() is returned unchanged, so the caller
* sees whatever that function reports (including a NULL return).
*/
trie_t *trie_new(void) {
    uint8_t *default_chars = DEFAULT_ALPHABET;
    uint32_t num_default_chars = sizeof(DEFAULT_ALPHABET);

    return trie_new_alphabet(default_chars, num_default_chars);
}
/*
* Reports whether a node slot is free.
* A node counts as free exactly when its check field is negative.
*/
bool trie_node_is_free(trie_node_t node) {
    if (node.check < 0) {
        return true;
    }
    return false;
}
@@ -787,6 +820,8 @@ trie_t *trie_read(FILE *file) {
goto exit_file_read;
log_debug("alphabet_size=%d\n", alphabet_size);
if (alphabet_size > NUM_CHARS)
goto exit_file_read;
if (!file_read_chars(file, (char *)alphabet, alphabet_size))
goto exit_file_read;

View File

@@ -68,7 +68,8 @@ typedef struct trie {
int alphabet_size;
} trie_t;
trie_t *trie_new(uint8_t *alphabet, uint32_t alphabet_size);
trie_t *trie_new_alphabet(uint8_t *alphabet, uint32_t alphabet_size);
trie_t *trie_new(void);
uint32_t trie_get_char_index(trie_t *self, unsigned char c);
uint32_t trie_get_transition_index(trie_t *self, trie_node_t node, unsigned char c);