diff --git a/src/trie.c b/src/trie.c
index 284dbfbf..5eb9345c 100644
--- a/src/trie.c
+++ b/src/trie.c
@@ -1,7 +1,6 @@
 #include "trie.h"
 #include
 
-
 /*
  * Maps the 256 characters (suitable for UTF-8 strings) to array indices
  * ordered by frequency of usage in Wikipedia titles.
@@ -103,6 +102,74 @@ trie_t *trie_new(void) {
     return trie_new_alphabet(DEFAULT_ALPHABET, sizeof(DEFAULT_ALPHABET));
 }
 
+
+/*
+Build a trie from the sorted keys of a hashtable. Adding
+keys in sorted order to a double-array trie is faster than
+adding them in random order.
+
+Empty-string keys are skipped. Key pointers are borrowed from
+the hashtable (not copied) while the trie is being built.
+
+Returns the new trie, or NULL if hash is NULL, an allocation
+fails, a sorted key is missing from the hash, or trie_add fails.
+*/
+trie_t *trie_new_from_hash(khash_t(str_uint32) *hash) {
+    if (hash == NULL) return NULL;
+
+    trie_t *trie = trie_new();
+    if (trie == NULL) return NULL;
+
+    string_array *hash_keys = string_array_new_size(kh_size(hash));
+    if (hash_keys == NULL) {
+        log_error("Error allocating hash keys\n");
+        trie_destroy(trie);
+        return NULL;
+    }
+
+    const char *key;
+    uint32_t value;
+
+    kh_foreach(hash, key, value, {
+        /* Same test as strlen(key) == 0 without scanning the whole key. */
+        if (key[0] == '\0') continue;
+        string_array_push(hash_keys, (char *)key);
+    })
+
+    ks_introsort(str, hash_keys->n, (const char **)hash_keys->a);
+
+    bool success = true;
+
+    /* NOTE(review): index is size_t on the assumption that hash_keys->n is size_t -- confirm. */
+    for (size_t i = 0; i < hash_keys->n; i++) {
+        char *str = hash_keys->a[i];
+        khiter_t k = kh_get(str_uint32, hash, str);
+        if (k == kh_end(hash)) {
+            log_error("Key not found\n");
+            success = false;
+            break;
+        }
+
+        value = kh_value(hash, k);
+
+        if (!trie_add(trie, str, value)) {
+            log_error("Error adding to trie\n");
+            success = false;
+            break;
+        }
+    }
+
+    /* Single cleanup path shared by the success and failure cases. */
+    string_array_destroy(hash_keys);
+
+    if (!success) {
+        trie_destroy(trie);
+        return NULL;
+    }
+
+    return trie;
+}
+
 inline bool trie_node_is_free(trie_node_t node) {
     return node.check < 0;
 }
diff --git a/src/trie.h b/src/trie.h
index a121c4f2..ba6a8970 100644
--- a/src/trie.h
+++ b/src/trie.h
@@ -80,6 +80,7 @@ typedef struct trie {
 
 trie_t *trie_new_alphabet(uint8_t *alphabet, uint32_t alphabet_size);
 trie_t *trie_new(void);
+trie_t *trie_new_from_hash(khash_t(str_uint32) *hash);
 
 uint32_t trie_get_char_index(trie_t *self, unsigned char c);
 uint32_t trie_get_transition_index(trie_t *self, trie_node_t node, unsigned char c);