diff --git a/src/averaged_perceptron_trainer.h b/src/averaged_perceptron_trainer.h
index ccfa73d4..a100f3df 100644
--- a/src/averaged_perceptron_trainer.h
+++ b/src/averaged_perceptron_trainer.h
@@ -39,6 +39,7 @@ Link: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
 #include "string_utils.h"
 #include "tokens.h"
 #include "trie.h"
+#include "trie_utils.h"
 
 typedef struct class_weight {
     double value;
diff --git a/src/trie.c b/src/trie.c
index cae61640..3f9066db 100644
--- a/src/trie.c
+++ b/src/trie.c
@@ -102,52 +102,6 @@ trie_t *trie_new(void) {
     return trie_new_alphabet(DEFAULT_ALPHABET, sizeof(DEFAULT_ALPHABET));
 }
 
-
-/*
-Build a trie from the sorted keys of a hashtable. Adding
-keys in sorted order to a double-array trie is faster than
-adding them in random order.
-*/
-trie_t *trie_new_from_hash(khash_t(str_uint32) *hash) {
-    trie_t *trie = trie_new();
-    const char *key;
-    uint32_t value;
-
-    string_array *hash_keys = string_array_new_size(kh_size(hash));
-    kh_foreach(hash, key, value, {
-        if (strlen(key) == 0) continue;
-        string_array_push(hash_keys, (char *)key);
-    })
-
-    ks_introsort(str, hash_keys->n, (const char **)hash_keys->a);
-
-    khiter_t k;
-
-    for (int i = 0; i < hash_keys->n; i++) {
-        char *str = hash_keys->a[i];
-        k = kh_get(str_uint32, hash, str);
-        if (k == kh_end(hash)) {
-            log_error("Key not found\n");
-            string_array_destroy(hash_keys);
-            trie_destroy(trie);
-            return NULL;
-        }
-
-        value = kh_value(hash, k);
-
-        if (!trie_add(trie, str, value)) {
-            log_error("Error adding to trie\n");
-            string_array_destroy(hash_keys);
-            trie_destroy(trie);
-            return NULL;
-        }
-    }
-
-    string_array_destroy(hash_keys);
-
-    return trie;
-}
-
 inline bool trie_node_is_free(trie_node_t node) {
     return node.check < 0;
 }
diff --git a/src/trie.h b/src/trie.h
index 88864f3e..1d673a66 100644
--- a/src/trie.h
+++ b/src/trie.h
@@ -80,7 +80,6 @@ typedef struct trie {
 
 trie_t *trie_new_alphabet(uint8_t *alphabet, uint32_t alphabet_size);
 trie_t *trie_new(void);
-trie_t *trie_new_from_hash(khash_t(str_uint32) *hash);
 
 uint32_t trie_get_char_index(trie_t *self, unsigned char c);
 uint32_t trie_get_transition_index(trie_t *self, trie_node_t node, unsigned char c);
diff --git a/src/trie_utils.c b/src/trie_utils.c
new file mode 100644
index 00000000..e492e896
--- /dev/null
+++ b/src/trie_utils.c
@@ -0,0 +1,108 @@
+#include "trie_utils.h"
+
+/*
+Build a trie from the sorted keys of a hashtable. Adding
+keys in sorted order to a double-array trie is faster than
+adding them in random order.
+*/
+trie_t *trie_new_from_hash(khash_t(str_uint32) *hash) {
+    trie_t *trie = trie_new();
+    const char *key;
+    uint32_t value;
+
+    size_t hash_size = kh_size(hash);
+    log_info("hash_size=%zu\n", hash_size);
+    string_array *hash_keys = string_array_new_size(hash_size);
+    kh_foreach(hash, key, value, {
+        if (strlen(key) == 0) continue;
+        string_array_push(hash_keys, (char *)key);
+    })
+
+    ks_introsort(str, hash_keys->n, (const char **)hash_keys->a);
+
+    khiter_t k;
+
+    for (int i = 0; i < hash_keys->n; i++) {
+        char *str = hash_keys->a[i];
+        k = kh_get(str_uint32, hash, str);
+        if (k == kh_end(hash)) {
+            log_error("Key not found\n");
+            string_array_destroy(hash_keys);
+            trie_destroy(trie);
+            return NULL;
+        }
+
+        value = kh_value(hash, k);
+
+        if (!trie_add(trie, str, value)) {
+            log_error("Error adding to trie\n");
+            string_array_destroy(hash_keys);
+            trie_destroy(trie);
+            return NULL;
+        }
+    }
+
+    string_array_destroy(hash_keys);
+
+    return trie;
+}
+
+/*
+Build a trie from a cstring_array by first putting its non-empty
+strings into a temporary hashtable (ids assigned in iteration order)
+and then delegating to trie_new_from_hash. Returns NULL on failure.
+*/
+trie_t *trie_new_from_cstring_array_sorted(cstring_array *strings) {
+    char *key;
+    uint32_t i;
+
+    int ret = 0;
+    uint32_t next_id = 0;
+
+    size_t n = cstring_array_num_strings(strings);
+
+    khash_t(str_uint32) *hash = kh_init(str_uint32);
+    kh_resize(str_uint32, hash, n);
+
+    cstring_array_foreach(strings, i, key, {
+        if (strlen(key) == 0) continue;
+
+        khiter_t k = kh_put(str_uint32, hash, key, &ret);
+
+        if (ret < 0) {
+            kh_destroy(str_uint32, hash);
+            return NULL;
+        }
+
+        kh_value(hash, k) = next_id++;
+    })
+
+    trie_t *trie = trie_new_from_hash(hash);
+    kh_destroy(str_uint32, hash);
+
+    return trie;
+}
+
+/*
+Build a trie by adding the non-empty strings of a cstring_array in
+iteration order, assigning ids sequentially starting from 0. Returns
+NULL if any insertion fails.
+*/
+trie_t *trie_new_from_cstring_array(cstring_array *strings) {
+    char *key;
+    uint32_t i;
+
+    uint32_t next_id = 0;
+
+    trie_t *trie = trie_new();
+
+    cstring_array_foreach(strings, i, key, {
+        if (strlen(key) == 0) continue;
+        if (!trie_add(trie, key, next_id++)) {
+            trie_destroy(trie);
+            return NULL;
+        }
+    })
+
+    return trie;
+}
diff --git a/src/trie_utils.h b/src/trie_utils.h
new file mode 100644
index 00000000..1ac670c0
--- /dev/null
+++ b/src/trie_utils.h
@@ -0,0 +1,12 @@
+#ifndef TRIE_UTILS_H
+#define TRIE_UTILS_H
+
+#include "collections.h"
+#include "string_utils.h"
+#include "trie.h"
+
+trie_t *trie_new_from_hash(khash_t(str_uint32) *hash);
+trie_t *trie_new_from_cstring_array_sorted(cstring_array *strings);
+trie_t *trie_new_from_cstring_array(cstring_array *strings);
+
+#endif