diff --git a/src/averaged_perceptron.c b/src/averaged_perceptron.c
index d40a517b..a66470e4 100644
--- a/src/averaged_perceptron.c
+++ b/src/averaged_perceptron.c
@@ -26,7 +26,7 @@ inline double_array *averaged_perceptron_predict_scores(averaged_perceptron_t *s
             continue;
         }
 
-        for (int col = indptr[feature_id]; col < indptr[feature_id+1]; col++) {
+        for (int col = indptr[feature_id]; col < indptr[feature_id + 1]; col++) {
             uint32_t class_id = indices[col];
             scores[class_id] += data[col];
         }
diff --git a/src/averaged_perceptron.h b/src/averaged_perceptron.h
index 6147856f..2eb7703c 100644
--- a/src/averaged_perceptron.h
+++ b/src/averaged_perceptron.h
@@ -15,11 +15,6 @@ very little memory.
 
 The weights are stored as a sparse matrix in compressed
 sparse row format (see sparse_matrix.h)
-
-Paper: [Collins, 2002] Discriminative Training Methods for Hidden Markov Models:
-    Theory and Experiments with Perceptron Algorithms
-
-Link: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
 */
 #ifndef AVERAGED_PERCEPTRON_H
 #define AVERAGED_PERCEPTRON_H
diff --git a/src/averaged_perceptron_trainer.c b/src/averaged_perceptron_trainer.c
new file mode 100644
index 00000000..3039156a
--- /dev/null
+++ b/src/averaged_perceptron_trainer.c
@@ -0,0 +1,452 @@
+#include "averaged_perceptron_trainer.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/*
+Free a trainer together with everything it still owns. Accepts NULL, and
+accepts a partially constructed trainer as long as the members that were
+never created are NULL.
+*/
+void averaged_perceptron_trainer_destroy(averaged_perceptron_trainer_t *self) {
+    if (self == NULL) return;
+
+    if (self->features != NULL) {
+        trie_destroy(self->features);
+    }
+
+    if (self->classes != NULL) {
+        // The keys are the copies made in averaged_perceptron_trainer_get_class_id
+        const char *class_name;
+        uint32_t class_id;
+
+        kh_foreach(self->classes, class_name, class_id, {
+            free((char *)class_name);
+        })
+
+        kh_destroy(str_uint32, self->classes);
+    }
+
+    if (self->class_strings != NULL) {
+        cstring_array_destroy(self->class_strings);
+    }
+
+    if (self->weights != NULL) {
+        // Each value in the outer table is a separately allocated inner table
+        uint32_t feature_id;
+        khash_t(class_weights) *weights;
+
+        kh_foreach(self->weights, feature_id, weights, {
+            kh_destroy(class_weights, weights);
+        })
+
+        kh_destroy(feature_class_weights, self->weights);
+    }
+
+    if (self->scores != NULL) {
+        double_array_destroy(self->scores);
+    }
+
+    free(self);
+}
+
+/*
+Look up the numeric id of a class label and store it in *class_id.
+
+If the label is unknown and add_if_missing is true, it is given the next
+id (the current size of the class table) and appended to class_strings.
+Returns false if the label is unknown and was not added, or if it could
+not be added.
+*/
+bool averaged_perceptron_trainer_get_class_id(averaged_perceptron_trainer_t *self, char *class_name, uint32_t *class_id, bool add_if_missing) {
+    khash_t(str_uint32) *classes = self->classes;
+
+    khiter_t k = kh_get(str_uint32, classes, class_name);
+    if (k != kh_end(classes)) {
+        *class_id = kh_value(classes, k);
+        return true;
+    }
+
+    if (!add_if_missing) {
+        return false;
+    }
+
+    // The table keeps the key pointer itself, so hand it a copy (POSIX strdup)
+    // that stays valid after the caller reuses or frees its buffer
+    char *key = strdup(class_name);
+    if (key == NULL) {
+        return false;
+    }
+
+    uint32_t new_id = kh_size(classes);
+    int ret;
+    k = kh_put(str_uint32, classes, key, &ret);
+    if (ret < 0) {
+        free(key);
+        return false;
+    }
+    kh_value(classes, k) = new_id;
+    *class_id = new_id;
+
+    cstring_array_add_string(self->class_strings, class_name);
+    self->num_classes++;
+    return true;
+}
+
+/*
+Look up the numeric id of a feature string and store it in *feature_id.
+
+If the feature is unknown and add_if_missing is true, it is added to the
+trie with the trie's current key count as its id. Returns false if the
+feature is unknown and was not added, or if the trie insert failed.
+*/
+bool averaged_perceptron_trainer_get_feature_id(averaged_perceptron_trainer_t *self, char *feature, uint32_t *feature_id, bool add_if_missing) {
+    trie_t *features = self->features;
+
+    if (trie_get_data(features, feature, feature_id)) {
+        return true;
+    }
+
+    if (!add_if_missing) {
+        return false;
+    }
+
+    uint32_t new_id = features->num_keys;
+    *feature_id = new_id;
+    if (!trie_add(features, feature, new_id)) {
+        return false;
+    }
+    self->num_features++;
+    return true;
+}
+
+/*
+Average the accumulated weights and build the final model.
+
+On success the trainer is consumed: its feature trie and class strings are
+handed to the returned model and the trainer itself is destroyed. On
+failure NULL is returned and the trainer is left as it was, so the caller
+still owns it.
+*/
+averaged_perceptron_t *averaged_perceptron_trainer_finalize(averaged_perceptron_trainer_t *self) {
+    if (self == NULL || self->num_classes == 0) return NULL;
+
+    sparse_matrix_t *averaged_weights = sparse_matrix_new();
+    if (averaged_weights == NULL) return NULL;
+
+    uint64_t updates = self->num_updates;
+
+    // One matrix row per feature id, one entry per class that has a weight
+    for (uint32_t feature_id = 0; feature_id < self->num_features; feature_id++) {
+        khiter_t k = kh_get(feature_class_weights, self->weights, feature_id);
+        if (k == kh_end(self->weights)) {
+            sparse_matrix_destroy(averaged_weights);
+            return NULL;
+        }
+
+        khash_t(class_weights) *weights = kh_value(self->weights, k);
+        uint32_t class_id;
+        class_weight_t weight;
+
+        // weight is a by-value copy, so bringing its total up to date here
+        // does not modify the trainer's tables
+        kh_foreach(weights, class_id, weight, {
+            weight.total += (updates - weight.last_updated) * weight.value;
+            double value = weight.total / updates;
+            sparse_matrix_append(averaged_weights, class_id, value);
+        })
+
+        sparse_matrix_finalize_row(averaged_weights);
+    }
+
+    averaged_perceptron_t *perceptron = malloc(sizeof(averaged_perceptron_t));
+    if (perceptron == NULL) {
+        sparse_matrix_destroy(averaged_weights);
+        return NULL;
+    }
+
+    perceptron->scores = double_array_new_zeros(self->num_classes);
+    if (perceptron->scores == NULL) {
+        free(perceptron);
+        sparse_matrix_destroy(averaged_weights);
+        return NULL;
+    }
+
+    perceptron->weights = averaged_weights;
+
+    perceptron->num_features = self->num_features;
+    perceptron->num_classes = self->num_classes;
+
+    // Hand these over and clear our pointers so destroy does not free them
+    perceptron->classes = self->class_strings;
+    self->class_strings = NULL;
+
+    perceptron->features = self->features;
+    self->features = NULL;
+
+    averaged_perceptron_trainer_destroy(self);
+
+    return perceptron;
+}
+
+/*
+Fetch the {class_id => class_weight_t} table for a feature.
+
+If the feature has no table yet and add_if_missing is true, an empty one
+is created and registered under feature_id. Returns NULL if there is no
+table and none was added, or if creating it failed.
+*/
+khash_t(class_weights) *averaged_perceptron_trainer_get_class_weights(averaged_perceptron_trainer_t *self, uint32_t feature_id, bool add_if_missing) {
+    khiter_t k = kh_get(feature_class_weights, self->weights, feature_id);
+    if (k != kh_end(self->weights)) {
+        return kh_value(self->weights, k);
+    }
+
+    if (!add_if_missing) {
+        return NULL;
+    }
+
+    khash_t(class_weights) *weights = kh_init(class_weights);
+    if (weights == NULL) {
+        return NULL;
+    }
+
+    int ret;
+    k = kh_put(feature_class_weights, self->weights, feature_id, &ret);
+    if (ret < 0) {
+        kh_destroy(class_weights, weights);
+        return NULL;
+    }
+    kh_value(self->weights, k) = weights;
+    return weights;
+}
+
+/*
+Add value to the weight of class_id, where iter is the number of updates
+completed so far.
+
+Before the value changes, the running total is credited with the old value
+for every update since last_updated, so totals stay correct without
+touching every weight on every update. A class with no entry yet starts
+from NULL_WEIGHT. Returns false if the entry could not be stored.
+*/
+static inline bool averaged_perceptron_trainer_update_weight(khash_t(class_weights) *weights, uint64_t iter, uint32_t class_id, double value) {
+    class_weight_t weight = NULL_WEIGHT;
+
+    khiter_t k = kh_get(class_weights, weights, class_id);
+    if (k != kh_end(weights)) {
+        weight = kh_value(weights, k);
+    }
+
+    weight.total += (iter - weight.last_updated) * weight.value;
+    weight.last_updated = iter;
+    weight.value += value;
+
+    int ret;
+    k = kh_put(class_weights, weights, class_id, &ret);
+    if (ret < 0) return false;
+    kh_value(weights, k) = weight;
+
+    return true;
+}
+
+/*
+Apply one perceptron update for a single feature: subtract value from the
+weight of the wrongly predicted class and add it to the weight of the true
+class. Returns false if the feature's weight table or either entry could
+not be created.
+*/
+static inline bool averaged_perceptron_trainer_update_feature(averaged_perceptron_trainer_t *self, uint32_t feature_id, uint32_t guess, uint32_t truth, double value) {
+    bool add_if_missing = true;
+
+    khash_t(class_weights) *weights = averaged_perceptron_trainer_get_class_weights(self, feature_id, add_if_missing);
+    if (weights == NULL) {
+        return false;
+    }
+
+    uint64_t updates = self->num_updates;
+
+    return averaged_perceptron_trainer_update_weight(weights, updates, guess, -1.0 * value) &&
+           averaged_perceptron_trainer_update_weight(weights, updates, truth, value);
+}
+
+/*
+Score every class for the given features under the current (unaveraged)
+weights and return the id of the highest scoring class. Features that are
+not in the trie or have no weights yet contribute nothing.
+*/
+uint32_t averaged_perceptron_trainer_predict(averaged_perceptron_trainer_t *self, cstring_array *features) {
+    double_array *scores = self->scores;
+    size_t num_classes = (size_t)self->num_classes;
+
+    uint32_t i = 0;
+    char *feature = NULL;
+    bool add_if_missing = false;
+    uint32_t feature_id;
+
+    khash_t(class_weights) *weights;
+    uint32_t class_id;
+    class_weight_t weight;
+
+    // Classes are added as training goes on, so grow the buffer to match
+    if (scores->m < num_classes) {
+        double_array_resize(scores, num_classes);
+    }
+
+    if (scores->n < num_classes) {
+        scores->n = num_classes;
+    }
+
+    double_array_set(scores->a, scores->n, 0.0);
+
+    cstring_array_foreach(features, i, feature, {
+        if (!averaged_perceptron_trainer_get_feature_id(self, feature, &feature_id, add_if_missing)) {
+            continue;
+        }
+
+        weights = averaged_perceptron_trainer_get_class_weights(self, feature_id, add_if_missing);
+
+        if (weights == NULL) {
+            continue;
+        }
+
+        kh_foreach(weights, class_id, weight, {
+            scores->a[class_id] += weight.value;
+        })
+    })
+
+    int64_t best_class = double_array_argmax(scores->a, scores->n);
+
+    return (uint32_t)best_class;
+}
+
+/*
+Apply a perceptron update with weight 1.0 for each feature in the array,
+adding unseen features as needed, then count the update. Returns false as
+soon as one feature cannot be updated; num_updates is not incremented in
+that case.
+*/
+bool averaged_perceptron_trainer_update(averaged_perceptron_trainer_t *self, uint32_t guess, uint32_t truth, cstring_array *features) {
+    uint32_t i = 0;
+    char *feature = NULL;
+    uint32_t feature_id;
+    bool add_if_missing = true;
+
+    cstring_array_foreach(features, i, feature, {
+        if (!averaged_perceptron_trainer_get_feature_id(self, feature, &feature_id, add_if_missing)) {
+            return false;
+        }
+
+        if (!averaged_perceptron_trainer_update_feature(self, feature_id, guess, truth, 1.0)) {
+            return false;
+        }
+    })
+
+    self->num_updates++;
+
+    return true;
+}
+
+/*
+Same as averaged_perceptron_trainer_update, but takes a {feature => count}
+table and uses each feature's count as the size of its update.
+*/
+bool averaged_perceptron_trainer_update_counts(averaged_perceptron_trainer_t *self, uint32_t guess, uint32_t truth, khash_t(str_uint32) *feature_counts) {
+    const char *feature;
+    uint32_t feature_id;
+    uint32_t count;
+    bool add_if_missing = true;
+
+    kh_foreach(feature_counts, feature, count, {
+        if (!averaged_perceptron_trainer_get_feature_id(self, (char *)feature, &feature_id, add_if_missing)) {
+            return false;
+        }
+
+        if (!averaged_perceptron_trainer_update_feature(self, feature_id, guess, truth, (double)count)) {
+            return false;
+        }
+    })
+
+    self->num_updates++;
+
+    return true;
+}
+
+/*
+Train on a single labeled example: register the label if it is new,
+predict under the current weights, and update only when the prediction is
+wrong. Returns false if the label could not be registered or the update
+failed.
+*/
+bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, cstring_array *features, char *label) {
+    uint32_t truth;
+    bool add_if_missing = true;
+
+    if (!averaged_perceptron_trainer_get_class_id(self, label, &truth, add_if_missing)) {
+        return false;
+    }
+
+    uint32_t guess = averaged_perceptron_trainer_predict(self, features);
+
+    // Online error-driven learning, only needs to update weights when it gets a wrong answer, making training fast
+    if (guess != truth) {
+        self->num_errors++;
+        return averaged_perceptron_trainer_update(self, guess, truth, features);
+    }
+
+    return true;
+}
+
+/*
+Allocate an empty trainer. Returns NULL if any part of it cannot be
+allocated; the parts created so far are released in that case.
+*/
+averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) {
+    averaged_perceptron_trainer_t *self = malloc(sizeof(averaged_perceptron_trainer_t));
+
+    if (self == NULL) return NULL;
+
+    self->num_features = 0;
+    self->num_classes = 0;
+    self->num_updates = 0;
+    self->num_errors = 0;
+
+    // Clear every owned pointer first so that destroy can be run on a
+    // partially constructed trainer
+    self->features = NULL;
+    self->classes = NULL;
+    self->class_strings = NULL;
+    self->weights = NULL;
+    self->scores = NULL;
+
+    self->features = trie_new();
+    if (self->features == NULL) {
+        goto exit_trainer_created;
+    }
+
+    self->classes = kh_init(str_uint32);
+    if (self->classes == NULL) {
+        goto exit_trainer_created;
+    }
+
+    self->class_strings = cstring_array_new();
+    if (self->class_strings == NULL) {
+        goto exit_trainer_created;
+    }
+
+    self->weights = kh_init(feature_class_weights);
+    if (self->weights == NULL) {
+        goto exit_trainer_created;
+    }
+
+    self->scores = double_array_new();
+    if (self->scores == NULL) {
+        goto exit_trainer_created;
+    }
+
+    return self;
+
+exit_trainer_created:
+    averaged_perceptron_trainer_destroy(self);
+    return NULL;
+}
diff --git a/src/averaged_perceptron_trainer.h b/src/averaged_perceptron_trainer.h
new file mode 100644
index 00000000..b1ed3a1a
--- /dev/null
+++ b/src/averaged_perceptron_trainer.h
@@ -0,0 +1,81 @@
+/*
+averaged_perceptron_trainer.h
+-----------------------------
+
+Trainer for a generic averaged perceptron model.
+
+The averaged perceptron uses a simple online error-driven
+learning algorithm. Given some features and the true label,
+it predicts the expected label under the current weights. If
+it guesses correctly, there's nothing to do and it moves
+on to the next example. If it predicted the wrong answer, it
+makes the following updates to its weights:
+
+weights[feature][predicted] -= 1.0
+weights[feature][actual] += 1.0
+
+This seems overly simplistic, and it is. This is the regular
+perceptron update rule. On the more difficult cases, this model
+would tend to overfit by spending a lot of time fiddling with the
+weights for the few cases it got wrong and building the whole model
+around those few cases. The averaged perceptron is one way to account
+for this and build a more robust model.
+
+
+Paper: [Collins, 2002] Discriminative Training Methods for Hidden Markov Models:
+    Theory and Experiments with Perceptron Algorithms
+
+Link: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
+*/
+#ifndef AVERAGED_PERCEPTRON_TRAINER_H
+#define AVERAGED_PERCEPTRON_TRAINER_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "averaged_perceptron.h"
+#include "collections.h"
+#include "string_utils.h"
+#include "trie.h"
+
+// Weight of one (feature, class) pair
+typedef struct class_weight {
+    double value;           // current weight
+    double total;           // value summed over the updates up to last_updated
+    uint64_t last_updated;  // update count at which total was last brought up to date
+} class_weight_t;
+
+#define NULL_WEIGHT (class_weight_t){0.0, 0.0, 0}
+
+KHASH_MAP_INIT_INT(class_weights, class_weight_t)
+
+KHASH_MAP_INIT_INT(feature_class_weights, khash_t(class_weights) *)
+
+typedef struct averaged_perceptron_trainer {
+    uint32_t num_features;
+    uint32_t num_classes;
+    uint64_t num_updates;
+    uint64_t num_errors;
+    trie_t *features;
+    khash_t(str_uint32) *classes;
+    cstring_array *class_strings;
+    // {feature_id => {class_id => class_weight_t}}
+    khash_t(feature_class_weights) *weights;
+    double_array *scores;
+} averaged_perceptron_trainer_t;
+
+// Returns NULL if allocation fails
+averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void);
+
+// Returns the id of the highest scoring class under the current weights
+uint32_t averaged_perceptron_trainer_predict(averaged_perceptron_trainer_t *self, cstring_array *features);
+// Predicts the label and updates the weights if the prediction was wrong
+bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, cstring_array *features, char *label);
+
+// Consumes the trainer on success; returns NULL and leaves it intact on failure
+averaged_perceptron_t *averaged_perceptron_trainer_finalize(averaged_perceptron_trainer_t *self);
+
+// Accepts NULL
+void averaged_perceptron_trainer_destroy(averaged_perceptron_trainer_t *self);
+
+#endif