diff --git a/src/averaged_perceptron.c b/src/averaged_perceptron.c
new file mode 100644
index 00000000..d40a517b
--- /dev/null
+++ b/src/averaged_perceptron.c
@@ -0,0 +1,335 @@
+#include "averaged_perceptron.h"
+
+/* Magic number written at the start of a serialized model and verified on read. */
+#define PERCEPTRON_SIGNATURE 0xCBCBCBCB
+
+/*
+Look up the id stored in the feature trie for a feature string.
+
+Returns the result of trie_get_data; *feature_id should only be used when
+that result is true.
+*/
+static inline bool averaged_perceptron_get_feature_id(averaged_perceptron_t *self, char *feature, uint32_t *feature_id) {
+    return trie_get_data(self->features, feature, feature_id);
+}
+
+/*
+Prepare the reusable score buffer for a new prediction: allocate it if it is
+missing or empty, then set every entry to 0.0.
+
+Returns false if the buffer could not be allocated.
+*/
+static bool averaged_perceptron_reset_scores(averaged_perceptron_t *self) {
+    if (self->scores == NULL || self->scores->n == 0) {
+        /* Free an existing empty buffer first so replacing it does not leak. */
+        if (self->scores != NULL) {
+            double_array_destroy(self->scores);
+        }
+
+        self->scores = double_array_new_zeros((size_t)self->num_classes);
+        if (self->scores == NULL) return false;
+    }
+
+    double_array_set(self->scores->a, self->scores->n, 0.0);
+    return true;
+}
+
+/*
+Compute one score per class for a list of feature strings.
+
+Each feature found in the trie selects a row of the sparse weight matrix, and
+every stored weight in that row is added to the score of its class. Features
+missing from the trie are skipped.
+
+Returns self->scores, a buffer owned by the model that is overwritten by the
+next prediction call (callers must not destroy it), or NULL if that buffer
+could not be allocated.
+*/
+inline double_array *averaged_perceptron_predict_scores(averaged_perceptron_t *self, cstring_array *features) {
+    if (!averaged_perceptron_reset_scores(self)) return NULL;
+
+    double *scores = self->scores->a;
+
+    uint32_t i = 0;
+    char *feature;
+    uint32_t feature_id;
+
+    uint32_t *indptr = self->weights->indptr->a;
+    uint32_t *indices = self->weights->indices->a;
+    double *data = self->weights->data->a;
+
+    cstring_array_foreach(features, i, feature, {
+        if (!averaged_perceptron_get_feature_id(self, feature, &feature_id)) {
+            continue;
+        }
+
+        /* indptr holds uint32_t offsets, so iterate with an unsigned cursor
+           rather than comparing a signed int against them. */
+        for (uint32_t col = indptr[feature_id]; col < indptr[feature_id + 1]; col++) {
+            uint32_t class_id = indices[col];
+            scores[class_id] += data[col];
+        }
+    })
+
+    return self->scores;
+}
+
+/*
+Compute one score per class for a map of feature string => count.
+
+Works like averaged_perceptron_predict_scores, except that each weight is
+multiplied by the feature's count before being added to its class score.
+
+Returns self->scores, a buffer owned by the model that is overwritten by the
+next prediction call (callers must not destroy it), or NULL if that buffer
+could not be allocated.
+*/
+inline double_array *averaged_perceptron_predict_scores_counts(averaged_perceptron_t *self, khash_t(str_uint32) *feature_counts) {
+    if (!averaged_perceptron_reset_scores(self)) return NULL;
+
+    double *scores = self->scores->a;
+
+    const char *feature;
+    uint32_t count;
+    uint32_t feature_id;
+
+    uint32_t *indptr = self->weights->indptr->a;
+    uint32_t *indices = self->weights->indices->a;
+    double *data = self->weights->data->a;
+
+    kh_foreach(feature_counts, feature, count, {
+        /* The trie lookup takes a non-const char *, hence the cast. */
+        if (!averaged_perceptron_get_feature_id(self, (char *)feature, &feature_id)) {
+            continue;
+        }
+
+        for (uint32_t col = indptr[feature_id]; col < indptr[feature_id + 1]; col++) {
+            uint32_t class_id = indices[col];
+            scores[class_id] += data[col] * (double)count;
+        }
+    })
+
+    return self->scores;
+}
+
+/*
+Predict a class id for a list of feature strings.
+
+Returns the index reported by double_array_argmax over the class scores, cast
+to uint32_t. Returns 0 if the score buffer could not be allocated, since this
+signature has no separate way to report an error.
+*/
+inline uint32_t averaged_perceptron_predict(averaged_perceptron_t *self, cstring_array *features) {
+    double_array *scores = averaged_perceptron_predict_scores(self, features);
+    if (scores == NULL) return 0;
+
+    int64_t max_score = double_array_argmax(scores->a, scores->n);
+
+    return (uint32_t)max_score;
+}
+
+/*
+Predict a class id for a map of feature string => count.
+
+Returns the index reported by double_array_argmax over the class scores, cast
+to uint32_t. Returns 0 if the score buffer could not be allocated, since this
+signature has no separate way to report an error.
+*/
+inline uint32_t averaged_perceptron_predict_counts(averaged_perceptron_t *self, khash_t(str_uint32) *feature_counts) {
+    double_array *scores = averaged_perceptron_predict_scores_counts(self, feature_counts);
+    if (scores == NULL) return 0;
+
+    int64_t max_score = double_array_argmax(scores->a, scores->n);
+
+    return (uint32_t)max_score;
+}
+
+/*
+Read a model from an open stream, in the layout produced by
+averaged_perceptron_write: signature, num_features, num_classes, the sparse
+weight matrix, the length and bytes of the class strings, then the feature
+trie.
+
+Returns a newly allocated model that the caller releases with
+averaged_perceptron_destroy, or NULL if f is NULL, the signature does not
+match, num_classes is 0, or any read or allocation fails. The stream is not
+closed.
+*/
+averaged_perceptron_t *averaged_perceptron_read(FILE *f) {
+    if (f == NULL) return NULL;
+
+    uint32_t signature;
+
+    if (!file_read_uint32(f, &signature) || signature != PERCEPTRON_SIGNATURE) {
+        return NULL;
+    }
+
+    /* calloc leaves every pointer member NULL, which is what lets the error
+       path call averaged_perceptron_destroy on a partially built model. */
+    averaged_perceptron_t *perceptron = calloc(1, sizeof(averaged_perceptron_t));
+    if (perceptron == NULL) return NULL;
+
+    if (!file_read_uint32(f, &perceptron->num_features) ||
+        !file_read_uint32(f, &perceptron->num_classes) ||
+        perceptron->num_classes == 0) {
+        /* Go through the cleanup path so the model struct is not leaked. */
+        goto exit_perceptron_created;
+    }
+
+    perceptron->weights = sparse_matrix_read(f);
+    if (perceptron->weights == NULL) {
+        goto exit_perceptron_created;
+    }
+
+    perceptron->scores = double_array_new_zeros((size_t)perceptron->num_classes);
+    if (perceptron->scores == NULL) {
+        goto exit_perceptron_created;
+    }
+
+    uint64_t classes_str_len;
+
+    if (!file_read_uint64(f, &classes_str_len)) {
+        goto exit_perceptron_created;
+    }
+
+    char_array *array = char_array_new_size(classes_str_len);
+
+    if (array == NULL) {
+        goto exit_perceptron_created;
+    }
+
+    if (!file_read_chars(f, array->a, classes_str_len)) {
+        char_array_destroy(array);
+        goto exit_perceptron_created;
+    }
+
+    array->n = classes_str_len;
+
+    perceptron->classes = cstring_array_from_char_array(array);
+    if (perceptron->classes == NULL) {
+        /* NOTE(review): whether array is still owned here depends on
+           cstring_array_from_char_array, which is not visible in this
+           file - confirm before adding a char_array_destroy(array). */
+        goto exit_perceptron_created;
+    }
+
+    perceptron->features = trie_read(f);
+
+    if (perceptron->features == NULL) {
+        goto exit_perceptron_created;
+    }
+
+    return perceptron;
+
+exit_perceptron_created:
+    averaged_perceptron_destroy(perceptron);
+    return NULL;
+}
+
+/*
+Load a model from the file at filename.
+
+Returns NULL if filename is NULL, the file cannot be opened for reading, or
+averaged_perceptron_read fails.
+*/
+averaged_perceptron_t *averaged_perceptron_load(char *filename) {
+    if (filename == NULL) return NULL;
+
+    FILE *f = fopen(filename, "rb");
+    if (f == NULL) return NULL;
+
+    averaged_perceptron_t *perceptron = averaged_perceptron_read(f);
+    fclose(f);
+
+    return perceptron;
+}
+
+/*
+Write a model to an open stream: signature, num_features, num_classes, the
+sparse weight matrix, the length and bytes of the class strings, then the
+feature trie.
+
+Returns false if self or f is NULL, if the model has no weights, classes or
+features, or if any write fails. The stream is not closed.
+*/
+bool averaged_perceptron_write(averaged_perceptron_t *self, FILE *f) {
+    if (self == NULL || f == NULL || self->weights == NULL || self->classes == NULL ||
+        self->features == NULL) {
+        return false;
+    }
+
+    if (!file_write_uint32(f, PERCEPTRON_SIGNATURE) ||
+        !file_write_uint32(f, self->num_features) ||
+        !file_write_uint32(f, self->num_classes)) {
+        return false;
+    }
+
+    if (!sparse_matrix_write(self->weights, f)) {
+        return false;
+    }
+
+    uint64_t classes_str_len = (uint64_t) cstring_array_used(self->classes);
+    if (!file_write_uint64(f, classes_str_len)) {
+        return false;
+    }
+
+    if (!file_write_chars(f, self->classes->str->a, classes_str_len)) {
+        return false;
+    }
+
+    if (!trie_write(self->features, f)) {
+        return false;
+    }
+
+    return true;
+}
+
+/*
+Save a model to the file at filename.
+
+Returns false if self or filename is NULL, the file cannot be opened for
+writing, averaged_perceptron_write fails, or closing the file fails.
+*/
+bool averaged_perceptron_save(averaged_perceptron_t *self, char *filename) {
+    if (self == NULL || filename == NULL) return false;
+
+    FILE *f = fopen(filename, "wb");
+    if (f == NULL) return false;
+
+    bool ret_val = averaged_perceptron_write(self, f);
+
+    /* fclose flushes buffered output, so a failure here means the file may be
+       incomplete even though every write call succeeded. */
+    if (fclose(f) != 0) {
+        ret_val = false;
+    }
+
+    return ret_val;
+}
+
+/*
+Free a model and everything it owns (feature trie, class strings, weight
+matrix and score buffer). Members that are NULL are skipped, and passing NULL
+for self is a no-op.
+*/
+void averaged_perceptron_destroy(averaged_perceptron_t *self) {
+    if (self == NULL) return;
+
+    if (self->features != NULL) {
+        trie_destroy(self->features);
+    }
+
+    if (self->classes != NULL) {
+        cstring_array_destroy(self->classes);
+    }
+
+    if (self->weights != NULL) {
+        sparse_matrix_destroy(self->weights);
+    }
+
+    if (self->scores != NULL) {
+        double_array_destroy(self->scores);
+    }
+
+    free(self);
+}
diff --git a/src/averaged_perceptron.h b/src/averaged_perceptron.h
new file mode 100644
index 00000000..1c0e244e
--- /dev/null
+++ b/src/averaged_perceptron.h
@@ -0,0 +1,67 @@
+/*
+averaged_perceptron.h
+---------------------
+
+The averaged perceptron is a simple, efficient and effective method for
+training sequence models.
+
+The averaged perceptron is a linear model, meaning the score for a given class
+is the dot product of weights and the feature values.
+
+This implementation of the averaged perceptron uses a trie data structure to
+store the mapping from features to ids, which can be quite memory efficient
+as opposed to a hash table.
+
+The weights are stored as a sparse matrix in compressed sparse row format
+(see sparse_matrix.h)
+
+See [Collins, 2002] Discriminative Training Methods for Hidden Markov Models:
+    Theory and Experiments with Perceptron Algorithms
+
+Paper: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
+*/
+#ifndef AVERAGED_PERCEPTRON_H
+#define AVERAGED_PERCEPTRON_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "collections.h"
+#include "sparse_matrix.h"
+#include "trie.h"
+
+typedef struct averaged_perceptron {
+    uint32_t num_features;
+    uint32_t num_classes;
+    /* Maps a feature string to its id, used as a row of weights. */
+    trie_t *features;
+    /* Class strings, serialized as one contiguous character buffer. */
+    cstring_array *classes;
+    /* Sparse matrix with one row per feature id; column indices are class ids. */
+    sparse_matrix_t *weights;
+    /* Reusable buffer of class scores returned by the predict_scores functions. */
+    double_array *scores;
+} averaged_perceptron_t;
+
+/* Deserialize a model from a stream or a file; both return NULL on failure. */
+averaged_perceptron_t *averaged_perceptron_read(FILE *f);
+averaged_perceptron_t *averaged_perceptron_load(char *filename);
+
+/* Return the class index chosen by double_array_argmax over the class scores. */
+uint32_t averaged_perceptron_predict(averaged_perceptron_t *self, cstring_array *features);
+uint32_t averaged_perceptron_predict_counts(averaged_perceptron_t *self, khash_t(str_uint32) *feature_counts);
+
+/* Return the per-class scores in a buffer owned by the model (do not destroy it). */
+double_array *averaged_perceptron_predict_scores(averaged_perceptron_t *self, cstring_array *features);
+double_array *averaged_perceptron_predict_scores_counts(averaged_perceptron_t *self, khash_t(str_uint32) *feature_counts);
+
+/* Serialize a model to a stream or a file; both return false on failure. */
+bool averaged_perceptron_write(averaged_perceptron_t *self, FILE *f);
+bool averaged_perceptron_save(averaged_perceptron_t *self, char *filename);
+
+/* Free a model and everything it owns; NULL is ignored. */
+void averaged_perceptron_destroy(averaged_perceptron_t *self);
+
+#endif
diff --git a/src/vector_math.h b/src/vector_math.h
index 8b2ad6a2..e650195f 100644
--- a/src/vector_math.h
+++ b/src/vector_math.h
@@ -31,6 +31,7 @@
 static inline name *name##_new_zeros(size_t n) { \
     name *vector = name##_new_size(n); \
     memset(vector->a, 0, n * sizeof(type)); \
+    vector->n = n; \
-    return name##_new_value(n, (type)0); \
+    return vector; \
 } \
 \