[parsing] Averaged perceptron model data structure for storing the finalized, averaged, sparse weights

2015-09-08 12:02:15 -07:00
parent 8d642b45b9
commit c80d8b8067
3 changed files with 281 additions and 0 deletions
--- a/src/averaged_perceptron.c
+++ b/src/averaged_perceptron.c
@@ -0,0 +1,219 @@
+#include "averaged_perceptron.h"
+
+/* Magic number stored as the first uint32 of a serialized model:
+   averaged_perceptron_write emits it and averaged_perceptron_read rejects
+   any stream that does not start with it. */
+#define PERCEPTRON_SIGNATURE 0xCBCBCBCB
+
+/*
+Look up a feature string in the model's feature trie.
+
+Forwards to trie_get_data with feature_id as the output argument and returns
+its result; callers treat a true result as "feature_id is now valid".
+*/
+static inline bool averaged_perceptron_get_feature_id(averaged_perceptron_t *self, char *feature, uint32_t *feature_id) {
+    trie_t *feature_trie = self->features;
+    bool found = trie_get_data(feature_trie, feature, feature_id);
+    return found;
+}
+
+/*
+Compute the score of every class for a list of feature strings.
+
+Each feature found in the model's trie selects one row of the sparse weight
+matrix; every stored entry of that row is added to the score of the class
+named by its column index. Features the trie does not know are skipped.
+
+Returns self->scores, a buffer owned by the model that is zeroed and refilled
+on every call, so the result is only valid until the next prediction.
+Returns NULL if the score buffer had to be (re)created and that allocation
+failed.
+*/
+inline double_array *averaged_perceptron_predict_scores(averaged_perceptron_t *self, cstring_array *features) {
+    if (self->scores == NULL || self->scores->n == 0) {
+        /* Release a stale zero-length buffer instead of leaking it */
+        if (self->scores != NULL) {
+            double_array_destroy(self->scores);
+        }
+
+        self->scores = double_array_new_zeros((size_t)self->num_classes);
+        if (self->scores == NULL) return NULL;
+    }
+
+    double_array_set(self->scores->a, self->scores->n, 0.0);
+
+    double *scores = self->scores->a;
+
+    uint32_t i = 0;
+    char *feature;
+    uint32_t feature_id;
+
+    uint32_t *indptr = self->weights->indptr->a;
+    uint32_t *indices = self->weights->indices->a;
+    double *data = self->weights->data->a;
+
+    cstring_array_foreach(features, i, feature, {
+        if (!averaged_perceptron_get_feature_id(self, feature, &feature_id)) {
+            continue;
+        }
+
+        /* indptr holds uint32_t offsets, so the cursor is unsigned as well
+           (a signed int could not represent offsets above INT_MAX) */
+        for (uint32_t col = indptr[feature_id]; col < indptr[feature_id + 1]; col++) {
+            uint32_t class_id = indices[col];
+            scores[class_id] += data[col];
+        }
+    })
+
+    return self->scores;
+}
+
+/*
+Compute the score of every class for a map from feature string to count.
+
+Works like averaged_perceptron_predict_scores, except that each weight is
+multiplied by the count stored for its feature before being added to the
+class score. Features the trie does not know are skipped.
+
+Returns self->scores, a buffer owned by the model that is zeroed and refilled
+on every call, so the result is only valid until the next prediction.
+Returns NULL if the score buffer had to be (re)created and that allocation
+failed.
+*/
+inline double_array *averaged_perceptron_predict_scores_counts(averaged_perceptron_t *self, khash_t(str_uint32) *feature_counts) {
+    if (self->scores == NULL || self->scores->n == 0) {
+        /* Release a stale zero-length buffer instead of leaking it */
+        if (self->scores != NULL) {
+            double_array_destroy(self->scores);
+        }
+
+        self->scores = double_array_new_zeros((size_t)self->num_classes);
+        if (self->scores == NULL) return NULL;
+    }
+
+    double_array_set(self->scores->a, self->scores->n, 0.0);
+
+    double *scores = self->scores->a;
+
+    const char *feature;
+    uint32_t count;
+    uint32_t feature_id;
+
+    uint32_t *indptr = self->weights->indptr->a;
+    uint32_t *indices = self->weights->indices->a;
+    double *data = self->weights->data->a;
+
+    kh_foreach(feature_counts, feature, count, {
+        /* The lookup helper takes a non-const char *, hence the cast */
+        if (!averaged_perceptron_get_feature_id(self, (char *)feature, &feature_id)) {
+            continue;
+        }
+
+        /* indptr holds uint32_t offsets, so the cursor is unsigned as well
+           (a signed int could not represent offsets above INT_MAX) */
+        for (uint32_t col = indptr[feature_id]; col < indptr[feature_id + 1]; col++) {
+            uint32_t class_id = indices[col];
+            scores[class_id] += data[col] * (double)count;
+        }
+    })
+
+    return self->scores;
+}
+
+
+/*
+Predict one class for a list of feature strings.
+
+Scores every class with averaged_perceptron_predict_scores, then returns the
+index reported by double_array_argmax over those scores, cast to uint32_t.
+*/
+inline uint32_t averaged_perceptron_predict(averaged_perceptron_t *self, cstring_array *features) {
+    double_array *class_scores = averaged_perceptron_predict_scores(self, features);
+    int64_t best_class = double_array_argmax(class_scores->a, class_scores->n);
+    return (uint32_t)best_class;
+}
+
+/*
+Predict one class for a map from feature string to count.
+
+Scores every class with averaged_perceptron_predict_scores_counts, then
+returns the index reported by double_array_argmax over those scores, cast to
+uint32_t.
+*/
+inline uint32_t averaged_perceptron_predict_counts(averaged_perceptron_t *self, khash_t(str_uint32) *feature_counts) {
+    double_array *class_scores = averaged_perceptron_predict_scores_counts(self, feature_counts);
+    int64_t best_class = double_array_argmax(class_scores->a, class_scores->n);
+    return (uint32_t)best_class;
+}
+
+/*
+Deserialize a model from an open binary stream.
+
+Read order: uint32 signature, uint32 num_features, uint32 num_classes, the
+sparse weight matrix, a uint64 byte length followed by that many bytes of
+class-name string data, and finally the feature trie.
+
+Returns a newly allocated model, to be released with
+averaged_perceptron_destroy, or NULL if f is NULL, the signature does not
+match, num_classes is 0, the model struct cannot be allocated or any of the
+reads fails. The stream itself is never closed here.
+*/
+averaged_perceptron_t *averaged_perceptron_read(FILE *f) {
+    if (f == NULL) return NULL;
+
+    uint32_t signature;
+
+    if (!file_read_uint32(f, &signature) || signature != PERCEPTRON_SIGNATURE) {
+        return NULL;
+    }
+
+    averaged_perceptron_t *perceptron = malloc(sizeof(*perceptron));
+    if (perceptron == NULL) return NULL;
+
+    /* Every error path below ends in averaged_perceptron_destroy, which tests
+       and frees all four pointers, so none may be left indeterminate */
+    perceptron->features = NULL;
+    perceptron->classes = NULL;
+    perceptron->weights = NULL;
+    perceptron->scores = NULL;
+
+    if (!file_read_uint32(f, &perceptron->num_features) ||
+        !file_read_uint32(f, &perceptron->num_classes) ||
+        perceptron->num_classes == 0) {
+        /* Go through the cleanup label so the struct is not leaked */
+        goto exit_perceptron_created;
+    }
+
+    perceptron->weights = sparse_matrix_read(f);
+    if (perceptron->weights == NULL) {
+        goto exit_perceptron_created;
+    }
+
+    /* Not checked on purpose: the predict functions re-create the buffer
+       when they find it NULL */
+    perceptron->scores = double_array_new_zeros((size_t)perceptron->num_classes);
+
+    uint64_t classes_str_len;
+
+    if (!file_read_uint64(f, &classes_str_len)) {
+        goto exit_perceptron_created;
+    }
+
+    char_array *array = char_array_new_size(classes_str_len);
+
+    if (array == NULL) {
+        goto exit_perceptron_created;
+    }
+
+    if (!file_read_chars(f, array->a, classes_str_len)) {
+        char_array_destroy(array);
+        goto exit_perceptron_created;
+    }
+
+    array->n = classes_str_len;
+
+    perceptron->classes = cstring_array_from_char_array(array);
+    if (perceptron->classes == NULL) {
+        /* NOTE(review): if cstring_array_from_char_array does not release its
+           argument on failure, array leaks here - confirm before freeing it */
+        goto exit_perceptron_created;
+    }
+
+    perceptron->features = trie_read(f);
+
+    if (perceptron->features == NULL) {
+        goto exit_perceptron_created;
+    }
+
+    return perceptron;
+
+exit_perceptron_created:
+    averaged_perceptron_destroy(perceptron);
+    return NULL;
+}
+
+/*
+Load a model from the file at filename.
+
+Opens the file in binary read mode, hands the stream to
+averaged_perceptron_read and closes it again. Returns NULL if filename is
+NULL, the file cannot be opened or the read fails.
+*/
+averaged_perceptron_t *averaged_perceptron_load(char *filename) {
+    averaged_perceptron_t *model = NULL;
+    FILE *in = (filename != NULL) ? fopen(filename, "rb") : NULL;
+
+    if (in != NULL) {
+        model = averaged_perceptron_read(in);
+        fclose(in);
+    }
+
+    return model;
+}
+
+/*
+Serialize a model to an open binary stream.
+
+Write order: uint32 signature, uint32 num_features, uint32 num_classes, the
+sparse weight matrix, a uint64 byte length followed by that many bytes of
+class-name string data, and finally the feature trie.
+
+Returns false if self or f is NULL, if the model has no weights, classes or
+features, or as soon as one of the writes fails; returns true otherwise.
+*/
+bool averaged_perceptron_write(averaged_perceptron_t *self, FILE *f) {
+    if (self == NULL || f == NULL) return false;
+    if (self->weights == NULL || self->classes == NULL || self->features == NULL) return false;
+
+    bool header_written = file_write_uint32(f, PERCEPTRON_SIGNATURE) &&
+                          file_write_uint32(f, self->num_features) &&
+                          file_write_uint32(f, self->num_classes);
+    if (!header_written) return false;
+
+    if (!sparse_matrix_write(self->weights, f)) return false;
+
+    /* The class names are stored as a length-prefixed character buffer */
+    uint64_t num_class_bytes = (uint64_t) cstring_array_used(self->classes);
+    if (!file_write_uint64(f, num_class_bytes)) return false;
+    if (!file_write_chars(f, self->classes->str->a, num_class_bytes)) return false;
+
+    if (!trie_write(self->features, f)) return false;
+
+    return true;
+}
+
+/*
+Save a model to the file at filename, creating or truncating it.
+
+Returns false if self or filename is NULL, the file cannot be opened for
+binary writing, averaged_perceptron_write fails, or closing the file fails.
+Returns true only when the model was written and the file closed cleanly.
+*/
+bool averaged_perceptron_save(averaged_perceptron_t *self, char *filename) {
+    if (self == NULL || filename == NULL) return false;
+    FILE *f = fopen(filename, "wb");
+    if (f == NULL) return false;
+    bool ret_val = averaged_perceptron_write(self, f);
+    /* fclose flushes buffered output, so a failure here means the model may
+       not have reached the disk even though every write call succeeded */
+    if (fclose(f) != 0) {
+        ret_val = false;
+    }
+    return ret_val;
+}
+
+
+/*
+Release a model together with its feature trie, class names, weight matrix
+and score buffer. Members that are NULL are skipped, and a NULL model is a
+no-op.
+*/
+void averaged_perceptron_destroy(averaged_perceptron_t *self) {
+    if (!self) return;
+
+    if (self->features) trie_destroy(self->features);
+    if (self->classes) cstring_array_destroy(self->classes);
+    if (self->weights) sparse_matrix_destroy(self->weights);
+    if (self->scores) double_array_destroy(self->scores);
+
+    free(self);
+}
--- a/src/averaged_perceptron.h
+++ b/src/averaged_perceptron.h
@@ -0,0 +1,61 @@
+/*
+averaged_perceptron.h
+---------------------
+
+The averaged perceptron is a simple, efficient and effective method for
+training sequence models.
+
+The averaged perceptron is a linear model, meaning the score for a given class
+is the dot product of weights and the feature values.
+
+This implementation of the averaged perceptron uses a trie data structure to
+store the mapping from features to ids, which can be quite memory efficient
+compared to a hash table.
+
+The weights are stored as a sparse matrix in compressed sparse row format
+(see sparse_matrix.h)
+
+See [Collins, 2002] Discriminative Training Methods for Hidden Markov Models: 
+                    Theory and Experiments with Perceptron Algorithms
+
+Paper: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
+*/
+#ifndef AVERAGED_PERCEPTRON_H
+#define AVERAGED_PERCEPTRON_H
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "collections.h"
+#include "sparse_matrix.h"
+#include "trie.h"
+
+/* A finalized averaged perceptron model, as read from or written to disk */
+typedef struct averaged_perceptron {
+    /* Feature count, stored in the serialized header */
+    uint32_t num_features;
+    /* Number of classes; the scores buffer is created with this length */
+    uint32_t num_classes;
+    /* Maps feature strings to feature ids */
+    trie_t *features;
+    /* Class-name strings, serialized as one length-prefixed character buffer */
+    cstring_array *classes;
+    /* Sparse weights in CSR form: rows are indexed by feature id and the
+       column indices of the stored entries are class ids */
+    sparse_matrix_t *weights;
+    /* Per-class score buffer, zeroed and reused by the predict functions */
+    double_array *scores;
+} averaged_perceptron_t;
+
+/* Deserialization, from an open stream or from a file path. Both return a
+   new model to be released with averaged_perceptron_destroy, or NULL on
+   failure. */
+averaged_perceptron_t *averaged_perceptron_read(FILE *f);
+averaged_perceptron_t *averaged_perceptron_load(char *filename);
+
+/* Prediction: the class index picked by double_array_argmax over the class
+   scores, given either a list of feature strings or a map from feature
+   string to count. */
+uint32_t averaged_perceptron_predict(averaged_perceptron_t *self, cstring_array *features);
+uint32_t averaged_perceptron_predict_counts(averaged_perceptron_t *self, khash_t(str_uint32) *feature_counts);
+
+/* Same inputs as the two functions above, but return the per-class score
+   buffer held in self->scores. That buffer belongs to the model and is
+   overwritten by every prediction call. */
+double_array *averaged_perceptron_predict_scores(averaged_perceptron_t *self, cstring_array *features);
+double_array *averaged_perceptron_predict_scores_counts(averaged_perceptron_t *self, khash_t(str_uint32) *feature_counts);
+
+/* Serialization, to an open stream or to a file path. Both return true on
+   success and false on failure. */
+bool averaged_perceptron_write(averaged_perceptron_t *self, FILE *f);
+bool averaged_perceptron_save(averaged_perceptron_t *self, char *filename);
+
+/* Free the model and the members it holds. Passing NULL is allowed. */
+void averaged_perceptron_destroy(averaged_perceptron_t *self);
+
+
+#endif
--- a/src/vector_math.h
+++ b/src/vector_math.h
@@ -31,6 +31,7 @@
    static inline name *name##_new_zeros(size_t n) {                           \
        name *vector = name##_new_size(n);                                     \
        memset(vector->a, 0, n * sizeof(type));                                \
+        vector->n = n;                                                         \
        return name##_new_value(n, (type)0);                                   \
    }                                                                          \
                                                                               \