diff --git a/src/averaged_perceptron.c b/src/averaged_perceptron.c
index d40a517b..a66470e4 100644
--- a/src/averaged_perceptron.c
+++ b/src/averaged_perceptron.c
@@ -26,7 +26,7 @@ inline double_array *averaged_perceptron_predict_scores(averaged_perceptron_t *s
             continue;
         }
 
-        for (int col = indptr[feature_id]; col < indptr[feature_id+1]; col++) {
+        for (int col = indptr[feature_id]; col < indptr[feature_id + 1]; col++) {
             uint32_t class_id = indices[col];
             scores[class_id] += data[col];
         }
diff --git a/src/averaged_perceptron.h b/src/averaged_perceptron.h
index 6147856f..2eb7703c 100644
--- a/src/averaged_perceptron.h
+++ b/src/averaged_perceptron.h
@@ -15,11 +15,6 @@ very little memory.
 
 The weights are stored as a sparse matrix in compressed sparse row format
 (see sparse_matrix.h)
-
-Paper: [Collins, 2002] Discriminative Training Methods for Hidden Markov Models: 
-                       Theory and Experiments with Perceptron Algorithms
-
-Link: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
 */
 #ifndef AVERAGED_PERCEPTRON_H
 #define AVERAGED_PERCEPTRON_H
diff --git a/src/averaged_perceptron_trainer.c b/src/averaged_perceptron_trainer.c
new file mode 100644
index 00000000..3039156a
--- /dev/null
+++ b/src/averaged_perceptron_trainer.c
@@ -0,0 +1,337 @@
+#include "averaged_perceptron_trainer.h"
+
+/* Frees the trainer and every member it still owns; NULL members are skipped. */
+void averaged_perceptron_trainer_destroy(averaged_perceptron_trainer_t *self) {
+    if (self == NULL) return;
+
+    if (self->features != NULL) {
+        trie_destroy(self->features);
+    }
+    if (self->classes != NULL) {
+        kh_destroy(str_uint32, self->classes);
+    }
+    // The class names belong to the trainer until finalize hands them off.
+    if (self->class_strings != NULL) {
+        cstring_array_destroy(self->class_strings);
+    }
+    // Test for NULL before iterating: kh_foreach dereferences the table.
+    if (self->weights != NULL) {
+        uint32_t feature_id;
+        khash_t(class_weights) *weights;
+        kh_foreach(self->weights, feature_id, weights, {
+            kh_destroy(class_weights, weights);
+        })
+        kh_destroy(feature_class_weights, self->weights);
+    }
+    if (self->scores != NULL) {
+        double_array_destroy(self->scores);
+    }
+    free(self);
+}
+
+
+/* Resolves class_name to its id, registering it (ids are assigned in insertion
+ * order) when add_if_missing is set. Returns false if the class is unknown and
+ * not added, or if the hash insert fails. NOTE(review): the table appears to
+ * keep the class_name pointer as its key -- confirm callers keep it alive. */
+bool averaged_perceptron_trainer_get_class_id(averaged_perceptron_trainer_t *self, char *class_name, uint32_t *class_id, bool add_if_missing) {
+    khash_t(str_uint32) *classes = self->classes;
+    khiter_t k = kh_get(str_uint32, classes, class_name);
+    if (k != kh_end(classes)) {
+        *class_id = kh_value(classes, k);
+        return true;
+    }
+    if (!add_if_missing) return false;
+    uint32_t new_id = kh_size(classes);
+    int ret;
+    k = kh_put(str_uint32, classes, class_name, &ret);
+    if (ret < 0) return false;  // insert failed, so k is not a usable slot
+    kh_value(classes, k) = new_id;
+    *class_id = new_id;
+    cstring_array_add_string(self->class_strings, class_name);
+    self->num_classes++;
+    return true;
+}
+
+/* Resolves a feature string to its id in the trainer's trie. An unseen feature
+ * gets the trie's current key count as its id when add_if_missing is set.
+ * Returns false if the feature is absent and not added, or if trie_add fails. */
+bool averaged_perceptron_trainer_get_feature_id(averaged_perceptron_trainer_t *self, char *feature, uint32_t *feature_id, bool add_if_missing) {
+    trie_t *trie = self->features;
+
+    if (trie_get_data(trie, feature, feature_id)) {
+        return true;
+    }
+    if (!add_if_missing) {
+        return false;
+    }
+
+    uint32_t next_id = trie->num_keys;
+    *feature_id = next_id;
+    bool added = trie_add(trie, feature, next_id);
+    if (added) self->num_features++;
+    return added;
+}
+
+/* Builds the final model: one sparse row per feature id holding each class
+ * weight averaged over all updates. On success the trainer is destroyed and
+ * its feature trie and class strings pass to the returned model. On failure
+ * NULL is returned and the trainer is left intact for the caller. */
+averaged_perceptron_t *averaged_perceptron_trainer_finalize(averaged_perceptron_trainer_t *self) {
+    if (self == NULL || self->num_classes == 0) return NULL;
+
+    sparse_matrix_t *averaged_weights = sparse_matrix_new();
+    if (averaged_weights == NULL) return NULL;
+    uint64_t updates = self->num_updates;
+    for (uint32_t feature_id = 0; feature_id < self->num_features; feature_id++) {
+        khiter_t k = kh_get(feature_class_weights, self->weights, feature_id);
+        // Rows are indexed by feature id, so a missing table cannot be skipped.
+        if (k == kh_end(self->weights)) {
+            sparse_matrix_destroy(averaged_weights);
+            return NULL;
+        }
+        khash_t(class_weights) *weights = kh_value(self->weights, k);
+        uint32_t class_id;
+        class_weight_t weight;
+        kh_foreach(weights, class_id, weight, {
+            // Credit the current value for the updates since it last changed.
+            weight.total += (updates - weight.last_updated) * weight.value;
+            double value = weight.total / updates;
+            sparse_matrix_append(averaged_weights, class_id, value);
+        })
+        sparse_matrix_finalize_row(averaged_weights);
+    }
+
+    // Allocate everything that can fail before giving up ownership of anything.
+    averaged_perceptron_t *perceptron = malloc(sizeof(averaged_perceptron_t));
+    double_array *scores = double_array_new_zeros(self->num_classes);
+    if (perceptron == NULL || scores == NULL) {
+        if (scores != NULL) double_array_destroy(scores);
+        free(perceptron);
+        sparse_matrix_destroy(averaged_weights);
+        return NULL;
+    }
+
+    perceptron->weights = averaged_weights;
+    perceptron->num_features = self->num_features;
+    perceptron->num_classes = self->num_classes;
+    perceptron->scores = scores;
+    // Hand over the class strings and trie; clear ours so destroy skips them.
+    perceptron->classes = self->class_strings;
+    self->class_strings = NULL;
+    perceptron->features = self->features;
+    self->features = NULL;
+    averaged_perceptron_trainer_destroy(self);
+    return perceptron;
+}
+
+/* Returns the class-weight table for feature_id, creating an empty one if
+ * add_if_missing is set; NULL when it is absent or cannot be created. */
+khash_t(class_weights) *averaged_perceptron_trainer_get_class_weights(averaged_perceptron_trainer_t *self, uint32_t feature_id, bool add_if_missing) {
+    khiter_t k = kh_get(feature_class_weights, self->weights, feature_id);
+    if (k != kh_end(self->weights)) {
+        return kh_value(self->weights, k);
+    }
+    if (!add_if_missing) return NULL;
+    khash_t(class_weights) *weights = kh_init(class_weights);
+    if (weights == NULL) return NULL;  // never store a NULL table in the map
+    int ret;
+    k = kh_put(feature_class_weights, self->weights, feature_id, &ret);
+    if (ret < 0) {
+        kh_destroy(class_weights, weights);
+        return NULL;
+    }
+    kh_value(self->weights, k) = weights;
+    return weights;
+}
+
+
+/* Adds `value` to the weight kept for class_id and brings its running total
+ * up to date as of update number `iter`; a class without an entry starts from
+ * NULL_WEIGHT. Returns false if the hash insert fails. */
+static inline bool averaged_perceptron_trainer_update_weight(khash_t(class_weights) *weights, uint64_t iter, uint32_t class_id, double value) {
+    // One kh_put both finds an existing entry and creates a missing one.
+    int ret;
+    khiter_t k = kh_put(class_weights, weights, class_id, &ret);
+    if (ret < 0) return false;
+
+    // ret > 0: the key was just inserted and its slot holds no value yet.
+    class_weight_t weight;
+    if (ret > 0) {
+        weight = NULL_WEIGHT;
+    } else {
+        weight = kh_value(weights, k);
+    }
+
+    // Credit the old value for every update it stayed unchanged, then move it.
+    weight.total += (iter - weight.last_updated) * weight.value;
+    weight.last_updated = iter;
+    weight.value += value;
+    kh_value(weights, k) = weight;
+    return true;
+}
+
+/* Applies one perceptron correction to a single feature: the weight of the
+ * wrongly guessed class drops by `value` and the true class rises by `value`.
+ * The feature's weight table is created if it does not exist yet.
+ * Returns false if the table or either weight entry cannot be updated. */
+static inline bool averaged_perceptron_trainer_update_feature(averaged_perceptron_trainer_t *self, uint32_t feature_id, uint32_t guess, uint32_t truth, double value) {
+    khash_t(class_weights) *class_table = averaged_perceptron_trainer_get_class_weights(self, feature_id, true);
+    if (class_table == NULL) {
+        return false;
+    }
+
+    uint64_t iter = self->num_updates;
+
+    // Penalize the guess first; the reward is skipped if the penalty fails.
+    if (!averaged_perceptron_trainer_update_weight(class_table, iter, guess, -1.0 * value)) {
+        return false;
+    }
+    return averaged_perceptron_trainer_update_weight(class_table, iter, truth, value);
+}
+
+/* Scores every class for the given features using the current (un-averaged)
+ * weight values and returns the index picked by double_array_argmax.
+ * Features that are unknown or have no weight table contribute nothing.
+ * The trainer's score buffer covers at least num_classes entries and is zeroed. */
+uint32_t averaged_perceptron_trainer_predict(averaged_perceptron_trainer_t *self, cstring_array *features) {
+    double_array *scores = self->scores;
+    size_t class_count = (size_t)self->num_classes;
+
+    // Ensure the buffer covers every registered class, then reset it.
+    if (scores->m < class_count) {
+        double_array_resize(scores, class_count);
+    }
+
+    if (scores->n < class_count) {
+        scores->n = class_count;
+    }
+
+    double_array_set(scores->a, scores->n, 0.0);
+
+    uint32_t idx = 0;
+    char *name = NULL;
+    uint32_t feature_id;
+    uint32_t class_id;
+    class_weight_t weight;
+
+    cstring_array_foreach(features, idx, name, {
+        khash_t(class_weights) *class_table = NULL;
+
+        // Lookups only (add_if_missing is false): predicting never grows the model.
+        if (averaged_perceptron_trainer_get_feature_id(self, name, &feature_id, false)) {
+            class_table = averaged_perceptron_trainer_get_class_weights(self, feature_id, false);
+        }
+
+        if (class_table != NULL) {
+            kh_foreach(class_table, class_id, weight, {
+                scores->a[class_id] += weight.value;
+            })
+        }
+    })
+
+    int64_t best = double_array_argmax(scores->a, scores->n);
+    return (uint32_t)best;
+}
+
+/* Corrects the weights after a wrong prediction: every feature of the example
+ * moves one unit away from `guess` and towards `truth`, and unseen features
+ * are registered. num_updates advances only after all features succeed; the
+ * first failure returns false immediately. */
+bool averaged_perceptron_trainer_update(averaged_perceptron_trainer_t *self, uint32_t guess, uint32_t truth, cstring_array *features) {
+    uint32_t idx = 0;
+    char *name = NULL;
+    uint32_t feature_id;
+
+    cstring_array_foreach(features, idx, name, {
+        bool updated = averaged_perceptron_trainer_get_feature_id(self, name, &feature_id, true)
+                    && averaged_perceptron_trainer_update_feature(self, feature_id, guess, truth, 1.0);
+        if (!updated) {
+            return false;
+        }
+    })
+
+    self->num_updates++;
+    return true;
+}
+
+/* Corrects the weights after a wrong prediction using a {feature string =>
+ * count} map: each feature moves by its count away from `guess` and towards
+ * `truth`. num_updates advances only after all features succeed; the first
+ * failure returns false immediately. */
+bool averaged_perceptron_trainer_update_counts(averaged_perceptron_trainer_t *self, uint32_t guess, uint32_t truth, khash_t(str_uint32) *feature_counts) {
+    const char *name;
+    uint32_t count;
+    uint32_t feature_id;
+
+    kh_foreach(feature_counts, name, count, {
+        bool updated = averaged_perceptron_trainer_get_feature_id(self, (char *)name, &feature_id, true)
+                    && averaged_perceptron_trainer_update_feature(self, feature_id, guess, truth, (double)count);
+        if (!updated) {
+            return false;
+        }
+    })
+
+    self->num_updates++;
+    return true;
+}
+
+/* Trains on one labeled example: registers the label if it is new, predicts
+ * with the current weights, and adjusts the weights only on a wrong guess.
+ * Returns false if the label cannot be resolved or the weight update fails. */
+bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, cstring_array *features, char *label) {
+    uint32_t truth;
+    if (!averaged_perceptron_trainer_get_class_id(self, label, &truth, true)) {
+        return false;
+    }
+
+    uint32_t guess = averaged_perceptron_trainer_predict(self, features);
+    if (guess == truth) {
+        // Error-driven learning: a correct prediction leaves the weights alone.
+        return true;
+    }
+
+    // Wrong answer: count the error and shift the weights towards the truth.
+    self->num_errors++;
+    return averaged_perceptron_trainer_update(self, guess, truth, features);
+}
+
+/* Allocates an empty trainer (no features, classes, updates or errors).
+ * Returns NULL, with nothing leaked, if any allocation fails. */
+averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) {
+    averaged_perceptron_trainer_t *self = malloc(sizeof(averaged_perceptron_trainer_t));
+    if (self == NULL) return NULL;
+    self->num_features = 0;
+    self->num_classes = 0;
+    self->num_updates = 0;
+    self->num_errors = 0;
+
+    // malloc leaves these indeterminate; NULL them so cleanup never sees garbage.
+    self->features = NULL;
+    self->classes = NULL;
+    self->class_strings = NULL;
+    self->scores = NULL;
+
+    // Created first and released with a plain free() on failure, so this path
+    // does not depend on destroy() coping with a NULL weight table.
+    self->weights = kh_init(feature_class_weights);
+    if (self->weights == NULL) {
+        free(self);
+        return NULL;
+    }
+
+    self->features = trie_new();
+    if (self->features == NULL) goto exit_trainer_created;
+    self->classes = kh_init(str_uint32);
+    if (self->classes == NULL) goto exit_trainer_created;
+    self->scores = double_array_new();
+    if (self->scores == NULL) goto exit_trainer_created;
+    // Allocated last, so no failure path is left holding class strings.
+    self->class_strings = cstring_array_new();
+    if (self->class_strings == NULL) goto exit_trainer_created;
+
+    return self;
+exit_trainer_created:
+    averaged_perceptron_trainer_destroy(self);
+    return NULL;
+}
diff --git a/src/averaged_perceptron_trainer.h b/src/averaged_perceptron_trainer.h
new file mode 100644
index 00000000..b1ed3a1a
--- /dev/null
+++ b/src/averaged_perceptron_trainer.h
@@ -0,0 +1,75 @@
+/*
+averaged_perceptron_trainer.h
+-----------------------------
+
+Trainer for a generic averaged perceptron model.
+
+The averaged perceptron uses a simple online error-driven
+learning algorithm. Given some features and the true label,
+it predicts the expected label under the current weights. If
+it guesses correctly, there's nothing to do and it moves
+on to the next example. If it predicted the wrong answer, it
+makes the following updates to its weights:
+
+weights[feature][predicted] -= 1.0
+weights[feature][actual] += 1.0
+
+This seems overly simplistic, and it is. This is the regular
+perceptron update rule. On the more difficult cases, this model
+would tend to overfit by spending a lot of time fiddling with the
+weights for the few cases it got wrong and building the whole model
+around those few cases. The averaged perceptron is one way to account
+for this and build a more robust model: the finalized model uses each weight's
+value averaged over all updates rather than its final value.
+
+Paper: [Collins, 2002] Discriminative Training Methods for Hidden Markov Models: 
+                       Theory and Experiments with Perceptron Algorithms
+
+Link: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
+*/
+#ifndef AVERAGED_PERCEPTRON_TRAINER_H
+#define AVERAGED_PERCEPTRON_TRAINER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "averaged_perceptron.h"
+#include "collections.h"
+#include "string_utils.h"
+#include "trie.h"
+
+typedef struct class_weight {
+    double value;           // current (un-averaged) weight
+    double total;           // running sum of value over updates, brought current lazily
+    uint64_t last_updated;  // update count at which total was last brought current
+} class_weight_t;
+
+#define NULL_WEIGHT (class_weight_t){0.0, 0.0, 0}  // all-zero weight for a class with no entry yet
+// Hash map: class_id => class_weight_t
+KHASH_MAP_INIT_INT(class_weights, class_weight_t)
+// Hash map: feature_id => that feature's class_weights map
+KHASH_MAP_INIT_INT(feature_class_weights, khash_t(class_weights) *) 
+
+typedef struct averaged_perceptron_trainer {
+    uint32_t num_features;  // features registered so far
+    uint32_t num_classes;   // classes registered so far
+    uint64_t num_updates;   // completed weight updates
+    uint64_t num_errors;    // wrong predictions counted by train_example
+    trie_t *features;       // feature string => feature_id
+    khash_t(str_uint32) *classes;  // class name => class_id
+    cstring_array *class_strings;  // class names, appended in class_id order
+    // {feature_id => {class_id => class_weight_t}}
+    khash_t(feature_class_weights) *weights;
+    double_array *scores;   // per-class score buffer reused by predict
+} averaged_perceptron_trainer_t;
+
+averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void);
+// Prediction under the current weights, and one error-driven training step
+uint32_t averaged_perceptron_trainer_predict(averaged_perceptron_trainer_t *self, cstring_array *features);
+bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *trainer, cstring_array *features, char *label);
+// Builds the averaged model; on success the trainer itself is destroyed
+averaged_perceptron_t *averaged_perceptron_trainer_finalize(averaged_perceptron_trainer_t *self);
+// Frees the trainer; passing NULL is a no-op
+void averaged_perceptron_trainer_destroy(averaged_perceptron_trainer_t *self);
+
+#endif