diff --git a/src/logistic_regression_trainer.c b/src/logistic_regression_trainer.c
index df67afc1..dc0843bf 100644
--- a/src/logistic_regression_trainer.c
+++ b/src/logistic_regression_trainer.c
@@ -1,4 +1,5 @@
 #include "logistic_regression_trainer.h"
+#include "sparse_matrix_utils.h"
 
 void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) {
     if (self == NULL) return;
@@ -15,6 +16,22 @@ void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) {
         matrix_destroy(self->weights);
     }
 
+    if (self->last_updated != NULL) {
+        uint32_array_destroy(self->last_updated);
+    }
+
+    if (self->unique_columns != NULL) {
+        kh_destroy(int_set, self->unique_columns);
+    }
+    
+    if (self->batch_columns != NULL) {
+        uint32_array_destroy(self->batch_columns);
+    }
+
+    if (self->gradient != NULL) {
+        matrix_destroy(self->gradient);
+    }
+
     free(self);
 }
 
@@ -33,6 +50,13 @@ logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_
 
     trainer->weights = matrix_new_zeros(trainer->num_features, trainer->num_labels);
 
+    trainer->gradient = matrix_new_zeros(trainer->num_features, trainer->num_labels);
+
+    trainer->unique_columns = kh_init(int_set);
+    trainer->batch_columns = uint32_array_new_size(trainer->num_features);
+
+    trainer->last_updated = uint32_array_new_zeros(trainer->num_features);
+
     trainer->lambda = DEFAULT_LAMBDA;
     trainer->iters = 0;
     trainer->epochs = 0;
@@ -75,12 +99,12 @@ double logistic_regression_trainer_batch_cost(logistic_regression_trainer_t *sel
     return cost;    
 }
 
-
 bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
     size_t m = self->weights->m;
     size_t n = self->weights->n;
 
-    matrix_t *gradient = matrix_new_zeros(m, n);
+    // Reuse the trainer's preallocated gradient matrix instead of allocating one per batch
+    matrix_t *gradient = self->gradient;
 
     sparse_matrix_t *x = feature_matrix(self->feature_ids, features);
     uint32_array *y = label_vector(self->label_ids, labels);
@@ -89,21 +113,40 @@ bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self
 
     bool ret = false;
 
-    if (!logistic_regression_gradient(self->weights, gradient, x, y, p_y, self->lambda)) {
+    if (!sparse_matrix_add_unique_columns(x, self->unique_columns, self->batch_columns)) {
+        log_error("Unique columns failed\n");
+        goto exit_matrices_created;
+    }
+
+    if (self->lambda > 0.0 && !stochastic_gradient_descent_sparse_regularize_weights(self->weights, self->batch_columns, self->last_updated, self->iters, self->lambda)) {
+        log_error("Error regularizing weights\n");
+        goto exit_matrices_created;
+    }
+
+    if (!logistic_regression_gradient_sparse(self->weights, gradient, x, y, p_y, self->batch_columns, self->lambda)) {
         log_error("Gradient failed\n");
         goto exit_matrices_created;
     }
 
     size_t data_len = m * n;
 
-    ret = stochastic_gradient_descent(self->weights, gradient, self->gamma);
+    ret = stochastic_gradient_descent_sparse(self->weights, gradient, self->batch_columns, self->gamma);
 
     self->iters++;
 
 exit_matrices_created:
-    matrix_destroy(gradient);
     matrix_destroy(p_y);
     uint32_array_destroy(y);
     sparse_matrix_destroy(x);
     return ret;
 }
+
+// Finish training: returns false for a NULL trainer, true (no-op) when lambda is not positive,
+// else the result of the sparse SGD weight finalization given last_updated and iters.
+bool logistic_regression_trainer_finalize(logistic_regression_trainer_t *self) {
+    if (self == NULL) return false;
+
+    bool regularized = self->lambda > 0.0;
+    if (!regularized) return true;
+    return stochastic_gradient_descent_sparse_finalize_weights(self->weights, self->last_updated, self->iters, self->lambda);
+}
diff --git a/src/logistic_regression_trainer.h b/src/logistic_regression_trainer.h
index f00dc12e..a552ac41 100644
--- a/src/logistic_regression_trainer.h
+++ b/src/logistic_regression_trainer.h
@@ -4,6 +4,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <math.h>
 
 #include "averaged_perceptron_tagger.h"
 #include "collections.h"
@@ -34,6 +35,9 @@ typedef struct logistic_regression_trainer {
     khash_t(str_uint32) *label_ids;     // Hashtable mapping labels to array indices
     size_t num_labels;                  // Number of labels
     matrix_t *weights;                  // Matrix of logistic regression weights
+    matrix_t *gradient;                 // Gradient matrix to be reused
+    khash_t(int_set) *unique_columns;   // Unique columns set
+    uint32_array *batch_columns;        // Unique columns as array
     uint32_array *last_updated;         // Array of length N indicating the last time each feature was updated
     double lambda;                      // Regularization weight
     uint32_t iters;                     // Number of iterations, used to decay learning rate
@@ -47,6 +51,7 @@ logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_
 
 bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
 double logistic_regression_trainer_batch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
+bool logistic_regression_trainer_finalize(logistic_regression_trainer_t *self);
 
 void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self);