diff --git a/src/logistic_regression.c b/src/logistic_regression.c
index 9bebfef9..ef970a73 100644
--- a/src/logistic_regression.c
+++ b/src/logistic_regression.c
@@ -138,7 +138,7 @@ static bool logistic_regression_gradient_params(matrix_t *theta, matrix_t *gradi
     }
 
-    // If the vector last_updated was provided, update the only the relevant columns in x
+    // Update only the relevant columns in x
     if (regularize && x_cols != NULL) {
         size_t batch_rows = x_cols->n;
         uint32_t *cols = x_cols->a;
diff --git a/src/stochastic_gradient_descent.c b/src/stochastic_gradient_descent.c
index e56057b0..05785402 100644
--- a/src/stochastic_gradient_descent.c
+++ b/src/stochastic_gradient_descent.c
@@ -8,18 +8,127 @@ bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gam
     size_t m = gradient->m;
     size_t n = gradient->n;
 
-    for (size_t i = 0; i < m; i++) {
+    return matrix_sub_matrix_times_scalar(theta, gradient, gamma);
+}
+
+bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma) {
+    if (gradient->m != theta->m || gradient->n != theta->n) {
+        return false;
+    }
+
+    size_t m = gradient->m;
+    size_t n = gradient->n;
+
+    double *gradient_values = gradient->values;
+    double *theta_values = theta->values;
+
+    uint32_t *indices = update_indices->a;
+    size_t num_updated = update_indices->n;
+
+    for (size_t i = 0; i < num_updated; i++) {
+        uint32_t row = indices[i];
         for (size_t j = 0; j < n; j++) {
-            double grad_ij = matrix_get(gradient, i, j);
-            matrix_sub_scalar(theta, i, j, gamma * grad_ij);
+            size_t idx = row * n + j;
+            double value = gradient_values[idx];
+            theta_values[idx] -= gamma * value;
         }
     }
 
     return true;
 }
 
-inline bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, float lambda, uint32_t t, double gamma_0) {
-    double gamma = gamma_0 / (1.0 + lambda * gamma_0 * (double)t);
+
+/*
+Sparse regularization
+---------------------
+
+Stochastic/minibatch gradients can be decomposed into two updates:
+
+1. The derivative of the loss function itself (zero for features not observed in the current batch)
+2. The derivative of the regularization term (applies to all weights)
+
+Reference: http://leon.bottou.org/publications/pdf/tricks-2012.pdf
+
+Here we take sparsity a step further and do "lazy" or "just-in-time" regularization.
+
+Updating all of the weights on every iteration costs m * n operations per minibatch,
+regardless of how many parameters are actually active in that minibatch.
+
+However, the "correct" value of a given parameter theta_ij is only really needed in two places:
+
+1. Before computing the gradient, since the current value of theta is used in that computation
+2. When we're done training the model and want to save/persist it
+
+In L2 regularization, the derivative of the regularization term is simply:
+
+lambda * theta
+
+Since theta shrinks in proportion to its own value, we can collapse multiple timesteps
+into a single update:
+
+theta_i *= e^(-lambda * t)
+
+where t is the number of timesteps since theta_i was last updated. This requires storing
+a vector of "last updated" timestamps, one per row of theta, as well as the set of columns
+of x (equivalently, rows of theta) used by the minibatch (this implementation assumes that
+set is computed elsewhere and passed in).
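+
+For example, with an illustrative lambda = 0.01, a parameter row that goes untouched for
+250 timesteps is simply multiplied once by e^(-0.01 * 250) = e^(-2.5) ≈ 0.082 the next time
+it is needed, instead of being multiplied by e^(-0.01) on each of those 250 iterations.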
+
+In NLP applications, where the updates are very sparse, only a small fraction of the
+features are likely to be active in a given batch.
+
+This means that if, say, an infrequently used word like "fecund" or "bucolic" is seen
+in only one or two batches in the entire training corpus, we only touch that parameter
+twice (three times counting the finalization step), while still getting roughly the same
+results as though we had done the per-iteration weight updates.
+*/
+
+static inline void regularize_row(double *theta_i, size_t n, double lambda, uint32_t last_updated, uint32_t t) {
+    uint32_t timesteps = t - last_updated;
+    double update = exp(-lambda * timesteps);
+    double_array_mul(theta_i, update, n);
+}
+
+bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda) {
+    if (lambda > 0.0) {
+        uint32_t *updates = last_updated->a;
+
+        size_t n = theta->n;
+
+        size_t batch_rows = update_indices->n;
+        uint32_t *rows = update_indices->a;
+
+        for (size_t i = 0; i < batch_rows; i++) {
+            uint32_t row = rows[i];
+            double *theta_i = matrix_get_row(theta, row);
+            uint32_t row_last_updated = updates[row];
+            regularize_row(theta_i, n, lambda, row_last_updated, t);
+            updates[row] = t;
+        }
+
+    }
+
+    return true;
+}
+
+inline bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda) {
+    if (lambda > 0.0) {
+        uint32_t *updates = last_updated->a;
+        size_t m = theta->m;
+        size_t n = theta->n;
+
+        for (size_t i = 0; i < m; i++) {
+            double *theta_i = matrix_get_row(theta, i);
+            uint32_t row_last_updated = updates[i];
+            regularize_row(theta_i, n, lambda, row_last_updated, t);
+            updates[i] = t;
+        }
+    }
+    return true;
+}
+
+inline double gamma_t(double gamma_0, double lambda, uint32_t t) {
+    return gamma_0 / (1.0 + lambda * gamma_0 * (double)t);
+}
+
+inline bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, double lambda, uint32_t t, double gamma_0) {
+    double gamma = gamma_t(gamma_0, lambda, t);
     return stochastic_gradient_descent(theta, gradient, gamma);
 }
diff --git a/src/stochastic_gradient_descent.h b/src/stochastic_gradient_descent.h
index 8f699993..5cc44fdf 100644
--- a/src/stochastic_gradient_descent.h
+++ b/src/stochastic_gradient_descent.h
@@ -16,7 +16,10 @@ gamma_t = gamma_0(1 + gamma_0 * lambda * t)^-1
 
 #include "matrix.h"
 
-bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double alpha);
-bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, float lambda, uint32_t t, double gamma_0);
+bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gamma);
+bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma);
+bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda);
+bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda);
+bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, double lambda, uint32_t t, double gamma_0);
 
 #endif
\ No newline at end of file
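
Usage notes (not part of the patch)
-----------------------------------

Based on the comment block above, the intended call sequence appears to be: for each
minibatch, first call stochastic_gradient_descent_sparse_regularize_weights() on the
feature rows active in that batch, so their values are current before the gradient is
computed; then compute the gradient and apply it with stochastic_gradient_descent_sparse()
using the same update_indices; and after the final batch, call
stochastic_gradient_descent_sparse_finalize_weights() once, so every row is decayed up to
the last timestep before the model is saved.

The standalone program below is a minimal sanity check of the lazy-decay identity used by
regularize_row(). It is a sketch, not part of the patch: the names theta_eager and
theta_lazy, the lambda value, and the "active every 250 steps" schedule are all
illustrative. It applies theta *= e^(-lambda) on every timestep and compares the result
against the collapsed update theta *= e^(-lambda * timesteps) applied only when the weight
is next touched (plus one finalization step); the two should agree up to floating-point
rounding.

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        double lambda = 0.01;       /* illustrative regularization strength */
        double theta_eager = 0.5;   /* weight decayed on every timestep */
        double theta_lazy = 0.5;    /* weight decayed lazily, only when "active" */
        uint32_t last_updated = 0;  /* timestep of the last lazy update */
        uint32_t num_steps = 1000;

        for (uint32_t t = 1; t <= num_steps; t++) {
            /* eager: pay the regularization cost on every iteration */
            theta_eager *= exp(-lambda);

            /* lazy: catch up only on timesteps where the feature is active,
               here arbitrarily every 250 steps */
            if (t % 250 == 0) {
                uint32_t timesteps = t - last_updated;
                theta_lazy *= exp(-lambda * (double)timesteps);
                last_updated = t;
            }
        }

        /* final catch-up, analogous to the finalize_weights step */
        theta_lazy *= exp(-lambda * (double)(num_steps - last_updated));

        printf("eager: %.12f\nlazy:  %.12f\n", theta_eager, theta_lazy);
        return 0;
    }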