From 62017fd33d708a227373d4091929178b0fb069fe Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Sat, 9 Jan 2016 03:12:54 -0500
Subject: [PATCH] [optimization] Using sparse updates in stochastic gradient
 descent. Decomposing the updates into the gradient of the loss function (zero
 for features not observed in the current batch) and the gradient of the
 regularization term. The derivative of the regularization term in
 L2-regularized models is equivalent to an exponential decay function. Before
 computing the gradient for the current batch, we bring the weights up to date
 only for the features observed in that batch, and update only those values

---
 src/logistic_regression.c         |   2 +-
 src/stochastic_gradient_descent.c | 119 ++++++++++++++++++++++++++++--
 src/stochastic_gradient_descent.h |   7 +-
 3 files changed, 120 insertions(+), 8 deletions(-)

diff --git a/src/logistic_regression.c b/src/logistic_regression.c
index 9bebfef9..ef970a73 100644
--- a/src/logistic_regression.c
+++ b/src/logistic_regression.c
@@ -138,7 +138,7 @@ static bool logistic_regression_gradient_params(matrix_t *theta, matrix_t *gradi
     }
 
 
-    // If the vector last_updated was provided, update the only the relevant columns in x
+    // Update only the relevant columns in x
     if (regularize && x_cols != NULL) {
         size_t batch_rows = x_cols->n;
         uint32_t *cols = x_cols->a;
diff --git a/src/stochastic_gradient_descent.c b/src/stochastic_gradient_descent.c
index e56057b0..05785402 100644
--- a/src/stochastic_gradient_descent.c
+++ b/src/stochastic_gradient_descent.c
@@ -8,18 +8,127 @@ bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gam
     size_t m = gradient->m;
     size_t n = gradient->n;
 
-    for (size_t i = 0; i < m; i++) {
+    return matrix_sub_matrix_times_scalar(theta, gradient, gamma);
+}
+
+bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma) {
+    if (gradient->m != theta->m || gradient->n != theta->n) {
+        return false;
+    }
+
+    size_t m = gradient->m;
+    size_t n = gradient->n;
+
+    double *gradient_values = gradient->values;
+    double *theta_values = theta->values;
+
+    uint32_t *indices = update_indices->a;
+    size_t num_updated = update_indices->n;
+
+    for (size_t i = 0; i < num_updated; i++) {
+        uint32_t row = indices[i];
         for (size_t j = 0; j < n; j++) {
-            double grad_ij = matrix_get(gradient, i, j);
-            matrix_sub_scalar(theta, i, j, gamma * grad_ij);
+            size_t idx = row * n + j;
+            double value = gradient_values[idx];
+            theta_values[idx] -= gamma * value;
         }
     }
 
     return true;
 }
 
-inline bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, float lambda, uint32_t t, double gamma_0) {
-    double gamma = gamma_0 / (1.0 + lambda * gamma_0 * (double)t);
+
+/*
+Sparse regularization
+---------------------
+
+Stochastic/minibatch gradients can be decomposed into 2 updates
+1. The derivative of the loss function itself (0 for features not observed in the current batch)
+2. The derivative of the regularization term (applies to all weights)
+
+Reference: http://leon.bottou.org/publications/pdf/tricks-2012.pdf
+
+Here we take sparsity a step further and do "lazy" or "just-in-time" regularization.
+
+Updating all the weights on each iteration requires m * n operations for each minibatch
+regardless of the number of parameters active in the minibatch.
+
+However, the "correct" value of a given parameter theta_ij is only really needed in two places:
+
+1. Before computing the gradient, since the current value of theta is used in said computation
+2. When we're done training the model and want to save/persist it
+
+In L2 regularization, the derivative of the regularization term is simply:
+
+lambda * theta
+
+Since theta changes proportional to itself, we can rewrite this for multiple timesteps as:
+
+theta_i *= e^(-lambda * t)
+
+where t is the number of timesteps since theta_i was last updated. This requires storing
+a vector of last-updated timestamps (one entry per row of theta), as well as the set of rows
+used by the minibatch (this implementation assumes it is computed elsewhere and passed in).
+
+In NLP applications, where the updates are very sparse, only a small fraction of the
+features are likely to be active in a given batch.
+
+This means that if, say, an infrequently used word like "fecund" or "bucolic" is seen
+in only one or two batches in the entire training corpus, we only touch that parameter
+twice (three times counting the finalization step), while still getting roughly the same
+results as though we had done the per-iteration weight updates.
+*/
+
+static inline void regularize_row(double *theta_i, size_t n, double lambda, uint32_t last_updated, uint32_t t) {
+    uint32_t timesteps = t > last_updated ? t - last_updated : 0; // clamp: unsigned t - last_updated would wrap to ~2^32 steps
+    double update = exp(-lambda * (double)timesteps); // closed-form L2 decay over the skipped timesteps; 1.0 when none elapsed
+    double_array_mul(theta_i, update, n); // presumably scales the n values at theta_i in place -- confirm against double_array_mul
+}
+
+bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda) {
+    // Without an L2 penalty there is no decay to catch up on.
+    if (!(lambda > 0.0)) {
+        return true;
+    }
+
+    uint32_t *timestamps = last_updated->a;
+    uint32_t *batch = update_indices->a;
+    size_t batch_size = update_indices->n;
+    size_t num_cols = theta->n;
+
+    // Only the rows named in update_indices are decayed and re-stamped here.
+    for (size_t k = 0; k < batch_size; k++) {
+        uint32_t r = batch[k];
+        double *weights = matrix_get_row(theta, r);
+        regularize_row(weights, num_cols, lambda, timestamps[r], t);
+        timestamps[r] = t;
+    }
+
+    return true;
+}
+
+inline bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda) {
+    if (lambda > 0.0) { // no L2 penalty means no pending decay; theta is left untouched
+        uint32_t *updates = last_updated->a;
+        size_t m = theta->m;
+        size_t n = theta->n;
+        // Catch every row of theta up to timestep t using its stored timestamp.
+        for (size_t i = 0; i < m; i++) {
+            double *theta_i = matrix_get_row(theta, i);
+            uint32_t row_updated_at = updates[i];
+            regularize_row(theta_i, n, lambda, row_updated_at, t);
+            updates[i] = t; // row i is now current as of timestep t
+        }
+    }
+    return true; // every path reports success
+}
+
+static inline double gamma_t(double gamma_0, double lambda, uint32_t t) { // static: a bare C99 `inline` definition emits no external symbol to link against
+    return gamma_0 / (1.0 + lambda * gamma_0 * (double)t); // learning-rate schedule: gamma_0 / (1 + lambda * gamma_0 * t)
+}
+
+inline bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, double lambda, uint32_t t, double gamma_0) {
+    double gamma = gamma_t(gamma_0, lambda, t);
 
     return stochastic_gradient_descent(theta, gradient, gamma);
 }
diff --git a/src/stochastic_gradient_descent.h b/src/stochastic_gradient_descent.h
index 8f699993..5cc44fdf 100644
--- a/src/stochastic_gradient_descent.h
+++ b/src/stochastic_gradient_descent.h
@@ -16,7 +16,10 @@ gamma_t = gamma_0(1 + gamma_0 * lambda * t)^-1
 
 #include "matrix.h"
 
-bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double alpha);
-bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, float lambda, uint32_t t, double gamma_0);
+bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gamma);
+bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma);
+bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda);
+bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda);
+bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, double lambda, uint32_t t, double gamma_0);
 
 #endif
\ No newline at end of file