[optimization] Use sparse updates in stochastic gradient descent. The update is decomposed into the gradient of the loss function (zero for features not observed in the current batch) and the gradient of the regularization term. In L2-regularized models, applying the regularization term's derivative over many timesteps amounts to an exponential decay of the weights. Before computing the gradient for the current batch, we bring the weights up to date only for the features observed in that batch, and update only those values.
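
In symbols, for a single weight theta_ij under L2 regularization (a sketch using the same plain notation as the code comments below; as in that code, the learning rate is folded into lambda):

    full update:            theta_ij -= dL/dtheta_ij + lambda * theta_ij
    feature not in batch:   dL/dtheta_ij = 0, so theta_ij *= (1 - lambda)
    t batches skipped:      theta_ij *= (1 - lambda)^t ~= e^(-lambda * t)   (for small lambda)

The last line is the closed-form decay that the sparse regularization functions below apply lazily, only when a feature is next observed or when the model is finalized.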
@@ -138,7 +138,7 @@ static bool logistic_regression_gradient_params(matrix_t *theta, matrix_t *gradi
 }
 
-    // If the vector last_updated was provided, update only the relevant columns in x
+    // Update only the relevant columns in x
     if (regularize && x_cols != NULL) {
         size_t batch_rows = x_cols->n;
         uint32_t *cols = x_cols->a;

@@ -8,18 +8,127 @@ bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gam
-    size_t m = gradient->m;
-    size_t n = gradient->n;
-
-    for (size_t i = 0; i < m; i++) {
+    return matrix_sub_matrix_times_scalar(theta, gradient, gamma);
 }
 
+bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma) {
+    if (gradient->m != theta->m || gradient->n != theta->n) {
+        return false;
+    }
+
+    size_t m = gradient->m;
+    size_t n = gradient->n;
+
+    double *gradient_values = gradient->values;
+    double *theta_values = theta->values;
+
+    uint32_t *indices = update_indices->a;
+    size_t num_updated = update_indices->n;
+
+    for (size_t i = 0; i < num_updated; i++) {
+        uint32_t row = indices[i];
+        for (size_t j = 0; j < n; j++) {
+            // Index directly into the dense value arrays for (row, j)
+            size_t idx = row * n + j;
+            double value = gradient_values[idx];
+            theta_values[idx] -= gamma * value;
+        }
+    }
+
+    return true;
+}
+
-inline bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, float lambda, uint32_t t, double gamma_0) {
-    double gamma = gamma_0 / (1.0 + lambda * gamma_0 * (double)t);
 
+/*
+Sparse regularization
+---------------------
+
+Stochastic/minibatch gradients can be decomposed into two updates:
+1. The derivative of the loss function itself (0 for features not observed in the current batch)
+2. The derivative of the regularization term (applies to all weights)
+
+Reference: http://leon.bottou.org/publications/pdf/tricks-2012.pdf
+
+Here we take sparsity a step further and do "lazy" or "just-in-time" regularization.
+
+Updating all the weights on each iteration requires m * n operations for each minibatch,
+regardless of the number of parameters active in the minibatch.
+
+However, the "correct" value of a given parameter theta_ij is only really needed in two places:
+
+1. Before computing the gradient, since the current value of theta is used in that computation
+2. When we're done training the model and want to save/persist it
+
+In L2 regularization, the derivative of the regularization term is simply:
+
+lambda * theta
+
+Since theta changes in proportion to itself, we can rewrite this for multiple timesteps as:
+
+theta_i *= e^(-lambda * t)
+
+where t is the number of timesteps since theta_i was last updated. This requires storing
+a vector of last-updated timestamps, one per feature, as well as the set of columns
+used by the minibatch (this implementation assumes it is computed elsewhere and passed in).
+
+In NLP applications, where the updates are very sparse, only a small fraction of the
+features are likely to be active in a given batch.
+
+This means that if, say, an infrequently used word like "fecund" or "bucolic" is seen
+in only one or two batches in the entire training corpus, we only touch that feature's
+parameters twice (three times counting the finalization step), while still getting roughly
+the same results as though we had done the per-iteration weight updates.
+*/
+
+static inline void regularize_row(double *theta_i, size_t n, double lambda, uint32_t last_updated, uint32_t t) {
+    uint32_t timesteps = t - last_updated;
+    double update = exp(-lambda * timesteps);
+    double_array_mul(theta_i, update, n);
+}
+
+bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda) {
+    if (lambda > 0.0) {
+        uint32_t *updates = last_updated->a;
+
+        size_t n = theta->n;
+
+        size_t batch_rows = update_indices->n;
+        uint32_t *rows = update_indices->a;
+
+        for (size_t i = 0; i < batch_rows; i++) {
+            uint32_t row = rows[i];
+            double *theta_i = matrix_get_row(theta, row);
+            uint32_t last_updated = updates[row];
+            regularize_row(theta_i, n, lambda, last_updated, t);
+            updates[row] = t;
+        }
+    }
+
+    return true;
+}
+
+inline bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda) {
+    if (lambda > 0.0) {
+        uint32_t *updates = last_updated->a;
+        size_t m = theta->m;
+        size_t n = theta->n;
+
+        for (size_t i = 0; i < m; i++) {
+            double *theta_i = matrix_get_row(theta, i);
+            uint32_t last_updated = updates[i];
+            regularize_row(theta_i, n, lambda, last_updated, t);
+            updates[i] = t;
+        }
+    }
+    return true;
+}
+
+inline double gamma_t(double gamma_0, double lambda, uint32_t t) {
+    return gamma_0 / (1.0 + lambda * gamma_0 * (double)t);
+}
+
+inline bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, double lambda, uint32_t t, double gamma_0) {
+    double gamma = gamma_t(gamma_0, lambda, t);
+
+    return stochastic_gradient_descent(theta, gradient, gamma);
+}
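
For context, a minimal sketch of how these functions are intended to fit together in a training loop, following the lazy-regularization scheme described in the comment above. The helpers batch_columns() and compute_batch_gradient() are hypothetical placeholders, not part of this commit; only the stochastic_gradient_descent_* functions and gamma_t() come from the code above, and the name of the header that declares them is not shown in this diff.

/* Hypothetical training-loop sketch (illustration only, not part of this commit). */
#include "matrix.h"
/* ... plus the stochastic gradient descent header shown in the next hunk */

/* Placeholder helpers, assumed to exist elsewhere: */
uint32_array *batch_columns(uint32_t t);                       /* feature rows of theta active in batch t */
matrix_t *compute_batch_gradient(matrix_t *theta, uint32_t t); /* minibatch gradient of the loss */

void train_sketch(matrix_t *theta, uint32_array *last_updated,
                  uint32_t num_batches, double lambda, double gamma_0) {
    for (uint32_t t = 0; t < num_batches; t++) {
        uint32_array *active = batch_columns(t);

        /* 1. Bring only the active rows of theta up to date, applying the
              exponential decay accumulated since each row's last update. */
        stochastic_gradient_descent_sparse_regularize_weights(theta, active, last_updated, t, lambda);

        /* 2. Compute the minibatch gradient using the now-correct weights. */
        matrix_t *gradient = compute_batch_gradient(theta, t);

        /* 3. Sparse SGD step: only the rows listed in `active` are touched. */
        double gamma = gamma_t(gamma_0, lambda, t);
        stochastic_gradient_descent_sparse(theta, gradient, active, gamma);
    }

    /* 4. Before saving the model, apply any outstanding decay to every row. */
    stochastic_gradient_descent_sparse_finalize_weights(theta, last_updated, num_batches, lambda);
}
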
@@ -16,7 +16,10 @@ gamma_t = gamma_0(1 + gamma_0 * lambda * t)^-1
 #include "matrix.h"
 
-bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double alpha);
-bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, float lambda, uint32_t t, double gamma_0);
+bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gamma);
+bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma);
+bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda);
+bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda);
+bool stochastic_gradient_descent_scheduled(matrix_t *theta, matrix_t *gradient, double lambda, uint32_t t, double gamma_0);
 
 #endif