[optimization] new sgd_trainer struct to manage weights in stochastic gradient descent; allows L1 or L2 regularization, applying cumulative penalties instead of exponential decay. SGD with L1 regularization encourages sparsity and can produce a sparse weight matrix after training rather than a dense one
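For context on the "cumulative penalties" the message mentions: rather than decaying every weight on every step, the trainer can track the total L1 penalty each weight should have received so far, plus the amount actually applied to it, and clip the weight toward zero by the outstanding difference the next time its feature is active. A minimal sketch of that clipping step, in the style of the lazy sparse SGD paper cited in the diff below; the function name and signature here are illustrative, not the API this commit adds:

```c
#include <math.h>

/* Clip one weight toward zero by the L1 penalty it has accrued but not
   yet received. total_penalty is the running sum of lambda * gamma_t
   over all updates so far; *applied is the (signed) penalty already
   applied to this particular weight. Illustrative helper only. */
static inline void lazy_l1_clip(double *weight, double total_penalty,
                                double *applied) {
    double w = *weight;
    if (w > 0.0) {
        *weight = fmax(0.0, w - (total_penalty + *applied));
    } else if (w < 0.0) {
        *weight = fmin(0.0, w + (total_penalty - *applied));
    }
    /* record how much penalty was actually applied this time */
    *applied += *weight - w;
}
```

Because the clip never crosses zero, weights for rarely-seen features are driven exactly to zero, which is what makes the final weight matrix sparse.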

Al
2017-04-02 13:44:40 -04:00
parent 19fe084974
commit a2563a4dcd
2 changed files with 340 additions and 58 deletions


@@ -1,26 +1,47 @@
 /*
 Stochastic gradient descent implementation
-Based on Leon Bottou's Stochastic Gradient Descent Tricks:
+Based on Bob Carpenter's Lazy Sparse Stochastic Gradient Descent for Regularized Multinomial Logistic Regression:
+https://lingpipe.files.wordpress.com/2008/04/lazysgdregression.pdf
+Learning rate update based on Leon Bottou's Stochastic Gradient Descent Tricks:
 http://leon.bottou.org/publications/pdf/tricks-2012.pdf
 Learning rate calculated as:
 gamma_t = gamma_0(1 + gamma_0 * lambda * t)^-1
 */
 #ifndef STOCHASTIC_GRADIENT_DESCENT_H
 #define STOCHASTIC_GRADIENT_DESCENT_H
 #include <stdlib.h>
 #include <stdbool.h>
 #include <math.h>
 #include "matrix.h"
+#include "regularization.h"
+#include "sparse_matrix.h"
-bool stochastic_gradient_descent(double_matrix_t *theta, double_matrix_t *gradient, double gamma);
-bool stochastic_gradient_descent_sparse(double_matrix_t *theta, double_matrix_t *gradient, uint32_array *update_indices, double gamma);
-bool stochastic_gradient_descent_regularize_weights(double_matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0);
-bool stochastic_gradient_descent_finalize_weights(double_matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0);
-double stochastic_gradient_descent_gamma_t(double gamma_0, double lambda, uint32_t t);
+typedef struct sgd_trainer {
+    double_matrix_t *theta;
+    regularization_type_t reg_type;
+    double lambda;
+    double gamma_0;
+    bool fit_intercept;
+    uint32_t iterations;
+    uint32_array *last_updated;
+    double_array *penalties;
+} sgd_trainer_t;
+sgd_trainer_t *sgd_trainer_new(size_t m, size_t n, bool fit_intercept, regularization_type_t reg_type, double lambda, double gamma_0);
+bool sgd_trainer_reset_params(sgd_trainer_t *self, double lambda, double gamma_0);
+bool stochastic_gradient_descent_update(sgd_trainer_t *self, double_matrix_t *gradient, size_t batch_size);
+bool stochastic_gradient_descent_update_sparse(sgd_trainer_t *self, double_matrix_t *gradient, uint32_array *update_indices, size_t batch_size);
+double stochastic_gradient_descent_reg_cost(sgd_trainer_t *self, uint32_array *indices, size_t batch_size);
+bool stochastic_gradient_descent_regularize_weights(sgd_trainer_t *self);
+double_matrix_t *stochastic_gradient_descent_get_weights(sgd_trainer_t *self);
+sparse_matrix_t *stochastic_gradient_descent_get_weights_sparse(sgd_trainer_t *self);
+void sgd_trainer_destroy(sgd_trainer_t *self);
 #endif
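To make the new API concrete, here is a hedged sketch of the train/finalize lifecycle the declarations suggest, using the schedule from the file comment (gamma_t = gamma_0 / (1 + gamma_0 * lambda * t), applied internally per update). Only the sgd_trainer functions come from the diff above; `compute_batch_gradient`, the header filename, the `REGULARIZATION_L1` enum value, the hyperparameter values, and the `uint32_array_new`/`uint32_array_destroy`/`double_matrix_destroy` helpers are all assumptions for illustration:

```c
#include "stochastic_gradient_descent.h"  /* assumed filename for this header */

/* Model-specific gradient for one mini-batch (illustrative stub, not part
   of this commit). Fills update_indices with the feature rows active in
   the batch, so the sparse update only touches those rows of theta. */
extern double_matrix_t *compute_batch_gradient(double_matrix_t *theta, size_t batch,
                                               uint32_array *update_indices);

sparse_matrix_t *train(size_t m, size_t n, size_t num_batches, size_t batch_size,
                       uint32_t num_epochs) {
    /* L1 regularization so the learned weights can come back sparse;
       REGULARIZATION_L1 is assumed to be declared in regularization.h */
    sgd_trainer_t *trainer = sgd_trainer_new(m, n, true, REGULARIZATION_L1,
                                             1e-4, 0.1);
    if (trainer == NULL) return NULL;

    for (uint32_t epoch = 0; epoch < num_epochs; epoch++) {
        for (size_t b = 0; b < num_batches; b++) {
            uint32_array *update_indices = uint32_array_new();
            double_matrix_t *gradient = compute_batch_gradient(trainer->theta, b,
                                                               update_indices);
            /* apply any outstanding lazy penalties to the active rows,
               then take a gradient step on just those rows */
            stochastic_gradient_descent_update_sparse(trainer, gradient,
                                                      update_indices, batch_size);
            double_matrix_destroy(gradient);
            uint32_array_destroy(update_indices);
        }
    }

    /* settle all remaining lazy penalties before reading the weights */
    stochastic_gradient_descent_regularize_weights(trainer);
    sparse_matrix_t *weights = stochastic_gradient_descent_get_weights_sparse(trainer);
    sgd_trainer_destroy(trainer);
    return weights;
}
```

The final regularize/get_weights_sparse pair matches the commit message: after the deferred penalties are settled, rows driven exactly to zero can be dropped, so training ends with a sparse matrix rather than a dense one.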