From 64c049730a083272257e7c648cd59d912a749e7d Mon Sep 17 00:00:00 2001
From: Al
Date: Sun, 2 Apr 2017 14:30:14 -0400
Subject: [PATCH] [classification] flexible logistic regression trainer that
 can handle either SGD (with L1 or L2 regularization) or FTRL as optimizers

The trainer now delegates optimization to a tagged union holding either a
minibatch SGD trainer or an FTRL-Proximal trainer. The full weight matrix
lives inside the optimizer objects; the trainer itself keeps only a gradient
and a batch-weights matrix that are resized to the unique feature columns of
each minibatch.

---
 src/logistic_regression_trainer.c | 268 +++++++++++++++++++++++-------
 src/logistic_regression_trainer.h |  48 ++++--
 2 files changed, 241 insertions(+), 75 deletions(-)

diff --git a/src/logistic_regression_trainer.c b/src/logistic_regression_trainer.c
index f2be12f0..929441cd 100644
--- a/src/logistic_regression_trainer.c
+++ b/src/logistic_regression_trainer.c
@@ -1,6 +1,8 @@
 #include "logistic_regression_trainer.h"
 #include "sparse_matrix_utils.h"
 
+#define INITIAL_FEATURE_BATCH_SIZE 1024
+
 void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) {
     if (self == NULL) return;
 
@@ -12,22 +14,18 @@ void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) {
         kh_destroy(str_uint32, self->label_ids);
     }
 
-    if (self->weights != NULL) {
-        double_matrix_destroy(self->weights);
-    }
-
-    if (self->last_updated != NULL) {
-        uint32_array_destroy(self->last_updated);
-    }
-
     if (self->unique_columns != NULL) {
-        kh_destroy(int_set, self->unique_columns);
+        kh_destroy(int_uint32, self->unique_columns);
     }
 
     if (self->batch_columns != NULL) {
         uint32_array_destroy(self->batch_columns);
     }
 
+    if (self->batch_weights != NULL) {
+        double_matrix_destroy(self->batch_weights);
+    }
+
     if (self->gradient != NULL) {
         double_matrix_destroy(self->gradient);
     }
@@ -35,7 +33,7 @@ void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) {
     free(self);
 }
 
-logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_ids, khash_t(str_uint32) *label_ids, double gamma_0, double lambda) {
+static logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_ids, khash_t(str_uint32) *label_ids) {
     if (feature_ids == NULL || label_ids == NULL) return NULL;
 
     logistic_regression_trainer_t *trainer = malloc(sizeof(logistic_regression_trainer_t));
@@ -48,19 +46,26 @@ logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_
     trainer->label_ids = label_ids;
     trainer->num_labels = kh_size(label_ids);
 
-    trainer->weights = double_matrix_new_zeros(trainer->num_features, trainer->num_labels);
+    trainer->gradient = double_matrix_new_zeros(INITIAL_FEATURE_BATCH_SIZE, trainer->num_labels);
+    if (trainer->gradient == NULL) {
+        goto exit_trainer_created;
+    }
 
-    trainer->gradient = double_matrix_new_zeros(trainer->num_features, trainer->num_labels);
+    trainer->unique_columns = kh_init(int_uint32);
+    if (trainer->unique_columns == NULL) {
+        goto exit_trainer_created;
+    }
 
-    trainer->unique_columns = kh_init(int_set);
-    trainer->batch_columns = uint32_array_new_size(trainer->num_features);
+    trainer->batch_columns = uint32_array_new_size(INITIAL_FEATURE_BATCH_SIZE);
+    if (trainer->batch_columns == NULL) {
+        goto exit_trainer_created;
+    }
 
-    trainer->last_updated = uint32_array_new_zeros(trainer->num_features);
+    trainer->batch_weights = double_matrix_new_zeros(INITIAL_FEATURE_BATCH_SIZE, trainer->num_labels);
+    if (trainer->batch_weights == NULL) {
+        goto exit_trainer_created;
+    }
 
-    trainer->lambda = lambda;
-    trainer->iters = 0;
     trainer->epochs = 0;
-    trainer->gamma_0 = gamma_0;
 
     return trainer;
 
@@ -69,70 +74,144 @@ exit_trainer_created:
     return NULL;
 }
 
-
-static double_matrix_t *model_expectation(sparse_matrix_t *x, double_matrix_t *theta) {
-    double_matrix_t *p_y = double_matrix_new_zeros(x->m, theta->n);
-    if (p_y == NULL) return NULL;
-
-    if(logistic_regression_model_expectation(theta, x, p_y)) {
-        return p_y;
-    } else {
-        double_matrix_destroy(p_y);
+logistic_regression_trainer_t *logistic_regression_trainer_init_sgd(trie_t *feature_ids, khash_t(str_uint32) *label_ids, bool fit_intercept, regularization_type_t reg_type, double lambda, double gamma_0) {
+    logistic_regression_trainer_t *trainer = logistic_regression_trainer_init(feature_ids, label_ids);
+    if (trainer == NULL) {
         return NULL;
     }
+
+    trainer->optimizer_type = LOGISTIC_REGRESSION_OPTIMIZER_SGD;
+    trainer->optimizer.sgd = sgd_trainer_new(trainer->num_features, trainer->num_labels, fit_intercept, reg_type, lambda, gamma_0);
+    if (trainer->optimizer.sgd == NULL) {
+        logistic_regression_trainer_destroy(trainer);
+        return NULL;
+    }
+
+    return trainer;
 }
 
+logistic_regression_trainer_t *logistic_regression_trainer_init_ftrl(trie_t *feature_ids, khash_t(str_uint32) *label_ids, double lambda1, double lambda2, double alpha, double beta) {
+    logistic_regression_trainer_t *trainer = logistic_regression_trainer_init(feature_ids, label_ids);
+    if (trainer == NULL) {
+        return NULL;
+    }
+
+    trainer->optimizer_type = LOGISTIC_REGRESSION_OPTIMIZER_FTRL;
+    bool fit_intercept = true;
+    log_info("num_features = %zu\n", trainer->num_features);
+    trainer->optimizer.ftrl = ftrl_trainer_new(trainer->num_features, trainer->num_labels, fit_intercept, alpha, beta, lambda1, lambda2);
+    if (trainer->optimizer.ftrl == NULL) {
+        logistic_regression_trainer_destroy(trainer);
+        return NULL;
+    }
+
+    return trainer;
+}
+
+bool logistic_regression_trainer_reset_params_sgd(logistic_regression_trainer_t *self, double lambda, double gamma_0) {
+    if (self == NULL || self->optimizer_type != LOGISTIC_REGRESSION_OPTIMIZER_SGD || self->optimizer.sgd == NULL) return false;
+
+    sgd_trainer_t *sgd_trainer = self->optimizer.sgd;
+    return sgd_trainer_reset_params(sgd_trainer, lambda, gamma_0);
+}
+
+bool logistic_regression_trainer_reset_params_ftrl(logistic_regression_trainer_t *self, double alpha, double beta, double lambda1, double lambda2) {
+    if (self == NULL || self->optimizer_type != LOGISTIC_REGRESSION_OPTIMIZER_FTRL || self->optimizer.ftrl == NULL) return false;
+
+    ftrl_trainer_t *ftrl_trainer = self->optimizer.ftrl;
+    return ftrl_trainer_reset_params(ftrl_trainer, alpha, beta, lambda1, lambda2);
+}
+
-double logistic_regression_trainer_batch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
-    size_t m = self->weights->m;
-    size_t n = self->weights->n;
+double logistic_regression_trainer_minibatch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
+    size_t n = self->num_labels;
 
     sparse_matrix_t *x = feature_matrix(self->feature_ids, features);
     uint32_array *y = label_vector(self->label_ids, labels);
+    if (x == NULL || y == NULL) {
+        if (y != NULL) uint32_array_destroy(y);
+        if (x != NULL) sparse_matrix_destroy(x);
+        return -1.0;
+    }
 
-    double_matrix_t *p_y = double_matrix_new_zeros(x->m, n);
+    double_matrix_t *p_y = double_matrix_new_aligned(x->m, n, 16);
+    if (p_y == NULL) {
+        uint32_array_destroy(y);
+        sparse_matrix_destroy(x);
+        return -1.0;
+    }
+    double_matrix_zero(p_y);
 
-    double cost = logistic_regression_cost_function(self->weights, x, y, p_y, self->lambda);
+    double cost;
+    if (!sparse_matrix_add_unique_columns_alias(x, self->unique_columns, self->batch_columns)) {
+        cost = -1.0;
+        goto exit_cost_matrices_created;
+    }
 
+    double_matrix_t *weights = logistic_regression_trainer_get_weights(self);
+    if (weights == NULL) {
+        cost = -1.0;
+        goto exit_cost_matrices_created;
+    }
+
+    cost = logistic_regression_cost_function(weights, x, y, p_y);
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        sgd_trainer_t *sgd_trainer = self->optimizer.sgd;
+        double reg_cost = stochastic_gradient_descent_reg_cost(sgd_trainer, self->batch_columns, x->m);
+        cost += reg_cost;
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        ftrl_trainer_t *ftrl_trainer = self->optimizer.ftrl;
+        double reg_cost = ftrl_reg_cost(ftrl_trainer, weights, self->batch_columns, x->m);
+        cost += reg_cost;
+    }
+
+exit_cost_matrices_created:
     double_matrix_destroy(p_y);
     uint32_array_destroy(y);
     sparse_matrix_destroy(x);
     return cost;
 }
 
-bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
-    size_t m = self->weights->m;
-    size_t n = self->weights->n;
-
-    // Optimize
+bool logistic_regression_trainer_train_minibatch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
     double_matrix_t *gradient = self->gradient;
 
     sparse_matrix_t *x = feature_matrix(self->feature_ids, features);
+    if (x == NULL) {
+        log_error("x == NULL\n");
+        return false;
+    }
 
     uint32_array *y = label_vector(self->label_ids, labels);
-
-    double_matrix_t *p_y = double_matrix_new_zeros(x->m, n);
+    if (y == NULL) {
+        log_error("y == NULL\n");
+        sparse_matrix_destroy(x);
+        return false;
+    }
 
     bool ret = false;
 
-    if (!sparse_matrix_add_unique_columns(x, self->unique_columns, self->batch_columns)) {
+    if (!sparse_matrix_add_unique_columns_alias(x, self->unique_columns, self->batch_columns)) {
         log_error("Unique columns failed\n");
-        goto exit_matrices_created;
+        goto exit_xy_created;
     }
 
-    if (self->lambda > 0.0 && !stochastic_gradient_descent_regularize_weights(self->weights, self->batch_columns, self->last_updated, self->iters, self->lambda, self->gamma_0)) {
-        log_error("Error regularizing weights\n");
-        goto exit_matrices_created;
+    if (!double_matrix_resize(gradient, self->batch_columns->n, self->num_labels)) {
+        log_error("Gradient resize failed\n");
+        goto exit_xy_created;
     }
 
-    if (!logistic_regression_gradient_sparse(self->weights, gradient, x, y, p_y, self->batch_columns, self->lambda)) {
+    double_matrix_t *weights = logistic_regression_trainer_get_weights(self);
+    if (weights == NULL) {
+        log_error("Error getting weights\n");
+        goto exit_xy_created;
+    }
+
+    size_t batch_size = x->m;
+
+    double_matrix_t *p_y = double_matrix_new_aligned(batch_size, self->num_labels, 16);
+    if (p_y == NULL) {
+        log_error("Error allocating p_y\n");
+        goto exit_xy_created;
+    }
+
+    if (!logistic_regression_gradient(weights, gradient, x, y, p_y)) {
         log_error("Gradient failed\n");
         goto exit_matrices_created;
     }
 
-    size_t data_len = m * n;
-
-    double gamma = stochastic_gradient_descent_gamma_t(self->gamma_0, self->lambda, self->iters);
-    ret = stochastic_gradient_descent_sparse(self->weights, gradient, self->batch_columns, gamma);
-
-    self->iters++;
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        ret = stochastic_gradient_descent_update_sparse(self->optimizer.sgd, gradient, self->batch_columns, batch_size);
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        ret = ftrl_update_gradient(self->optimizer.ftrl, gradient, weights, self->batch_columns, batch_size);
+        if (!ret) {
+            log_error("ftrl_update_gradient failed\n");
+        }
+    } else {
+        ret = false;
+    }
 
 exit_matrices_created:
     double_matrix_destroy(p_y);
@@ -141,12 +220,87 @@ exit_matrices_created:
+exit_xy_created:
     uint32_array_destroy(y);
     sparse_matrix_destroy(x);
     return ret;
 }
 
-bool logistic_regression_trainer_finalize(logistic_regression_trainer_t *self) {
-    if (self == NULL) return false;
+double_matrix_t *logistic_regression_trainer_get_weights(logistic_regression_trainer_t *self) {
+    if (self == NULL) return NULL;
 
-    if (self->lambda > 0.0) {
-        return stochastic_gradient_descent_finalize_weights(self->weights, self->last_updated, self->iters, self->lambda, self->gamma_0);
+    size_t m = self->batch_columns->n;
+    size_t n = self->num_labels;
+    double_matrix_t *batch_weights = self->batch_weights;
+    if (batch_weights == NULL || !double_matrix_resize(batch_weights, m, n)) {
+        return NULL;
+    }
+    double_matrix_zero(batch_weights);
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        if (self->optimizer.sgd == NULL) return NULL;
+        double_matrix_t *full_weights = self->optimizer.sgd->theta;
+        uint32_t *columns = self->batch_columns->a;
+
+        for (size_t i = 0; i < m; i++) {
+            uint32_t col = columns[i];
+            double *theta_row = double_matrix_get_row(full_weights, col);
+            double *row = double_matrix_get_row(batch_weights, i);
+            for (size_t j = 0; j < n; j++) {
+                row[j] = theta_row[j];
+            }
+        }
+
+        return batch_weights;
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        if (self->optimizer.ftrl == NULL) return NULL;
+
+        if (!ftrl_set_weights(self->optimizer.ftrl, batch_weights, self->batch_columns)) {
+            return NULL;
+        }
+
+        return batch_weights;
+    }
+    return NULL;
+}
+
+double_matrix_t *logistic_regression_trainer_get_regularized_weights(logistic_regression_trainer_t *self) {
+    if (self == NULL) return NULL;
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        if (self->optimizer.sgd == NULL) return NULL;
+        return stochastic_gradient_descent_get_weights(self->optimizer.sgd);
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        if (self->optimizer.ftrl == NULL) return NULL;
+        if (!ftrl_set_weights(self->optimizer.ftrl, self->batch_weights, NULL)) {
+            return NULL;
+        }
+        return self->batch_weights;
+    }
+    return NULL;
+}
+
+double_matrix_t *logistic_regression_trainer_final_weights(logistic_regression_trainer_t *self) {
+    if (self == NULL) return NULL;
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        if (self->optimizer.sgd == NULL) return NULL;
+        double_matrix_t *weights = stochastic_gradient_descent_get_weights(self->optimizer.sgd);
+        self->optimizer.sgd->theta = NULL;
+        return weights;
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        if (self->optimizer.ftrl == NULL) return NULL;
+        return ftrl_weights_finalize(self->optimizer.ftrl);
+    }
+    return NULL;
+}
+
+sparse_matrix_t *logistic_regression_trainer_final_weights_sparse(logistic_regression_trainer_t *self) {
+    if (self == NULL) return NULL;
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        if (self->optimizer.sgd == NULL) return NULL;
+        return stochastic_gradient_descent_get_weights_sparse(self->optimizer.sgd);
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        if (self->optimizer.ftrl == NULL) return NULL;
+        return ftrl_weights_finalize_sparse(self->optimizer.ftrl);
     }
-    return true;
+    return NULL;
 }
diff --git a/src/logistic_regression_trainer.h b/src/logistic_regression_trainer.h
index fc7f0a8e..2a03ac3c 100644
--- a/src/logistic_regression_trainer.h
+++ b/src/logistic_regression_trainer.h
@@ -9,6 +9,7 @@
 #include "averaged_perceptron_tagger.h"
 #include "collections.h"
 #include "features.h"
+#include "ftrl.h"
 #include "logistic_regression.h"
 #include "minibatch.h"
 #include "sparse_matrix.h"
@@ -21,28 +22,39 @@
  * Helper struct for training logistic regression model
 */
 
+typedef enum {
+    LOGISTIC_REGRESSION_OPTIMIZER_SGD,
+    LOGISTIC_REGRESSION_OPTIMIZER_FTRL
+} logistic_regression_optimizer_type;
+
 typedef struct logistic_regression_trainer {
-    trie_t *feature_ids;                // Trie mapping features to array indices
-    size_t num_features;                // Number of features
-    khash_t(str_uint32) *label_ids;     // Hashtable mapping labels to array indices
-    size_t num_labels;                  // Number of labels
-    double_matrix_t *weights;           // Matrix of logistic regression weights
-    double_matrix_t *gradient;          // Gradient matrix to be reused
-    khash_t(int_set) *unique_columns;   // Unique columns set
-    uint32_array *batch_columns;        // Unique columns as array
-    uint32_array *last_updated;         // Array of length N indicating the last time each feature was updated
-    double lambda;                      // Regularization weight
-    uint32_t iters;                     // Number of iterations, used to decay learning rate
-    uint32_t epochs;                    // Number of epochs
-    double gamma_0;                     // Initial learning rate
+    trie_t *feature_ids;                    // Trie mapping features to array indices
+    size_t num_features;                    // Number of features
+    khash_t(str_uint32) *label_ids;         // Hashtable mapping labels to array indices
+    size_t num_labels;                      // Number of labels
+    double_matrix_t *gradient;              // Gradient matrix to be reused
+    khash_t(int_uint32) *unique_columns;    // Unique columns set
+    uint32_array *batch_columns;            // Unique columns as array
+    double_matrix_t *batch_weights;         // Weights updated in this batch
+    uint32_t epochs;                        // Number of epochs
+    logistic_regression_optimizer_type optimizer_type;  // Which optimizer the union holds
+    union {
+        sgd_trainer_t *sgd;                 // Stochastic (ok, minibatch) gradient descent
+        ftrl_trainer_t *ftrl;               // Follow-the-regularized-leader (FTRL) Proximal
+    } optimizer;
 } logistic_regression_trainer_t;
 
+logistic_regression_trainer_t *logistic_regression_trainer_init_sgd(trie_t *feature_ids, khash_t(str_uint32) *label_ids, bool fit_intercept, regularization_type_t reg_type, double lambda, double gamma_0);
+logistic_regression_trainer_t *logistic_regression_trainer_init_ftrl(trie_t *feature_ids, khash_t(str_uint32) *label_ids, double lambda1, double lambda2, double alpha, double beta);
+bool logistic_regression_trainer_reset_params_sgd(logistic_regression_trainer_t *self, double lambda, double gamma_0);
+bool logistic_regression_trainer_reset_params_ftrl(logistic_regression_trainer_t *self, double alpha, double beta, double lambda1, double lambda2);
+bool logistic_regression_trainer_train_minibatch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
+double logistic_regression_trainer_minibatch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
 
-logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_ids, khash_t(str_uint32) *label_ids, double gamma_0, double lambda);
-
-bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
-double logistic_regression_trainer_batch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
-bool logistic_regression_trainer_finalize(logistic_regression_trainer_t *self);
+double_matrix_t *logistic_regression_trainer_get_weights(logistic_regression_trainer_t *self);
+double_matrix_t *logistic_regression_trainer_get_regularized_weights(logistic_regression_trainer_t *self);
+double_matrix_t *logistic_regression_trainer_final_weights(logistic_regression_trainer_t *self);
+sparse_matrix_t *logistic_regression_trainer_final_weights_sparse(logistic_regression_trainer_t *self);
 
 void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self);
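
Usage sketch (review note, not part of the patch): how the two constructors
added here are meant to be driven. Everything below uses only functions from
this diff, except REGULARIZATION_L2, which is assumed to be one of the
regularization_type_t constants, and the feature/label batch arguments, which
stand in for whatever minibatch preparation the caller already does. Treat it
as a sketch under those assumptions, not a definitive example.

#include "logistic_regression_trainer.h"

// Hypothetical driver: feature_ids/label_ids and the per-minibatch
// features/labels are built elsewhere by the caller.
static bool train_example(trie_t *feature_ids, khash_t(str_uint32) *label_ids,
                          feature_count_array *features, cstring_array *labels) {
    // SGD variant: fit an intercept, L2 penalty (assumed constant name),
    // lambda = 1e-4, initial learning rate gamma_0 = 0.1 (arbitrary values).
    logistic_regression_trainer_t *trainer = logistic_regression_trainer_init_sgd(
        feature_ids, label_ids, true, REGULARIZATION_L2, 1e-4, 0.1);
    // The FTRL-Proximal variant would instead be:
    //   logistic_regression_trainer_init_ftrl(feature_ids, label_ids,
    //       /*lambda1=*/1.0, /*lambda2=*/1.0, /*alpha=*/0.1, /*beta=*/1.0);
    if (trainer == NULL) return false;

    // One update per minibatch; the trainer resizes its gradient and
    // batch-weights matrices to the batch's unique feature columns.
    if (!logistic_regression_trainer_train_minibatch(trainer, features, labels)) {
        logistic_regression_trainer_destroy(trainer);
        return false;
    }

    // Optional: regularized cost on the same minibatch (-1.0 signals an error).
    double cost = logistic_regression_trainer_minibatch_cost(trainer, features, labels);
    log_info("minibatch cost = %f\n", cost);

    // Take ownership of the final weights in sparse form, then tear down.
    sparse_matrix_t *w = logistic_regression_trainer_final_weights_sparse(trainer);
    logistic_regression_trainer_destroy(trainer);
    return w != NULL;
}

The sparse finalizer is the natural choice after L1-regularized SGD or FTRL
training, since both drive many feature weights to exactly zero.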