From cf88bc7f653818e2788ababaf1caee253d4402ac Mon Sep 17 00:00:00 2001
From: Al
Date: Sun, 2 Apr 2017 14:28:25 -0400
Subject: [PATCH] [optimization] implemented Google's FTRL-Proximal, adapted
 for the multiclass/multinomial case. It is L1- and L2-regularized: it should
 encourage sparsity through the L1 penalty while remaining robust to
 collinear features through the L2 penalty.
 Ref: https://research.google.com/pubs/archive/41159.pdf
---
 src/ftrl.c   | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/ftrl.h   |  57 ++++++++++
 src/matrix.h |   5 +
 3 files changed, 348 insertions(+)
 create mode 100644 src/ftrl.c
 create mode 100644 src/ftrl.h

diff --git a/src/ftrl.c b/src/ftrl.c
new file mode 100644
index 00000000..658d764a
--- /dev/null
+++ b/src/ftrl.c
@@ -0,0 +1,286 @@
+#include "ftrl.h"
+#include "log/log.h"
+
+// Follow-the-regularized-leader (FTRL) Proximal
+
+ftrl_trainer_t *ftrl_trainer_new(size_t m, size_t n, bool fit_intercept, double alpha, double beta, double lambda1, double lambda2) {
+    ftrl_trainer_t *trainer = malloc(sizeof(ftrl_trainer_t));
+    if (trainer == NULL) return NULL;
+
+    // NULL out pointer members first so the error path can call
+    // ftrl_trainer_destroy without freeing uninitialized memory
+    trainer->learning_rates = NULL;
+
+    trainer->z = double_matrix_new_zeros(m, n);
+    if (trainer->z == NULL) {
+        goto exit_ftrl_trainer_created;
+    }
+
+    trainer->fit_intercept = fit_intercept;
+    trainer->alpha = alpha;
+    trainer->beta = beta;
+    trainer->lambda1 = lambda1;
+    trainer->lambda2 = lambda2;
+
+    trainer->num_features = m;
+
+    trainer->learning_rates = double_array_new_zeros(m);
+    if (trainer->learning_rates == NULL) {
+        goto exit_ftrl_trainer_created;
+    }
+
+    return trainer;
+exit_ftrl_trainer_created:
+    ftrl_trainer_destroy(trainer);
+    return NULL;
+}
+
+bool ftrl_trainer_reset_params(ftrl_trainer_t *self, double alpha, double beta, double lambda1, double lambda2) {
+    if (self == NULL || self->z == NULL || self->learning_rates == NULL) return false;
+
+    double_matrix_zero(self->z);
+    double_array_zero(self->learning_rates->a, self->learning_rates->n);
+
+    self->alpha = alpha;
+    self->beta = beta;
+    self->lambda1 = lambda1;
+    self->lambda2 = lambda2;
+    return true;
+}
+
+bool ftrl_trainer_extend(ftrl_trainer_t *self, size_t m) {
+    if (self == NULL || self->z == NULL || self->learning_rates == NULL) return false;
+
+    if (!double_matrix_resize_fill_zeros(self->z, m, self->z->n) ||
+        !double_array_resize_fill_zeros(self->learning_rates, m)) {
+        return false;
+    }
+
+    self->num_features = m;
+
+    return true;
+}
+
+bool ftrl_set_weights(ftrl_trainer_t *self, double_matrix_t *w, uint32_array *indices) {
+    if (self == NULL || w == NULL) return false;
+
+    size_t m = self->z->m;
+    size_t n = self->z->n;
+
+    size_t num_indices = m;
+
+    if (indices != NULL) {
+        num_indices = indices->n;
+    }
+
+    if (!double_matrix_resize(w, num_indices, n)) {
+        log_error("Resizing weights failed\n");
+        return false;
+    }
+
+    double lambda1 = self->lambda1;
+    double lambda2 = self->lambda2;
+
+    double_matrix_t *z = self->z;
+    double *learning_rates = self->learning_rates->a;
+
+    double alpha = self->alpha;
+    double beta = self->beta;
+
+    uint32_t *row_indices = NULL;
+
+    if (indices != NULL) {
+        row_indices = indices->a;
+    }
+
+    uint32_t row_idx;
+    size_t i_start = self->fit_intercept ? 1 : 0;
+
+    for (size_t i = 0; i < num_indices; i++) {
+        if (indices != NULL) {
+            row_idx = row_indices[i];
+        } else {
+            row_idx = (uint32_t)i;
+        }
+
+        double *row = double_matrix_get_row(z, (size_t)row_idx);
+        double lr = learning_rates[row_idx];
+        double *weights_row = double_matrix_get_row(w, i);
+
+        if (row_idx >= i_start) {
+            for (size_t j = 0; j < n; j++) {
+                double z_ij = row[j];
+                double sign_z_ij = sign(z_ij);
+                if (sign_z_ij * z_ij > lambda1) {
+                    double w_ij = -(1.0 / (((beta + sqrt(lr)) / alpha) + lambda2)) * (z_ij - sign_z_ij * lambda1);
+                    weights_row[j] = w_ij;
+                } else {
+                    weights_row[j] = 0.0;
+                }
+            }
+        } else {
+            // Intercept row: no L1/L2 regularization applied
+            for (size_t j = 0; j < n; j++) {
+                double z_ij = row[j];
+                double w_ij = -(1.0 / ((beta + sqrt(lr)) / alpha)) * z_ij;
+                weights_row[j] = w_ij;
+            }
+        }
+    }
+
+    return true;
+}
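+
+/*
+Editor's note on the closed form above (it mirrors the per-coordinate
+solution in the referenced paper; n_i below denotes learning_rates[i],
+the accumulated sum of squared gradients for feature i):
+
+    w_ij = 0                                            if |z_ij| <= lambda1
+    w_ij = -(z_ij - sign(z_ij) * lambda1)
+           / ((beta + sqrt(n_i)) / alpha + lambda2)     otherwise
+
+A coordinate gets a nonzero weight only once |z_ij| exceeds lambda1,
+which is what makes the solution sparse.
+*/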
+
+bool ftrl_update_gradient(ftrl_trainer_t *self, double_matrix_t *gradient, double_matrix_t *weights, uint32_array *indices, size_t batch_size) {
+    if (self == NULL || gradient == NULL || weights == NULL || indices == NULL) {
+        log_error("self, gradient, weights and indices must not be NULL\n");
+        return false;
+    }
+
+    if (gradient->m != weights->m || gradient->n != weights->n) {
+        log_error("gradient->m = %zu, gradient->n = %zu, weights->m = %zu, weights->n = %zu\n", gradient->m, gradient->n, weights->m, weights->n);
+        return false;
+    }
+
+    size_t m = self->z->m;
+    size_t n = self->z->n;
+
+    size_t num_indices = indices->n;
+    uint32_t *row_indices = indices->a;
+
+    double_matrix_t *z = self->z;
+
+    double *learning_rates = self->learning_rates->a;
+
+    double alpha = self->alpha;
+
+    for (size_t i = 0; i < num_indices; i++) {
+        uint32_t row_idx = row_indices[i];
+        if (row_idx >= m) {
+            log_error("row_idx = %u, m = %zu\n", row_idx, m);
+            return false;
+        }
+        double lr = learning_rates[row_idx];
+
+        double *weights_row = double_matrix_get_row(weights, i);
+        double *gradient_row = double_matrix_get_row(gradient, i);
+        double *z_row = double_matrix_get_row(z, row_idx);
+
+        // Accumulate squared gradients across all n classes into a
+        // single per-feature learning rate
+        double lr_update = lr;
+
+        for (size_t j = 0; j < n; j++) {
+            double grad_ij = gradient_row[j];
+            lr_update += grad_ij * grad_ij;
+        }
+
+        double sigma = (1.0 / (alpha * batch_size)) * (sqrt(lr_update) - sqrt(lr));
+
+        for (size_t j = 0; j < n; j++) {
+            double z_ij_update = gradient_row[j] - sigma * weights_row[j];
+            z_row[j] += z_ij_update;
+        }
+
+        learning_rates[row_idx] = lr_update;
+    }
+
+    return true;
+}
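+
+/*
+Editor's note: the loop above is the paper's update
+
+    sigma_i = (sqrt(n_i + g_i^2) - sqrt(n_i)) / alpha
+    z_i    += g_i - sigma_i * w_i
+
+adapted for the multinomial case: one n_i is shared by all n classes of
+feature i, and sigma is additionally divided by batch_size since the
+gradient here is accumulated over a mini-batch rather than a single
+example.
+*/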
+
+double ftrl_reg_cost(ftrl_trainer_t *self, double_matrix_t *theta, uint32_array *indices, size_t batch_size) {
+    if (self == NULL || theta == NULL || indices == NULL) return 0.0;
+
+    double cost = 0.0;
+
+    size_t m = theta->m;
+    size_t n = theta->n;
+
+    double lambda1 = self->lambda1;
+    double lambda2 = self->lambda2;
+
+    size_t i_start = self->fit_intercept ? 1 : 0;
+
+    double l2_cost = 0.0;
+    double l1_cost = 0.0;
+
+    for (size_t i = 0; i < m; i++) {
+        uint32_t row_idx = indices->a[i];
+        // The intercept row is unregularized, so it contributes no cost
+        if (row_idx >= i_start) {
+            double *theta_i = double_matrix_get_row(theta, i);
+
+            l2_cost += double_array_l2_norm(theta_i, n);
+            l1_cost += double_array_l1_norm(theta_i, n);
+        }
+    }
+
+    cost += lambda2 / 2.0 * l2_cost;
+    cost += lambda1 * l1_cost;
+
+    return cost;
+}
+
+double_matrix_t *ftrl_weights_finalize(ftrl_trainer_t *self) {
+    // Overwrite z in place with the final weights, then transfer
+    // ownership of the matrix to the caller
+    if (!ftrl_set_weights(self, self->z, NULL)) {
+        return NULL;
+    }
+
+    double_matrix_t *weights = self->z;
+    self->z = NULL;
+    return weights;
+}
+
+sparse_matrix_t *ftrl_weights_finalize_sparse(ftrl_trainer_t *self) {
+    if (self == NULL || self->z == NULL) return NULL;
+
+    size_t m = self->z->m;
+    size_t n = self->z->n;
+
+    double *learning_rates = self->learning_rates->a;
+
+    double alpha = self->alpha;
+    double beta = self->beta;
+    double lambda1 = self->lambda1;
+    double lambda2 = self->lambda2;
+
+    sparse_matrix_t *weights = sparse_matrix_new();
+    if (weights == NULL) return NULL;
+
+    log_info("weights->m = %zu\n", weights->m);
+
+    size_t i_start = 0;
+
+    if (self->fit_intercept) {
+        // Intercept row is stored densely (it is unregularized)
+        double *row = double_matrix_get_row(self->z, 0);
+        double lr = learning_rates[0];
+
+        for (size_t j = 0; j < n; j++) {
+            double z_ij = row[j];
+            double w_ij = -(1.0 / ((beta + sqrt(lr)) / alpha)) * z_ij;
+            sparse_matrix_append(weights, j, w_ij);
+        }
+        sparse_matrix_finalize_row(weights);
+        i_start = 1;
+    }
+    log_info("after intercept weights->m = %zu\n", weights->m);
+
+    for (size_t i = i_start; i < m; i++) {
+        double *row = double_matrix_get_row(self->z, i);
+        double lr = learning_rates[i];
+        for (size_t j = 0; j < n; j++) {
+            double z_ij = row[j];
+            double sign_z_ij = sign(z_ij);
+            // Only store coordinates that survive the L1 threshold
+            if (sign_z_ij * z_ij > lambda1) {
+                double w_ij = -(1.0 / (((beta + sqrt(lr)) / alpha) + lambda2)) * (z_ij - sign_z_ij * lambda1);
+                sparse_matrix_append(weights, j, w_ij);
+            }
+        }
+        sparse_matrix_finalize_row(weights);
+
+        if (i % 1000 == 0 && i > 0) {
+            log_info("adding rows, weights->m = %zu\n", weights->m);
+        }
+    }
+
+    return weights;
+}
+
+void ftrl_trainer_destroy(ftrl_trainer_t *self) {
+    if (self == NULL) return;
+
+    if (self->z != NULL) {
+        double_matrix_destroy(self->z);
+    }
+
+    if (self->learning_rates != NULL) {
+        double_array_destroy(self->learning_rates);
+    }
+
+    free(self);
+}
\ No newline at end of file
diff --git a/src/ftrl.h b/src/ftrl.h
new file mode 100644
index 00000000..08538ca3
--- /dev/null
+++ b/src/ftrl.h
@@ -0,0 +1,57 @@
+/*
+Follow-the-regularized-leader (FTRL) Proximal inference
+
+This is a regularization scheme for online gradient descent
+which produces sparse solutions.
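+
+Each feature row keeps two pieces of accumulated state: z (the adjusted
+gradient sums) and a per-feature learning rate (the sum of squared
+gradients). Weights are materialized from z on demand with a closed-form
+soft-thresholding step, so any coordinate whose |z| stays below the L1
+threshold lambda1 remains exactly zero.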
+
+From: https://research.google.com/pubs/archive/41159.pdf
+*/
+
+#ifndef FTRL_H
+#define FTRL_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "float_utils.h"
+#include "matrix.h"
+#include "sparse_matrix.h"
+
+typedef struct ftrl_trainer {
+    // Dense matrix of accumulated z values, only needed at training time
+    double_matrix_t *z;
+    // Whether to treat feature_id = 0 as the intercept
+    bool fit_intercept;
+    // Number of features (may grow via ftrl_trainer_extend)
+    size_t num_features;
+    // alpha hyperparameter (learning rate scale)
+    double alpha;
+    // beta hyperparameter (learning rate smoothing)
+    double beta;
+    // Coefficient for the L1 regularization term
+    double lambda1;
+    // Coefficient for the L2 regularization term
+    double lambda2;
+    // Per-feature learning rates (accumulated squared gradients)
+    double_array *learning_rates;
+} ftrl_trainer_t;
+
+ftrl_trainer_t *ftrl_trainer_new(size_t m, size_t n, bool fit_intercept, double alpha, double beta, double lambda1, double lambda2);
+
+bool ftrl_trainer_reset_params(ftrl_trainer_t *self, double alpha, double beta, double lambda1, double lambda2);
+
+bool ftrl_trainer_extend(ftrl_trainer_t *trainer, size_t m);
+
+bool ftrl_set_weights(ftrl_trainer_t *trainer, double_matrix_t *w, uint32_array *indices);
+
+double ftrl_reg_cost(ftrl_trainer_t *self, double_matrix_t *theta, uint32_array *indices, size_t batch_size);
+
+bool ftrl_update_gradient(ftrl_trainer_t *self, double_matrix_t *gradient, double_matrix_t *weights, uint32_array *indices, size_t batch_size);
+
+double_matrix_t *ftrl_weights_finalize(ftrl_trainer_t *trainer);
+sparse_matrix_t *ftrl_weights_finalize_sparse(ftrl_trainer_t *trainer);
+
+void ftrl_trainer_destroy(ftrl_trainer_t *self);
+
+#endif
\ No newline at end of file
diff --git a/src/matrix.h b/src/matrix.h
index e8b03e84..d0f43a71 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -19,6 +19,11 @@
 #warning "No CLBAS"
 #endif
 
+typedef enum {
+    MATRIX_DENSE,
+    MATRIX_SPARSE
+} matrix_type_t;
+
 #define MATRIX_INIT(name, type, type_name, array_type) \
     typedef struct {                                    \
         size_t m, n;                                    \
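
Editor's note: a minimal usage sketch of the API this patch adds. The
feature counts, hyperparameters, and compute_batch_gradient() are
hypothetical stand-ins for the caller's own setup and loss gradient, and
for simplicity the same active-feature set is reused for every batch;
only the ftrl_* and matrix calls come from this patch and the existing
codebase.

    #include "ftrl.h"

    // Hypothetical caller-provided function: fills `gradient` with
    // d(loss)/dw for the feature rows listed in `indices`
    extern void compute_batch_gradient(double_matrix_t *w, uint32_array *indices,
                                       double_matrix_t *gradient, size_t batch_size);

    sparse_matrix_t *train_ftrl_example(uint32_array *indices, size_t batch_size, size_t num_batches) {
        size_t m = 100, n = 5;  // e.g. 100 features, 5 classes
        ftrl_trainer_t *trainer = ftrl_trainer_new(m, n, true, 0.1, 1.0, 1.0, 0.0);
        if (trainer == NULL) return NULL;

        double_matrix_t *w = double_matrix_new_zeros(indices->n, n);
        double_matrix_t *gradient = double_matrix_new_zeros(indices->n, n);

        for (size_t b = 0; b < num_batches; b++) {
            // Lazily materialize current weights for the active rows only
            ftrl_set_weights(trainer, w, indices);
            compute_batch_gradient(w, indices, gradient, batch_size);
            // Fold the batch gradient into z and the per-feature learning rates
            ftrl_update_gradient(trainer, gradient, w, indices, batch_size);
        }

        // Take ownership of the final weights in sparse form
        sparse_matrix_t *weights = ftrl_weights_finalize_sparse(trainer);

        double_matrix_destroy(w);
        double_matrix_destroy(gradient);
        ftrl_trainer_destroy(trainer);
        return weights;
    }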