From cf88bc7f653818e2788ababaf1caee253d4402ac Mon Sep 17 00:00:00 2001
From: Al
Date: Sun, 2 Apr 2017 14:28:25 -0400
Subject: [PATCH] [optimization] implemented Google's FTRL-Proximal, adapted
 for the multiclass/multinomial case. It is L1- and L2-regularized: it should
 encourage sparsity through the L1 penalty while remaining robust to
 collinear features through the L2 penalty.
 Ref: https://research.google.com/pubs/archive/41159.pdf
---
 src/ftrl.c   | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/ftrl.h   |  57 ++++++++++
 src/matrix.h |   5 +
 3 files changed, 348 insertions(+)
 create mode 100644 src/ftrl.c
 create mode 100644 src/ftrl.h

diff --git a/src/ftrl.c b/src/ftrl.c
new file mode 100644
index 00000000..658d764a
--- /dev/null
+++ b/src/ftrl.c
@@ -0,0 +1,286 @@
+#include "ftrl.h"
+#include "log/log.h"
+
+// Follow-the-regularized-leader (FTRL) Proximal
+
+ftrl_trainer_t *ftrl_trainer_new(size_t m, size_t n, bool fit_intercept, double alpha, double beta, double lambda1, double lambda2) {
+    ftrl_trainer_t *trainer = malloc(sizeof(ftrl_trainer_t));
+    if (trainer == NULL) return NULL;
+
+    // NULL out pointer members first so the error path can call
+    // ftrl_trainer_destroy without freeing uninitialized memory
+    trainer->learning_rates = NULL;
+
+    trainer->z = double_matrix_new_zeros(m, n);
+    if (trainer->z == NULL) {
+        goto exit_ftrl_trainer_created;
+    }
+
+    trainer->fit_intercept = fit_intercept;
+    trainer->alpha = alpha;
+    trainer->beta = beta;
+    trainer->lambda1 = lambda1;
+    trainer->lambda2 = lambda2;
+
+    trainer->num_features = m;
+
+    trainer->learning_rates = double_array_new_zeros(m);
+    if (trainer->learning_rates == NULL) {
+        goto exit_ftrl_trainer_created;
+    }
+
+    return trainer;
+exit_ftrl_trainer_created:
+    ftrl_trainer_destroy(trainer);
+    return NULL;
+}
+
+bool ftrl_trainer_reset_params(ftrl_trainer_t *self, double alpha, double beta, double lambda1, double lambda2) {
+    if (self == NULL || self->z == NULL || self->learning_rates == NULL) return false;
+
+    double_matrix_zero(self->z);
+    double_array_zero(self->learning_rates->a, self->learning_rates->n);
+
+    self->alpha = alpha;
+    self->beta = beta;
+    self->lambda1 = lambda1;
+    self->lambda2 = lambda2;
+    return true;
+}
+
+bool ftrl_trainer_extend(ftrl_trainer_t *self, size_t m) {
+    if (self == NULL || self->z == NULL || self->learning_rates == NULL) return false;
+
+    if (!double_matrix_resize_fill_zeros(self->z, m, self->z->n) ||
+        !double_array_resize_fill_zeros(self->learning_rates, m)) {
+        return false;
+    }
+
+    self->num_features = m;
+
+    return true;
+}
+
+bool ftrl_set_weights(ftrl_trainer_t *self, double_matrix_t *w, uint32_array *indices) {
+    if (self == NULL || w == NULL) return false;
+
+    size_t m = self->z->m;
+    size_t n = self->z->n;
+
+    size_t num_indices = m;
+
+    if (indices != NULL) {
+        num_indices = indices->n;
+    }
+
+    if (!double_matrix_resize(w, num_indices, n)) {
+        log_error("Resizing weights failed\n");
+        return false;
+    }
+
+    double lambda1 = self->lambda1;
+    double lambda2 = self->lambda2;
+
+    double_matrix_t *z = self->z;
+    double *learning_rates = self->learning_rates->a;
+
+    double alpha = self->alpha;
+    double beta = self->beta;
+
+    uint32_t *row_indices = NULL;
+
+    if (indices != NULL) {
+        row_indices = indices->a;
+    }
+
+    uint32_t row_idx;
+    size_t i_start = self->fit_intercept ? 1 : 0;
+
+    for (size_t i = 0; i < num_indices; i++) {
+        if (indices != NULL) {
+            row_idx = row_indices[i];
+        } else {
+            row_idx = (uint32_t)i;
+        }
+
+        double *row = double_matrix_get_row(z, (size_t)row_idx);
+        double lr = learning_rates[row_idx];
+        double *weights_row = double_matrix_get_row(w, i);
+
+        if (row_idx >= i_start) {
+            for (size_t j = 0; j < n; j++) {
+                double z_ij = row[j];
+                double sign_z_ij = sign(z_ij);
+                if (sign_z_ij * z_ij > lambda1) {
+                    double w_ij = -(1.0 / (((beta + sqrt(lr)) / alpha) + lambda2)) * (z_ij - sign_z_ij * lambda1);
+                    weights_row[j] = w_ij;
+                } else {
+                    weights_row[j] = 0.0;
+                }
+            }
+        } else {
+            // Intercept row: no L1/L2 regularization applied
+            for (size_t j = 0; j < n; j++) {
+                double z_ij = row[j];
+                double w_ij = -(1.0 / ((beta + sqrt(lr)) / alpha)) * z_ij;
+                weights_row[j] = w_ij;
+            }
+        }
+    }
+
+    return true;
+}
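+
+/*
+Editor's note on the closed form above (it mirrors the per-coordinate
+solution in the referenced paper; n_i below denotes learning_rates[i],
+the accumulated sum of squared gradients for feature i):
+
+    w_ij = 0                                            if |z_ij| <= lambda1
+    w_ij = -(z_ij - sign(z_ij) * lambda1)
+           / ((beta + sqrt(n_i)) / alpha + lambda2)     otherwise
+
+A coordinate gets a nonzero weight only once |z_ij| exceeds lambda1,
+which is what makes the solution sparse.
+*/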
+
+bool ftrl_update_gradient(ftrl_trainer_t *self, double_matrix_t *gradient, double_matrix_t *weights, uint32_array *indices, size_t batch_size) {
+    if (self == NULL || gradient == NULL || weights == NULL || indices == NULL) {
+        log_error("self, gradient, weights and indices must not be NULL\n");
+        return false;
+    }
+
+    if (gradient->m != weights->m || gradient->n != weights->n) {
+        log_error("gradient->m = %zu, gradient->n = %zu, weights->m = %zu, weights->n = %zu\n", gradient->m, gradient->n, weights->m, weights->n);
+        return false;
+    }
+
+    size_t m = self->z->m;
+    size_t n = self->z->n;
+
+    size_t num_indices = indices->n;
+    uint32_t *row_indices = indices->a;
+
+    double_matrix_t *z = self->z;
+
+    double *learning_rates = self->learning_rates->a;
+
+    double alpha = self->alpha;
+
+    for (size_t i = 0; i < num_indices; i++) {
+        uint32_t row_idx = row_indices[i];
+        if (row_idx >= m) {
+            log_error("row_idx = %u, m = %zu\n", row_idx, m);
+            return false;
+        }
+        double lr = learning_rates[row_idx];
+
+        double *weights_row = double_matrix_get_row(weights, i);
+        double *gradient_row = double_matrix_get_row(gradient, i);
+        double *z_row = double_matrix_get_row(z, row_idx);
+
+        // Accumulate squared gradients across all n classes into a
+        // single per-feature learning rate
+        double lr_update = lr;
+
+        for (size_t j = 0; j < n; j++) {
+            double grad_ij = gradient_row[j];
+            lr_update += grad_ij * grad_ij;
+        }
+
+        double sigma = (1.0 / (alpha * batch_size)) * (sqrt(lr_update) - sqrt(lr));
+
+        for (size_t j = 0; j < n; j++) {
+            double z_ij_update = gradient_row[j] - sigma * weights_row[j];
+            z_row[j] += z_ij_update;
+        }
+
+        learning_rates[row_idx] = lr_update;
+    }
+
+    return true;
+}
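+
+/*
+Editor's note: the loop above is the paper's update
+
+    sigma_i = (sqrt(n_i + g_i^2) - sqrt(n_i)) / alpha
+    z_i    += g_i - sigma_i * w_i
+
+adapted for the multinomial case: one n_i is shared by all n classes of
+feature i, and sigma is additionally divided by batch_size since the
+gradient here is accumulated over a mini-batch rather than a single
+example.
+*/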
+
+double ftrl_reg_cost(ftrl_trainer_t *self, double_matrix_t *theta, uint32_array *indices, size_t batch_size) {
+    if (self == NULL || theta == NULL || indices == NULL) return 0.0;
+
+    double cost = 0.0;
+
+    size_t m = theta->m;
+    size_t n = theta->n;
+
+    double lambda1 = self->lambda1;
+    double lambda2 = self->lambda2;
+
+    size_t i_start = self->fit_intercept ? 1 : 0;
+
+    double l2_cost = 0.0;
+    double l1_cost = 0.0;
+
+    for (size_t i = 0; i < m; i++) {
+        uint32_t row_idx = indices->a[i];
+        // The intercept row is unregularized, so it contributes no cost
+        if (row_idx >= i_start) {
+            double *theta_i = double_matrix_get_row(theta, i);
+
+            l2_cost += double_array_l2_norm(theta_i, n);
+            l1_cost += double_array_l1_norm(theta_i, n);
+        }
+    }
+
+    cost += lambda2 / 2.0 * l2_cost;
+    cost += lambda1 * l1_cost;
+
+    return cost;
+}
+
+double_matrix_t *ftrl_weights_finalize(ftrl_trainer_t *self) {
+    // Overwrite z in place with the final weights, then transfer
+    // ownership of the matrix to the caller
+    if (!ftrl_set_weights(self, self->z, NULL)) {
+        return NULL;
+    }
+
+    double_matrix_t *weights = self->z;
+    self->z = NULL;
+    return weights;
+}
+
+sparse_matrix_t *ftrl_weights_finalize_sparse(ftrl_trainer_t *self) {
+    if (self == NULL || self->z == NULL) return NULL;
+
+    size_t m = self->z->m;
+    size_t n = self->z->n;
+
+    double *learning_rates = self->learning_rates->a;
+
+    double alpha = self->alpha;
+    double beta = self->beta;
+    double lambda1 = self->lambda1;
+    double lambda2 = self->lambda2;
+
+    sparse_matrix_t *weights = sparse_matrix_new();
+    if (weights == NULL) return NULL;
+
+    log_info("weights->m = %zu\n", weights->m);
+
+    size_t i_start = 0;
+
+    if (self->fit_intercept) {
+        // Intercept row is stored densely (it is unregularized)
+        double *row = double_matrix_get_row(self->z, 0);
+        double lr = learning_rates[0];
+
+        for (size_t j = 0; j < n; j++) {
+            double z_ij = row[j];
+            double w_ij = -(1.0 / ((beta + sqrt(lr)) / alpha)) * z_ij;
+            sparse_matrix_append(weights, j, w_ij);
+        }
+        sparse_matrix_finalize_row(weights);
+        i_start = 1;
+    }
+    log_info("after intercept weights->m = %zu\n", weights->m);
+
+    for (size_t i = i_start; i < m; i++) {
+        double *row = double_matrix_get_row(self->z, i);
+        double lr = learning_rates[i];
+        for (size_t j = 0; j < n; j++) {
+            double z_ij = row[j];
+            double sign_z_ij = sign(z_ij);
+            // Only store coordinates that survive the L1 threshold
+            if (sign_z_ij * z_ij > lambda1) {
+                double w_ij = -(1.0 / (((beta + sqrt(lr)) / alpha) + lambda2)) * (z_ij - sign_z_ij * lambda1);
+                sparse_matrix_append(weights, j, w_ij);
+            }
+        }
+        sparse_matrix_finalize_row(weights);
+
+        if (i % 1000 == 0 && i > 0) {
+            log_info("adding rows, weights->m = %zu\n", weights->m);
+        }
+    }
+
+    return weights;
+}
+
+void ftrl_trainer_destroy(ftrl_trainer_t *self) {
+    if (self == NULL) return;
+
+    if (self->z != NULL) {
+        double_matrix_destroy(self->z);
+    }
+
+    if (self->learning_rates != NULL) {
+        double_array_destroy(self->learning_rates);
+    }
+
+    free(self);
+}
\ No newline at end of file
diff --git a/src/ftrl.h b/src/ftrl.h
new file mode 100644
index 00000000..08538ca3
--- /dev/null
+++ b/src/ftrl.h
@@ -0,0 +1,57 @@
+/*
+Follow-the-regularized-leader (FTRL) Proximal inference
+
+This is a regularization scheme for online gradient descent
+which produces sparse solutions.
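+
+Each feature row keeps two pieces of accumulated state: z (the adjusted
+gradient sums) and a per-feature learning rate (the sum of squared
+gradients). Weights are materialized from z on demand with a closed-form
+soft-thresholding step, so any coordinate whose |z| stays below the L1
+threshold lambda1 remains exactly zero.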
+
+From: https://research.google.com/pubs/archive/41159.pdf
+*/
+
+#ifndef FTRL_H
+#define FTRL_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "float_utils.h"
+#include "matrix.h"
+#include "sparse_matrix.h"
+
+typedef struct ftrl_trainer {
+    // Dense matrix of accumulated z values, only needed at training time
+    double_matrix_t *z;
+    // Whether to treat feature_id = 0 as the intercept
+    bool fit_intercept;
+    // Number of features (may grow via ftrl_trainer_extend)
+    size_t num_features;
+    // alpha hyperparameter (learning rate scale)
+    double alpha;
+    // beta hyperparameter (learning rate smoothing)
+    double beta;
+    // Coefficient for the L1 regularization term
+    double lambda1;
+    // Coefficient for the L2 regularization term
+    double lambda2;
+    // Per-feature learning rates (accumulated squared gradients)
+    double_array *learning_rates;
+} ftrl_trainer_t;
+
+ftrl_trainer_t *ftrl_trainer_new(size_t m, size_t n, bool fit_intercept, double alpha, double beta, double lambda1, double lambda2);
+
+bool ftrl_trainer_reset_params(ftrl_trainer_t *self, double alpha, double beta, double lambda1, double lambda2);
+
+bool ftrl_trainer_extend(ftrl_trainer_t *trainer, size_t m);
+
+bool ftrl_set_weights(ftrl_trainer_t *trainer, double_matrix_t *w, uint32_array *indices);
+
+double ftrl_reg_cost(ftrl_trainer_t *self, double_matrix_t *theta, uint32_array *indices, size_t batch_size);
+
+bool ftrl_update_gradient(ftrl_trainer_t *self, double_matrix_t *gradient, double_matrix_t *weights, uint32_array *indices, size_t batch_size);
+
+double_matrix_t *ftrl_weights_finalize(ftrl_trainer_t *trainer);
+sparse_matrix_t *ftrl_weights_finalize_sparse(ftrl_trainer_t *trainer);
+
+void ftrl_trainer_destroy(ftrl_trainer_t *self);
+
+#endif
\ No newline at end of file
diff --git a/src/matrix.h b/src/matrix.h
index e8b03e84..d0f43a71 100644
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -19,6 +19,11 @@
 #warning "No CLBAS"
 #endif
 
+typedef enum {
+    MATRIX_DENSE,
+    MATRIX_SPARSE
+} matrix_type_t;
+
 #define MATRIX_INIT(name, type, type_name, array_type) \
     typedef struct {                                    \
         size_t m, n;                                    \
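
Editor's note: a minimal usage sketch of the API this patch adds. The
feature counts, hyperparameters, and compute_batch_gradient() are
hypothetical stand-ins for the caller's own setup and loss gradient, and
for simplicity the same active-feature set is reused for every batch;
only the ftrl_* and matrix calls come from this patch and the existing
codebase.

    #include "ftrl.h"

    // Hypothetical caller-provided function: fills `gradient` with
    // d(loss)/dw for the feature rows listed in `indices`
    extern void compute_batch_gradient(double_matrix_t *w, uint32_array *indices,
                                       double_matrix_t *gradient, size_t batch_size);

    sparse_matrix_t *train_ftrl_example(uint32_array *indices, size_t batch_size, size_t num_batches) {
        size_t m = 100, n = 5;  // e.g. 100 features, 5 classes
        ftrl_trainer_t *trainer = ftrl_trainer_new(m, n, true, 0.1, 1.0, 1.0, 0.0);
        if (trainer == NULL) return NULL;

        double_matrix_t *w = double_matrix_new_zeros(indices->n, n);
        double_matrix_t *gradient = double_matrix_new_zeros(indices->n, n);

        for (size_t b = 0; b < num_batches; b++) {
            // Lazily materialize current weights for the active rows only
            ftrl_set_weights(trainer, w, indices);
            compute_batch_gradient(w, indices, gradient, batch_size);
            // Fold the batch gradient into z and the per-feature learning rates
            ftrl_update_gradient(trainer, gradient, w, indices, batch_size);
        }

        // Take ownership of the final weights in sparse form
        sparse_matrix_t *weights = ftrl_weights_finalize_sparse(trainer);

        double_matrix_destroy(w);
        double_matrix_destroy(gradient);
        ftrl_trainer_destroy(trainer);
        return weights;
    }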