From 64c049730a083272257e7c648cd59d912a749e7d Mon Sep 17 00:00:00 2001
From: Al
Date: Sun, 2 Apr 2017 14:30:14 -0400
Subject: [PATCH] [classification] flexible logistic regression trainer that
 can handle either SGD (with L1 or L2 regularization) or FTRL as optimizers

The trainer now delegates optimization to a tagged union holding either a
minibatch SGD trainer or an FTRL-Proximal trainer. The full weight matrix
lives inside the optimizer objects; the trainer itself keeps only a gradient
and a batch-weights matrix that are resized to the unique feature columns of
each minibatch.

---
 src/logistic_regression_trainer.c | 268 +++++++++++++++++++++++-------
 src/logistic_regression_trainer.h |  48 ++++--
 2 files changed, 241 insertions(+), 75 deletions(-)

diff --git a/src/logistic_regression_trainer.c b/src/logistic_regression_trainer.c
index f2be12f0..929441cd 100644
--- a/src/logistic_regression_trainer.c
+++ b/src/logistic_regression_trainer.c
@@ -1,6 +1,8 @@
 #include "logistic_regression_trainer.h"
 #include "sparse_matrix_utils.h"
 
+#define INITIAL_FEATURE_BATCH_SIZE 1024
+
 void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) {
     if (self == NULL) return;
 
@@ -12,22 +14,18 @@ void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) {
         kh_destroy(str_uint32, self->label_ids);
     }
 
-    if (self->weights != NULL) {
-        double_matrix_destroy(self->weights);
-    }
-
-    if (self->last_updated != NULL) {
-        uint32_array_destroy(self->last_updated);
-    }
-
     if (self->unique_columns != NULL) {
-        kh_destroy(int_set, self->unique_columns);
+        kh_destroy(int_uint32, self->unique_columns);
     }
 
     if (self->batch_columns != NULL) {
         uint32_array_destroy(self->batch_columns);
     }
 
+    if (self->batch_weights != NULL) {
+        double_matrix_destroy(self->batch_weights);
+    }
+
     if (self->gradient != NULL) {
         double_matrix_destroy(self->gradient);
     }
@@ -35,7 +33,7 @@ void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) {
     free(self);
 }
 
-logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_ids, khash_t(str_uint32) *label_ids, double gamma_0, double lambda) {
+static logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_ids, khash_t(str_uint32) *label_ids) {
     if (feature_ids == NULL || label_ids == NULL) return NULL;
 
     logistic_regression_trainer_t *trainer = malloc(sizeof(logistic_regression_trainer_t));
@@ -48,19 +46,26 @@ logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_
     trainer->label_ids = label_ids;
     trainer->num_labels = kh_size(label_ids);
 
-    trainer->weights = double_matrix_new_zeros(trainer->num_features, trainer->num_labels);
+    trainer->gradient = double_matrix_new_zeros(INITIAL_FEATURE_BATCH_SIZE, trainer->num_labels);
+    if (trainer->gradient == NULL) {
+        goto exit_trainer_created;
+    }
 
-    trainer->gradient = double_matrix_new_zeros(trainer->num_features, trainer->num_labels);
+    trainer->unique_columns = kh_init(int_uint32);
+    if (trainer->unique_columns == NULL) {
+        goto exit_trainer_created;
+    }
 
-    trainer->unique_columns = kh_init(int_set);
-    trainer->batch_columns = uint32_array_new_size(trainer->num_features);
+    trainer->batch_columns = uint32_array_new_size(INITIAL_FEATURE_BATCH_SIZE);
+    if (trainer->batch_columns == NULL) {
+        goto exit_trainer_created;
+    }
 
-    trainer->last_updated = uint32_array_new_zeros(trainer->num_features);
+    trainer->batch_weights = double_matrix_new_zeros(INITIAL_FEATURE_BATCH_SIZE, trainer->num_labels);
+    if (trainer->batch_weights == NULL) {
+        goto exit_trainer_created;
+    }
 
-    trainer->lambda = lambda;
-    trainer->iters = 0;
     trainer->epochs = 0;
-    trainer->gamma_0 = gamma_0;
 
     return trainer;
 
@@ -69,70 +74,144 @@ exit_trainer_created:
     return NULL;
 }
 
-
-static double_matrix_t *model_expectation(sparse_matrix_t *x, double_matrix_t *theta) {
-    double_matrix_t *p_y = double_matrix_new_zeros(x->m, theta->n);
-    if (p_y == NULL) return NULL;
-
-    if(logistic_regression_model_expectation(theta, x, p_y)) {
-        return p_y;
-    } else {
-        double_matrix_destroy(p_y);
+logistic_regression_trainer_t *logistic_regression_trainer_init_sgd(trie_t *feature_ids, khash_t(str_uint32) *label_ids, bool fit_intercept, regularization_type_t reg_type, double lambda, double gamma_0) {
+    logistic_regression_trainer_t *trainer = logistic_regression_trainer_init(feature_ids, label_ids);
+    if (trainer == NULL) {
         return NULL;
     }
+
+    trainer->optimizer_type = LOGISTIC_REGRESSION_OPTIMIZER_SGD;
+    trainer->optimizer.sgd = sgd_trainer_new(trainer->num_features, trainer->num_labels, fit_intercept, reg_type, lambda, gamma_0);
+    if (trainer->optimizer.sgd == NULL) {
+        logistic_regression_trainer_destroy(trainer);
+        return NULL;
+    }
+
+    return trainer;
 }
 
+logistic_regression_trainer_t *logistic_regression_trainer_init_ftrl(trie_t *feature_ids, khash_t(str_uint32) *label_ids, double lambda1, double lambda2, double alpha, double beta) {
+    logistic_regression_trainer_t *trainer = logistic_regression_trainer_init(feature_ids, label_ids);
+    if (trainer == NULL) {
+        return NULL;
+    }
+
+    trainer->optimizer_type = LOGISTIC_REGRESSION_OPTIMIZER_FTRL;
+    bool fit_intercept = true;
+    log_info("num_features = %zu\n", trainer->num_features);
+    trainer->optimizer.ftrl = ftrl_trainer_new(trainer->num_features, trainer->num_labels, fit_intercept, alpha, beta, lambda1, lambda2);
+    if (trainer->optimizer.ftrl == NULL) {
+        logistic_regression_trainer_destroy(trainer);
+        return NULL;
+    }
+
+    return trainer;
+}
+
+bool logistic_regression_trainer_reset_params_sgd(logistic_regression_trainer_t *self, double lambda, double gamma_0) {
+    if (self == NULL || self->optimizer_type != LOGISTIC_REGRESSION_OPTIMIZER_SGD || self->optimizer.sgd == NULL) return false;
+
+    sgd_trainer_t *sgd_trainer = self->optimizer.sgd;
+    return sgd_trainer_reset_params(sgd_trainer, lambda, gamma_0);
+}
+
+bool logistic_regression_trainer_reset_params_ftrl(logistic_regression_trainer_t *self, double alpha, double beta, double lambda1, double lambda2) {
+    if (self == NULL || self->optimizer_type != LOGISTIC_REGRESSION_OPTIMIZER_FTRL || self->optimizer.ftrl == NULL) return false;
+
+    ftrl_trainer_t *ftrl_trainer = self->optimizer.ftrl;
+    return ftrl_trainer_reset_params(ftrl_trainer, alpha, beta, lambda1, lambda2);
+}
+
-double logistic_regression_trainer_batch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
-    size_t m = self->weights->m;
-    size_t n = self->weights->n;
+double logistic_regression_trainer_minibatch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
+    size_t n = self->num_labels;
 
     sparse_matrix_t *x = feature_matrix(self->feature_ids, features);
     uint32_array *y = label_vector(self->label_ids, labels);
+    if (x == NULL || y == NULL) {
+        if (y != NULL) uint32_array_destroy(y);
+        if (x != NULL) sparse_matrix_destroy(x);
+        return -1.0;
+    }
 
-    double_matrix_t *p_y = double_matrix_new_zeros(x->m, n);
+    double_matrix_t *p_y = double_matrix_new_aligned(x->m, n, 16);
+    if (p_y == NULL) {
+        uint32_array_destroy(y);
+        sparse_matrix_destroy(x);
+        return -1.0;
+    }
+    double_matrix_zero(p_y);
 
-    double cost = logistic_regression_cost_function(self->weights, x, y, p_y, self->lambda);
+    double cost;
+    if (!sparse_matrix_add_unique_columns_alias(x, self->unique_columns, self->batch_columns)) {
+        cost = -1.0;
+        goto exit_cost_matrices_created;
+    }
 
+    double_matrix_t *weights = logistic_regression_trainer_get_weights(self);
+    if (weights == NULL) {
+        cost = -1.0;
+        goto exit_cost_matrices_created;
+    }
+
+    cost = logistic_regression_cost_function(weights, x, y, p_y);
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        sgd_trainer_t *sgd_trainer = self->optimizer.sgd;
+        double reg_cost = stochastic_gradient_descent_reg_cost(sgd_trainer, self->batch_columns, x->m);
+        cost += reg_cost;
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        ftrl_trainer_t *ftrl_trainer = self->optimizer.ftrl;
+        double reg_cost = ftrl_reg_cost(ftrl_trainer, weights, self->batch_columns, x->m);
+        cost += reg_cost;
+    }
+
+exit_cost_matrices_created:
     double_matrix_destroy(p_y);
     uint32_array_destroy(y);
     sparse_matrix_destroy(x);
     return cost;
 }
 
-bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
-    size_t m = self->weights->m;
-    size_t n = self->weights->n;
-
-    // Optimize
+bool logistic_regression_trainer_train_minibatch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels) {
     double_matrix_t *gradient = self->gradient;
 
     sparse_matrix_t *x = feature_matrix(self->feature_ids, features);
+    if (x == NULL) {
+        log_error("x == NULL\n");
+        return false;
+    }
 
     uint32_array *y = label_vector(self->label_ids, labels);
-
-    double_matrix_t *p_y = double_matrix_new_zeros(x->m, n);
+    if (y == NULL) {
+        log_error("y == NULL\n");
+        sparse_matrix_destroy(x);
+        return false;
+    }
 
     bool ret = false;
 
-    if (!sparse_matrix_add_unique_columns(x, self->unique_columns, self->batch_columns)) {
+    if (!sparse_matrix_add_unique_columns_alias(x, self->unique_columns, self->batch_columns)) {
         log_error("Unique columns failed\n");
-        goto exit_matrices_created;
+        goto exit_xy_created;
     }
 
-    if (self->lambda > 0.0 && !stochastic_gradient_descent_regularize_weights(self->weights, self->batch_columns, self->last_updated, self->iters, self->lambda, self->gamma_0)) {
-        log_error("Error regularizing weights\n");
-        goto exit_matrices_created;
+    if (!double_matrix_resize(gradient, self->batch_columns->n, self->num_labels)) {
+        log_error("Gradient resize failed\n");
+        goto exit_xy_created;
     }
 
-    if (!logistic_regression_gradient_sparse(self->weights, gradient, x, y, p_y, self->batch_columns, self->lambda)) {
+    double_matrix_t *weights = logistic_regression_trainer_get_weights(self);
+    if (weights == NULL) {
+        log_error("Error getting weights\n");
+        goto exit_xy_created;
+    }
+
+    size_t batch_size = x->m;
+
+    double_matrix_t *p_y = double_matrix_new_aligned(batch_size, self->num_labels, 16);
+    if (p_y == NULL) {
+        log_error("Error allocating p_y\n");
+        goto exit_xy_created;
+    }
+
+    if (!logistic_regression_gradient(weights, gradient, x, y, p_y)) {
         log_error("Gradient failed\n");
         goto exit_matrices_created;
     }
 
-    size_t data_len = m * n;
-
-    double gamma = stochastic_gradient_descent_gamma_t(self->gamma_0, self->lambda, self->iters);
-    ret = stochastic_gradient_descent_sparse(self->weights, gradient, self->batch_columns, gamma);
-
-    self->iters++;
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        ret = stochastic_gradient_descent_update_sparse(self->optimizer.sgd, gradient, self->batch_columns, batch_size);
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        ret = ftrl_update_gradient(self->optimizer.ftrl, gradient, weights, self->batch_columns, batch_size);
+        if (!ret) {
+            log_error("ftrl_update_gradient failed\n");
+        }
+    } else {
+        ret = false;
+    }
 
 exit_matrices_created:
     double_matrix_destroy(p_y);
@@ -141,12 +220,87 @@ exit_matrices_created:
+exit_xy_created:
     uint32_array_destroy(y);
     sparse_matrix_destroy(x);
     return ret;
 }
 
-bool logistic_regression_trainer_finalize(logistic_regression_trainer_t *self) {
-    if (self == NULL) return false;
+double_matrix_t *logistic_regression_trainer_get_weights(logistic_regression_trainer_t *self) {
+    if (self == NULL) return NULL;
 
-    if (self->lambda > 0.0) {
-        return stochastic_gradient_descent_finalize_weights(self->weights, self->last_updated, self->iters, self->lambda, self->gamma_0);
+    size_t m = self->batch_columns->n;
+    size_t n = self->num_labels;
+    double_matrix_t *batch_weights = self->batch_weights;
+    if (batch_weights == NULL || !double_matrix_resize(batch_weights, m, n)) {
+        return NULL;
+    }
+    double_matrix_zero(batch_weights);
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        if (self->optimizer.sgd == NULL) return NULL;
+        double_matrix_t *full_weights = self->optimizer.sgd->theta;
+        uint32_t *columns = self->batch_columns->a;
+
+        for (size_t i = 0; i < m; i++) {
+            uint32_t col = columns[i];
+            double *theta_row = double_matrix_get_row(full_weights, col);
+            double *row = double_matrix_get_row(batch_weights, i);
+            for (size_t j = 0; j < n; j++) {
+                row[j] = theta_row[j];
+            }
+        }
+
+        return batch_weights;
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        if (self->optimizer.ftrl == NULL) return NULL;
+
+        if (!ftrl_set_weights(self->optimizer.ftrl, batch_weights, self->batch_columns)) {
+            return NULL;
+        }
+
+        return batch_weights;
+    }
+    return NULL;
+}
+
+double_matrix_t *logistic_regression_trainer_get_regularized_weights(logistic_regression_trainer_t *self) {
+    if (self == NULL) return NULL;
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        if (self->optimizer.sgd == NULL) return NULL;
+        return stochastic_gradient_descent_get_weights(self->optimizer.sgd);
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        if (self->optimizer.ftrl == NULL) return NULL;
+        if (!ftrl_set_weights(self->optimizer.ftrl, self->batch_weights, NULL)) {
+            return NULL;
+        }
+        return self->batch_weights;
+    }
+    return NULL;
+}
+
+double_matrix_t *logistic_regression_trainer_final_weights(logistic_regression_trainer_t *self) {
+    if (self == NULL) return NULL;
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        if (self->optimizer.sgd == NULL) return NULL;
+        double_matrix_t *weights = stochastic_gradient_descent_get_weights(self->optimizer.sgd);
+        self->optimizer.sgd->theta = NULL;
+        return weights;
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        if (self->optimizer.ftrl == NULL) return NULL;
+        return ftrl_weights_finalize(self->optimizer.ftrl);
+    }
+    return NULL;
+}
+
+sparse_matrix_t *logistic_regression_trainer_final_weights_sparse(logistic_regression_trainer_t *self) {
+    if (self == NULL) return NULL;
+
+    if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_SGD) {
+        if (self->optimizer.sgd == NULL) return NULL;
+        return stochastic_gradient_descent_get_weights_sparse(self->optimizer.sgd);
+    } else if (self->optimizer_type == LOGISTIC_REGRESSION_OPTIMIZER_FTRL) {
+        if (self->optimizer.ftrl == NULL) return NULL;
+        return ftrl_weights_finalize_sparse(self->optimizer.ftrl);
     }
-    return true;
+    return NULL;
 }
diff --git a/src/logistic_regression_trainer.h b/src/logistic_regression_trainer.h
index fc7f0a8e..2a03ac3c 100644
--- a/src/logistic_regression_trainer.h
+++ b/src/logistic_regression_trainer.h
@@ -9,6 +9,7 @@
 #include "averaged_perceptron_tagger.h"
 #include "collections.h"
 #include "features.h"
+#include "ftrl.h"
 #include "logistic_regression.h"
 #include "minibatch.h"
 #include "sparse_matrix.h"
@@ -21,28 +22,39 @@
  * Helper struct for training logistic regression model
 */
 
+typedef enum {
+    LOGISTIC_REGRESSION_OPTIMIZER_SGD,
+    LOGISTIC_REGRESSION_OPTIMIZER_FTRL
+} logistic_regression_optimizer_type;
+
 typedef struct logistic_regression_trainer {
-    trie_t *feature_ids;                // Trie mapping features to array indices
-    size_t num_features;                // Number of features
-    khash_t(str_uint32) *label_ids;     // Hashtable mapping labels to array indices
-    size_t num_labels;                  // Number of labels
-    double_matrix_t *weights;           // Matrix of logistic regression weights
-    double_matrix_t *gradient;          // Gradient matrix to be reused
-    khash_t(int_set) *unique_columns;   // Unique columns set
-    uint32_array *batch_columns;        // Unique columns as array
-    uint32_array *last_updated;         // Array of length N indicating the last time each feature was updated
-    double lambda;                      // Regularization weight
-    uint32_t iters;                     // Number of iterations, used to decay learning rate
-    uint32_t epochs;                    // Number of epochs
-    double gamma_0;                     // Initial learning rate
+    trie_t *feature_ids;                    // Trie mapping features to array indices
+    size_t num_features;                    // Number of features
+    khash_t(str_uint32) *label_ids;         // Hashtable mapping labels to array indices
+    size_t num_labels;                      // Number of labels
+    double_matrix_t *gradient;              // Gradient matrix to be reused
+    khash_t(int_uint32) *unique_columns;    // Unique columns set
+    uint32_array *batch_columns;            // Unique columns as array
+    double_matrix_t *batch_weights;         // Weights updated in this batch
+    uint32_t epochs;                        // Number of epochs
+    logistic_regression_optimizer_type optimizer_type;  // Which optimizer the union holds
+    union {
+        sgd_trainer_t *sgd;                 // Stochastic (ok, minibatch) gradient descent
+        ftrl_trainer_t *ftrl;               // Follow-the-regularized-leader (FTRL) Proximal
+    } optimizer;
 } logistic_regression_trainer_t;
 
+logistic_regression_trainer_t *logistic_regression_trainer_init_sgd(trie_t *feature_ids, khash_t(str_uint32) *label_ids, bool fit_intercept, regularization_type_t reg_type, double lambda, double gamma_0);
+logistic_regression_trainer_t *logistic_regression_trainer_init_ftrl(trie_t *feature_ids, khash_t(str_uint32) *label_ids, double lambda1, double lambda2, double alpha, double beta);
+bool logistic_regression_trainer_reset_params_sgd(logistic_regression_trainer_t *self, double lambda, double gamma_0);
+bool logistic_regression_trainer_reset_params_ftrl(logistic_regression_trainer_t *self, double alpha, double beta, double lambda1, double lambda2);
+bool logistic_regression_trainer_train_minibatch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
+double logistic_regression_trainer_minibatch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
 
-logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_ids, khash_t(str_uint32) *label_ids, double gamma_0, double lambda);
-
-bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
-double logistic_regression_trainer_batch_cost(logistic_regression_trainer_t *self, feature_count_array *features, cstring_array *labels);
-bool logistic_regression_trainer_finalize(logistic_regression_trainer_t *self);
+double_matrix_t *logistic_regression_trainer_get_weights(logistic_regression_trainer_t *self);
+double_matrix_t *logistic_regression_trainer_get_regularized_weights(logistic_regression_trainer_t *self);
+double_matrix_t *logistic_regression_trainer_final_weights(logistic_regression_trainer_t *self);
+sparse_matrix_t *logistic_regression_trainer_final_weights_sparse(logistic_regression_trainer_t *self);
 
 void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self);
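
Usage sketch (review note, not part of the patch): how the two constructors
added here are meant to be driven. Everything below uses only functions from
this diff, except REGULARIZATION_L2, which is assumed to be one of the
regularization_type_t constants, and the feature/label batch arguments, which
stand in for whatever minibatch preparation the caller already does. Treat it
as a sketch under those assumptions, not a definitive example.

#include "logistic_regression_trainer.h"

// Hypothetical driver: feature_ids/label_ids and the per-minibatch
// features/labels are built elsewhere by the caller.
static bool train_example(trie_t *feature_ids, khash_t(str_uint32) *label_ids,
                          feature_count_array *features, cstring_array *labels) {
    // SGD variant: fit an intercept, L2 penalty (assumed constant name),
    // lambda = 1e-4, initial learning rate gamma_0 = 0.1 (arbitrary values).
    logistic_regression_trainer_t *trainer = logistic_regression_trainer_init_sgd(
        feature_ids, label_ids, true, REGULARIZATION_L2, 1e-4, 0.1);
    // The FTRL-Proximal variant would instead be:
    //   logistic_regression_trainer_init_ftrl(feature_ids, label_ids,
    //       /*lambda1=*/1.0, /*lambda2=*/1.0, /*alpha=*/0.1, /*beta=*/1.0);
    if (trainer == NULL) return false;

    // One update per minibatch; the trainer resizes its gradient and
    // batch-weights matrices to the batch's unique feature columns.
    if (!logistic_regression_trainer_train_minibatch(trainer, features, labels)) {
        logistic_regression_trainer_destroy(trainer);
        return false;
    }

    // Optional: regularized cost on the same minibatch (-1.0 signals an error).
    double cost = logistic_regression_trainer_minibatch_cost(trainer, features, labels);
    log_info("minibatch cost = %f\n", cost);

    // Take ownership of the final weights in sparse form, then tear down.
    sparse_matrix_t *w = logistic_regression_trainer_final_weights_sparse(trainer);
    logistic_regression_trainer_destroy(trainer);
    return w != NULL;
}

The sparse finalizer is the natural choice after L1-regularized SGD or FTRL
training, since both drive many feature weights to exactly zero.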