[optimization] Adding learning rate to lazy sparse update in stochastic gradient descent

Al
2016-01-12 11:02:12 -05:00
parent 79f2b7c192
commit 622dc354e7
3 changed files with 17 additions and 15 deletions
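The gist of the change: the lazy L2 decay in regularize_row now folds in the base learning rate gamma_0, so a weight row that has gone untouched for k = t - last_updated iterations is multiplied by exp(-gamma_0 * lambda * k) rather than exp(-lambda * k). A minimal standalone sketch of that idea follows (illustration only, not code from this commit; the local gamma_t helper simply mirrors the stochastic_gradient_descent_gamma_t schedule declared in the header). It checks that one closed-form catch-up decay agrees, up to floating-point rounding, with applying the per-iteration decay k times.

/*
 * Standalone sketch of the lazy L2 decay with the base learning rate
 * gamma_0 folded in: a row skipped for k iterations is shrunk once by
 * exp(-gamma_0 * lambda * k), which equals k successive per-iteration
 * shrinks by exp(-gamma_0 * lambda). Build with: cc sketch.c -lm
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* same schedule as stochastic_gradient_descent_gamma_t in the diff below */
static double gamma_t(double gamma_0, double lambda, uint32_t t) {
    return gamma_0 / (1.0 + lambda * gamma_0 * (double)t);
}

int main(void) {
    double gamma_0 = 0.1, lambda = 0.01;
    uint32_t last_updated = 0, t = 1000;

    /* eager: decay the weight at every one of the k skipped iterations */
    double w_eager = 1.0;
    for (uint32_t i = last_updated; i < t; i++) {
        w_eager *= exp(-gamma_0 * lambda);
    }

    /* lazy: apply the same total decay in a single closed-form step */
    double w_lazy = exp(-gamma_0 * lambda * (double)(t - last_updated));

    printf("eager %.12f vs lazy %.12f, gamma_t(%u) = %.4f\n",
           w_eager, w_lazy, (unsigned)t, gamma_t(gamma_0, lambda, t));
    return 0;
}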

View File

@@ -118,7 +118,7 @@ bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self
         goto exit_matrices_created;
     }
 
-    if (self->lambda > 0.0 && !stochastic_gradient_descent_sparse_regularize_weights(self->weights, self->batch_columns, self->last_updated, self->iters, self->lambda)) {
+    if (self->lambda > 0.0 && !stochastic_gradient_descent_sparse_regularize_weights(self->weights, self->batch_columns, self->last_updated, self->iters, self->lambda, self->gamma_0)) {
         log_error("Error regularizing weights\n");
         goto exit_matrices_created;
     }
@@ -145,7 +145,7 @@ bool logistic_regression_trainer_finalize(logistic_regression_trainer_t *self) {
     if (self == NULL) return false;
 
     if (self->lambda > 0.0) {
-        return stochastic_gradient_descent_sparse_finalize_weights(self->weights, self->last_updated, self->iters, self->lambda);
+        return stochastic_gradient_descent_sparse_finalize_weights(self->weights, self->last_updated, self->iters, self->lambda, self->gamma_0);
     }
     return true;

View File

@@ -79,13 +79,18 @@ twice (three times counting the finalization step), while still getting roughly
 results as though we had done the per-iteration weight updates.
 */
 
-static inline void regularize_row(double *theta_i, size_t n, double lambda, uint32_t last_updated, uint32_t t) {
+inline double stochastic_gradient_descent_gamma_t(double gamma_0, double lambda, uint32_t t) {
+    return gamma_0 / (1.0 + lambda * gamma_0 * (double)t);
+}
+
+static inline void regularize_row(double *theta_i, size_t n, double lambda, uint32_t last_updated, uint32_t t, double gamma) {
     uint32_t timesteps = t - last_updated;
-    double update = exp(-lambda * timesteps);
+    double update = exp(gamma * -lambda * timesteps);
     double_array_mul(theta_i, update, n);
 }
 
-bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda) {
+bool stochastic_gradient_descent_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0) {
     if (lambda > 0.0) {
         uint32_t *updates = last_updated->a;
@@ -98,7 +103,7 @@ bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint
             uint32_t row = rows[i];
             double *theta_i = matrix_get_row(theta, row);
             uint32_t last_updated = updates[row];
-            regularize_row(theta_i, n, lambda, last_updated, t);
+            regularize_row(theta_i, n, lambda, last_updated, t, gamma_0);
             updates[row] = t;
         }
@@ -107,7 +112,7 @@ bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint
     return true;
 }
 
-inline bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda) {
+inline bool stochastic_gradient_descent_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0) {
     if (lambda > 0.0) {
         uint32_t *updates = last_updated->a;
         size_t m = theta->m;
@@ -116,14 +121,10 @@ inline bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta,
         for (size_t i = 0; i < m; i++) {
             double *theta_i = matrix_get_row(theta, i);
             uint32_t last_updated = updates[i];
-            regularize_row(theta_i, n, lambda, last_updated, t);
+            regularize_row(theta_i, n, lambda, last_updated, t, gamma_0);
             updates[i] = t;
         }
     }
     return true;
 }
-
-inline double stochastic_gradient_descent_gamma_t(double gamma_0, double lambda, uint32_t t) {
-    return gamma_0 / (1.0 + lambda * gamma_0 * (double)t);
-}
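The same per-row bookkeeping, reduced to a standalone sketch with plain arrays instead of the library's matrix_t and uint32_array types (lazy_regularize_rows is a hypothetical name used only for illustration, not a libpostal function): each row records the iteration at which it was last regularized, and the whole skipped interval is caught up with a single multiplication when the row is next touched.

/*
 * Simplified, self-contained version of the lazy per-row regularization
 * pattern above. Build with: cc lazy.c -lm
 */
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum { ROWS = 4, COLS = 3 };

static void lazy_regularize_rows(double theta[ROWS][COLS], uint32_t last_updated[ROWS],
                                 const uint32_t *touched, size_t n_touched,
                                 uint32_t t, double lambda, double gamma_0) {
    for (size_t i = 0; i < n_touched; i++) {
        uint32_t row = touched[i];
        /* one multiplication covers every iteration since the row was last seen */
        double decay = exp(-gamma_0 * lambda * (double)(t - last_updated[row]));
        for (size_t j = 0; j < COLS; j++) {
            theta[row][j] *= decay;
        }
        last_updated[row] = t;   /* row is now current as of iteration t */
    }
}

int main(void) {
    double theta[ROWS][COLS] = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
    uint32_t last_updated[ROWS] = {0, 0, 0, 0};
    uint32_t touched[] = {0, 2};   /* only the rows present in this batch */

    lazy_regularize_rows(theta, last_updated, touched, 2, 10, 0.01, 0.1);
    printf("row 0 decayed to %.6f; row 1 left untouched at %.6f\n",
           theta[0][0], theta[1][0]);
    return 0;
}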

View File

@@ -18,8 +18,9 @@ gamma_t = gamma_0(1 + gamma_0 * lambda * t)^-1
 bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gamma);
 bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma);
-bool stochastic_gradient_descent_sparse_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda);
-bool stochastic_gradient_descent_sparse_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda);
+bool stochastic_gradient_descent_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0);
+bool stochastic_gradient_descent_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0);
 
 double stochastic_gradient_descent_gamma_t(double gamma_0, double lambda, uint32_t t);
 
 #endif
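A quick numeric instance of the gamma_t schedule quoted in the hunk header above (illustrative values, not defaults taken from the code): with gamma_0 = 0.1 and lambda = 0.01, the step size at t = 1000 is gamma_t = 0.1 / (1 + 0.1 * 0.01 * 1000) = 0.05, i.e. it has halved after a thousand iterations.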