[classification] removing regularization update from gradient computation in logistic regression, as that's now handled by the optimizer

2017-04-02 14:32:14 -04:00
parent 64c049730a
commit a4431dbb27
2 changed files with 52 additions and 107 deletions
--- a/src/logistic_regression.c
+++ b/src/logistic_regression.c
@@ -4,12 +4,16 @@
 #include "logistic.h"
 #include "file_utils.h"

-#define NEAR_ZERO_WEIGHT 1e-6
+bool logistic_regression_model_expectation_sparse(sparse_matrix_t *theta, sparse_matrix_t *x, double_matrix_t *p_y) {
+    if (theta == NULL || x == NULL || p_y == NULL) {
+        if (theta == NULL) log_error("theta = NULL\n");
+        if (x == NULL) log_error("x = NULL\n");
+        if (p_y == NULL) log_error("p_y = NULL\n");
+        return false;
+    }

-bool logistic_regression_model_expectation(double_matrix_t *theta, sparse_matrix_t *x, double_matrix_t *p_y) {
-    if (theta == NULL || x == NULL || p_y == NULL) return false;
-
-    if (sparse_matrix_dot_dense(x, theta, p_y) != 0) {
+    if (sparse_matrix_dot_sparse(x, theta, p_y) != 0) {
+        log_error("x->m = %zu, x->n = %zu, theta->m = %zu, theta->n = %zu, p_y->m = %zu, p_y->n = %zu\n", x->m, x->n, theta->m, theta->n, p_y->m, p_y->n);
        return false;
    }

@@ -18,16 +22,41 @@ bool logistic_regression_model_expectation(double_matrix_t *theta, sparse_matrix
    return true;
 }

-double logistic_regression_cost_function(double_matrix_t *theta, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, double lambda) {
+bool logistic_regression_model_expectation(double_matrix_t *theta, sparse_matrix_t *x, double_matrix_t *p_y) {
+    if (theta == NULL || x == NULL || p_y == NULL) {
+        if (theta == NULL) log_error("theta = NULL\n");
+        if (x == NULL) log_error("x = NULL\n");
+        if (p_y == NULL) log_error("p_y = NULL\n");
+        return false;
+    }
+
+    if (sparse_matrix_dot_dense(x, theta, p_y) != 0) {
+        log_error("x->m = %zu, x->n = %zu, theta->m = %zu, theta->n = %zu, p_y->m = %zu, p_y->n = %zu\n", x->m, x->n, theta->m, theta->n, p_y->m, p_y->n);
+        return false;
+    }
+
+    softmax_matrix(p_y);
+
+    return true;
+}
+
+double logistic_regression_cost_function(double_matrix_t *theta, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y) {
    size_t m = x->m;
    size_t n = x->n;
-    if (m != y->n) return -1.0;
-
-    if (!double_matrix_resize(p_y, x->m, theta->n)) {
+    if (m != y->n) {
+        log_error("m = %zu, y->n = %zu\n", m, y->n);
        return -1.0;
    }

+    if (!double_matrix_resize_aligned(p_y, x->m, theta->n, 16)) {
+        log_error("resize_aligned failed\n");
+        return -1.0;
+    }
+
+    double_matrix_zero(p_y);
+
    if (!logistic_regression_model_expectation(theta, x, p_y)) {
+        log_error("model expectation failed\n");
        return -1.0;
    }

@@ -37,39 +66,23 @@ double logistic_regression_cost_function(double_matrix_t *theta, sparse_matrix_t
    for (size_t i = 0; i < p_y->m; i++) {
        uint32_t y_i = y->a[i];
        double value = double_matrix_get(p_y, i, y_i);
-        if (value > NEAR_ZERO_WEIGHT) {
-            cost += log(value);
-        }
+        cost += log(value);
    }

    cost *= -(1.0 / m);

-    if (lambda > 0.0) {
-        double reg_cost = 0.0;
-        for (size_t i = 1; i < theta->m; i++) {
-            for (size_t j = 0; j < theta->n; j++) {
-                double theta_ij = double_matrix_get(theta, i, j);
-                reg_cost += theta_ij * theta_ij;
-            }
-
-        }
-
-        cost += reg_cost * (lambda / 2.0);
-    }
-
    return cost;
-
 }

-static bool logistic_regression_gradient_params(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, 
-                                                uint32_array *x_cols, double lambda) {
+bool logistic_regression_gradient(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y) {
    size_t m = x->m;
    size_t n = x->n;
-    if (m != y->n) return false;
+    if (m != y->n || theta->m != gradient->m || theta->n != gradient->n) return false;

-    if (!double_matrix_resize(p_y, x->m, theta->n) || !double_matrix_resize(p_y, x->m, theta->n)) {
+    if (!double_matrix_resize_aligned(p_y, x->m, theta->n, 16)) {
        return false;
    }
+    double_matrix_zero(p_y);

    if (!logistic_regression_model_expectation(theta, x, p_y)) {
        return false;
@@ -85,93 +98,26 @@ static bool logistic_regression_gradient_params(double_matrix_t *theta, double_m
    uint32_t col;
    double data;

-
-    bool regularize = lambda > 0.0;
+    double_matrix_zero(gradient);

    double *theta_values = theta->values;
    double *predicted_values = p_y->values;
    double *gradient_values = gradient->values;

-    // Zero the relevant rows of the gradient
-    if (x_cols != NULL) {
-        double *gradient_i;
-
-        size_t batch_rows = x_cols->n;
-        uint32_t *cols = x_cols->a;
-        for (i = 0; i < batch_rows; i++) {
-            col = cols[i];
-            gradient_i = double_matrix_get_row(gradient, col);
-            double_array_zero(gradient_i, num_classes);
-        }
-    } else {
-        double_matrix_zero(gradient);
-    }
-
-    // gradient = -(1. / m) * x.T.dot(y - p_y) + lambda * theta
+    // gradient = -(1. / m) * x.T.dot(y - p_y)

    sparse_matrix_foreach(x, row, col, data, {
        uint32_t y_i = y->a[row];
        for (j = 0; j < num_classes; j++) {
-
-            double class_prob = predicted_values[row * num_classes + j];
+            double class_prob = double_matrix_get(p_y, row, j);
            double residual = (y_i == j ? 1.0 : 0.0) - class_prob;
-            gradient_values[col * num_classes + j] += data * residual;
+            double gradient_update = data * residual;
+            double_matrix_add_scalar(gradient, col, j, gradient_update);
        }
    })

    double scale = -1.0 / m;
-
-    // Scale the vector by -1 / m using only the unique columns in X 
-    // Useful for stochastic and minibatch gradients
-    if (x_cols != NULL) {
-        size_t batch_rows = x_cols->n;
-        uint32_t *cols = x_cols->a;
-        for (i = 0; i < batch_rows; i++) {
-            col = cols[i];
-            for (j = 0; j < num_classes; j++) {
-                gradient_values[col * num_classes + j] *= scale;
-            }
-        }
-    } else {
-        double_matrix_mul(gradient, scale);
-    }
-
-
-    // Update the only the relevant columns in x
-    if (regularize) {
-        size_t num_rows = num_features;
-        uint32_t *cols = NULL;
-
-        if (x_cols != NULL) {
-            cols = x_cols->a;
-            num_rows = x_cols->n;
-        }
-
-        for (i = 0; i < num_rows; i++) {
-            col = x_cols != NULL ? cols[i] : i;
-
-            for (j = 0; j < num_classes; j++) {
-                size_t idx = col * num_classes + j;
-                double theta_ij = theta_values[idx];
-                double reg_update = theta_ij * lambda;
-                double current_value = gradient_values[idx];
-                double updated_value = current_value + reg_update;
-                if ((updated_value > 0) == (current_value > 0)) {
-                    gradient_values[idx] = updated_value;
-                }
-            }
-        }
-    }
+    double_matrix_mul(gradient, scale);

    return true;
 }
-
-inline bool logistic_regression_gradient_sparse(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, 
-                                                uint32_array *x_cols, double lambda) {
-    return logistic_regression_gradient_params(theta, gradient, x, y, p_y, x_cols, lambda);
-}
-
-
-inline bool logistic_regression_gradient(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, double lambda) {
-    return logistic_regression_gradient_params(theta, gradient, x, y, p_y, NULL, lambda);
-}
--- a/src/logistic_regression.h
+++ b/src/logistic_regression.h
@@ -27,9 +27,8 @@ may be called softmax regression.
 #include "sparse_matrix.h"

 bool logistic_regression_model_expectation(double_matrix_t *theta, sparse_matrix_t *x, double_matrix_t *p_y);
-double logistic_regression_cost_function(double_matrix_t *theta, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, double lambda);
-bool logistic_regression_gradient(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, double lambda);
-bool logistic_regression_gradient_sparse(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, 
-                                         uint32_array *x_cols, double lambda);
+bool logistic_regression_model_expectation_sparse(sparse_matrix_t *theta, sparse_matrix_t *x, double_matrix_t *p_y);
+double logistic_regression_cost_function(double_matrix_t *theta, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y);
+bool logistic_regression_gradient(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y);

 #endif