From 46cd725c13e385449b4bcf40b94dd20f7db2c3ca Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 6 Aug 2016 00:40:01 -0400 Subject: [PATCH] [math] Generic dense matrix implementation using BLAS calls for matrix-matrix multiplication if available --- src/file_utils.c | 42 ++++ src/file_utils.h | 5 + src/language_classifier.c | 13 +- src/language_classifier.h | 2 +- src/language_classifier_train.c | 10 +- src/logistic.c | 2 +- src/logistic.h | 2 +- src/logistic_regression.c | 24 +- src/logistic_regression.h | 8 +- src/logistic_regression_trainer.c | 24 +- src/logistic_regression_trainer.h | 4 +- src/matrix.c | 320 ------------------------ src/matrix.h | 402 ++++++++++++++++++++++++++---- src/sparse_matrix.c | 4 +- src/sparse_matrix.h | 4 +- src/sparse_matrix_utils.c | 4 +- src/sparse_matrix_utils.h | 2 +- src/stochastic_gradient_descent.c | 14 +- src/stochastic_gradient_descent.h | 8 +- 19 files changed, 469 insertions(+), 425 deletions(-) delete mode 100644 src/matrix.c diff --git a/src/file_utils.c b/src/file_utils.c index 1747bbd8..dfd3c814 100644 --- a/src/file_utils.c +++ b/src/file_utils.c @@ -132,6 +132,48 @@ bool file_write_double(FILE *file, double value) { return file_write_uint64(file, ud.u); } +typedef union { + uint32_t u; + float f; +} uint32_float_t; + +bool file_read_float(FILE *file, float *value) { + uint32_float_t uf; + + if (!file_read_uint32(file, &uf.u)) { + return false; + } + *value = uf.f; + return true; +} + +bool file_read_float_array(FILE *file, float *value, size_t n) { + unsigned char *buf = malloc(n * sizeof(uint32_t)); + + if (buf == NULL) return false; + + bool ret = false; + + if (fread(buf, sizeof(uint32_t), n, file) == n) { + uint32_float_t uf; + + for (size_t i = 0, byte_offset = 0; i < n; i++, byte_offset += sizeof(uint32_t)) { + unsigned char *ptr = buf + byte_offset; + uf.u = file_deserialize_uint32(ptr); + value[i] = uf.f; + } + ret = true; + } + free(buf); + return ret; +} + +bool file_write_float(FILE *file, float value) { + uint32_float_t uf; + uf.f = value; + return file_write_uint32(file, uf.u); + +} inline uint32_t file_deserialize_uint32(unsigned char *buf) { return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; diff --git a/src/file_utils.h b/src/file_utils.h index 13fa1e39..cb2004ac 100644 --- a/src/file_utils.h +++ b/src/file_utils.h @@ -61,6 +61,11 @@ bool file_write_uint64(FILE *file, uint64_t value); bool file_read_uint64_array(FILE *file, uint64_t *value, size_t n); +bool file_read_float(FILE *file, float *value); +bool file_write_float(FILE *file, float value); + +bool file_read_float_array(FILE *file, float *value, size_t n); + bool file_read_double(FILE *file, double *value); bool file_write_double(FILE *file, double value); diff --git a/src/language_classifier.c b/src/language_classifier.c index e3417828..35da504f 100644 --- a/src/language_classifier.c +++ b/src/language_classifier.c @@ -28,7 +28,7 @@ void language_classifier_destroy(language_classifier_t *self) { } if (self->weights != NULL) { - matrix_destroy(self->weights); + double_matrix_destroy(self->weights); } free(self); @@ -83,11 +83,11 @@ language_classifier_response_t *classify_languages(char *address) { sparse_matrix_t *x = feature_vector(classifier->features, feature_counts); size_t n = classifier->num_labels; - matrix_t *p_y = matrix_new_zeros(1, n); + double_matrix_t *p_y = double_matrix_new_zeros(1, n); language_classifier_response_t *response = NULL; if (logistic_regression_model_expectation(classifier->weights, x, p_y)) { - double *predictions = 
matrix_get_row(p_y, 0); + double *predictions = double_matrix_get_row(p_y, 0); size_t *indices = double_array_argsort(predictions, n); size_t num_languages = 0; size_t i; @@ -126,7 +126,7 @@ language_classifier_response_t *classify_languages(char *address) { } sparse_matrix_destroy(x); - matrix_destroy(p_y); + double_matrix_destroy(p_y); token_array_destroy(tokens); char_array_destroy(feature_array); const char *key; @@ -190,7 +190,7 @@ language_classifier_t *language_classifier_read(FILE *f) { } classifier->num_labels = cstring_array_num_strings(classifier->labels); - matrix_t *weights = matrix_read(f); + double_matrix_t *weights = double_matrix_read(f); if (weights == NULL) { goto exit_classifier_created; @@ -228,7 +228,7 @@ bool language_classifier_write(language_classifier_t *self, FILE *f) { !file_write_uint64(f, self->num_features) || !file_write_uint64(f, self->labels->str->n) || !file_write_chars(f, (const char *)self->labels->str->a, self->labels->str->n) || - !matrix_write(self->weights, f)) { + !double_matrix_write(self->weights, f)) { return false; } @@ -277,5 +277,6 @@ void language_classifier_module_teardown(void) { if (language_classifier != NULL) { language_classifier_destroy(language_classifier); } + language_classifier = NULL; } diff --git a/src/language_classifier.h b/src/language_classifier.h index 424342c9..f96d9920 100644 --- a/src/language_classifier.h +++ b/src/language_classifier.h @@ -21,7 +21,7 @@ typedef struct language_classifier { size_t num_features; trie_t *features; cstring_array *labels; - matrix_t *weights; + double_matrix_t *weights; } language_classifier_t; diff --git a/src/language_classifier_train.c b/src/language_classifier_train.c index 06095276..0fa33a99 100644 --- a/src/language_classifier_train.c +++ b/src/language_classifier_train.c @@ -13,7 +13,7 @@ #include "sparse_matrix.h" #include "sparse_matrix_utils.h" -#define LANGUAGE_CLASSIFIER_FEATURE_COUNT_THRESHOLD 1.0 +#define LANGUAGE_CLASSIFIER_FEATURE_COUNT_THRESHOLD 5.0 #define LANGUAGE_CLASSIFIER_LABEL_COUNT_THRESHOLD 100 #define LOG_BATCH_INTERVAL 10 @@ -134,13 +134,13 @@ double compute_cv_accuracy(logistic_regression_trainer_t *trainer, char *filenam uint32_t correct = 0; uint32_t total = 0; - matrix_t *p_y = matrix_new_zeros(LANGUAGE_CLASSIFIER_DEFAULT_BATCH_SIZE, trainer->num_labels); + double_matrix_t *p_y = double_matrix_new_zeros(LANGUAGE_CLASSIFIER_DEFAULT_BATCH_SIZE, trainer->num_labels); while ((minibatch = language_classifier_data_set_get_minibatch(data_set, trainer->label_ids)) != NULL) { sparse_matrix_t *x = feature_matrix(trainer->feature_ids, minibatch->features); uint32_array *y = label_vector(trainer->label_ids, minibatch->labels); - matrix_resize(p_y, x->m, trainer->num_labels); + double_matrix_resize(p_y, x->m, trainer->num_labels); if (!logistic_regression_model_expectation(trainer->weights, x, p_y)) { log_error("Predict cv batch failed\n"); @@ -149,7 +149,7 @@ double compute_cv_accuracy(logistic_regression_trainer_t *trainer, char *filenam double *row; for (size_t i = 0; i < p_y->m; i++) { - row = matrix_get_row(p_y, i); + row = double_matrix_get_row(p_y, i); int64_t predicted = double_array_argmax(row, p_y->n); if (predicted < 0) { @@ -171,7 +171,7 @@ double compute_cv_accuracy(logistic_regression_trainer_t *trainer, char *filenam } language_classifier_data_set_destroy(data_set); - matrix_destroy(p_y); + double_matrix_destroy(p_y); double accuracy = (double)correct / total; return accuracy; diff --git a/src/logistic.c b/src/logistic.c index 223b15bc..b664326c 100644 --- 
a/src/logistic.c +++ b/src/logistic.c @@ -23,7 +23,7 @@ inline void softmax_vector(double *x, size_t n) { } -void softmax_matrix(matrix_t *matrix) { +void softmax_matrix(double_matrix_t *matrix) { size_t num_rows = matrix->m; size_t num_cols = matrix->n; diff --git a/src/logistic.h b/src/logistic.h index ddb7de16..e8ffd09e 100644 --- a/src/logistic.h +++ b/src/logistic.h @@ -9,6 +9,6 @@ double sigmoid(double x); void sigmoid_vector(double *x, size_t n); void softmax_vector(double *x, size_t n); -void softmax_matrix(matrix_t *matrix); +void softmax_matrix(double_matrix_t *matrix); #endif \ No newline at end of file diff --git a/src/logistic_regression.c b/src/logistic_regression.c index 8d3fb32d..c5c1e028 100644 --- a/src/logistic_regression.c +++ b/src/logistic_regression.c @@ -6,7 +6,7 @@ #define NEAR_ZERO_WEIGHT 1e-6 -bool logistic_regression_model_expectation(matrix_t *theta, sparse_matrix_t *x, matrix_t *p_y) { +bool logistic_regression_model_expectation(double_matrix_t *theta, sparse_matrix_t *x, double_matrix_t *p_y) { if (theta == NULL || x == NULL || p_y == NULL) return false; if (sparse_matrix_dot_dense(x, theta, p_y) != 0) { return false; } @@ -18,12 +18,12 @@ bool logistic_regression_model_expectation(matrix_t *theta, sparse_matrix_t *x, return true; } -double logistic_regression_cost_function(matrix_t *theta, sparse_matrix_t *x, uint32_array *y, matrix_t *p_y, double lambda) { +double logistic_regression_cost_function(double_matrix_t *theta, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, double lambda) { size_t m = x->m; size_t n = x->n; if (m != y->n) return -1.0; - if (!matrix_resize(p_y, x->m, theta->n)) { + if (!double_matrix_resize(p_y, x->m, theta->n)) { return -1.0; } @@ -36,7 +36,7 @@ double logistic_regression_cost_function(matrix_t *theta, sparse_matrix_t *x, ui for (size_t i = 0; i < p_y->m; i++) { uint32_t y_i = y->a[i]; - double value = matrix_get(p_y, i, y_i); + double value = double_matrix_get(p_y, i, y_i); if (value > NEAR_ZERO_WEIGHT) { cost += log(value); } @@ -48,7 +48,7 @@ double logistic_regression_cost_function(matrix_t *theta, sparse_matrix_t *x, ui double reg_cost = 0.0; for (size_t i = 1; i < theta->m; i++) { for (size_t j = 0; j < theta->n; j++) { - double theta_ij = matrix_get(theta, i, j); + double theta_ij = double_matrix_get(theta, i, j); reg_cost += theta_ij * theta_ij; } @@ -61,13 +61,13 @@ double logistic_regression_cost_function(matrix_t *theta, sparse_matrix_t *x, ui } -static bool logistic_regression_gradient_params(matrix_t *theta, matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, matrix_t *p_y, uint32_array *x_cols, double lambda) { +static bool logistic_regression_gradient_params(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, uint32_array *x_cols, double lambda) { size_t m = x->m; size_t n = x->n; if (m != y->n) return false; - if (!matrix_resize(p_y, x->m, theta->n) || !matrix_resize(p_y, x->m, theta->n)) { + if (!double_matrix_resize(p_y, x->m, theta->n)) { return false; } @@ -100,11 +100,11 @@ static bool logistic_regression_gradient_params(matrix_t *theta, matrix_t *gradi uint32_t *cols = x_cols->a; for (i = 0; i < batch_rows; i++) { col = cols[i]; - gradient_i = matrix_get_row(gradient, col); + gradient_i = double_matrix_get_row(gradient, col); double_array_zero(gradient_i, num_classes); } } else { - matrix_zero(gradient); + double_matrix_zero(gradient); } // gradient = -(1.
/ m) * x.T.dot(y - p_y) + lambda * theta @@ -133,7 +133,7 @@ static bool logistic_regression_gradient_params(matrix_t *theta, matrix_t *gradi } } } else { - matrix_mul(gradient, scale); + double_matrix_mul(gradient, scale); } @@ -166,12 +166,12 @@ static bool logistic_regression_gradient_params(matrix_t *theta, matrix_t *gradi return true; } -inline bool logistic_regression_gradient_sparse(matrix_t *theta, matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, matrix_t *p_y, +inline bool logistic_regression_gradient_sparse(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, uint32_array *x_cols, double lambda) { return logistic_regression_gradient_params(theta, gradient, x, y, p_y, x_cols, lambda); } -inline bool logistic_regression_gradient(matrix_t *theta, matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, matrix_t *p_y, double lambda) { +inline bool logistic_regression_gradient(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, double lambda) { return logistic_regression_gradient_params(theta, gradient, x, y, p_y, NULL, lambda); } diff --git a/src/logistic_regression.h b/src/logistic_regression.h index d6f99fac..7d798e45 100644 --- a/src/logistic_regression.h +++ b/src/logistic_regression.h @@ -26,10 +26,10 @@ may be called softmax regression. #include "matrix.h" #include "sparse_matrix.h" -bool logistic_regression_model_expectation(matrix_t *theta, sparse_matrix_t *x, matrix_t *p_y); -double logistic_regression_cost_function(matrix_t *theta, sparse_matrix_t *x, uint32_array *y, matrix_t *p_y, double lambda); -bool logistic_regression_gradient(matrix_t *theta, matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, matrix_t *p_y, double lambda); -bool logistic_regression_gradient_sparse(matrix_t *theta, matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, matrix_t *p_y, +bool logistic_regression_model_expectation(double_matrix_t *theta, sparse_matrix_t *x, double_matrix_t *p_y); +double logistic_regression_cost_function(double_matrix_t *theta, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, double lambda); +bool logistic_regression_gradient(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, double lambda); +bool logistic_regression_gradient_sparse(double_matrix_t *theta, double_matrix_t *gradient, sparse_matrix_t *x, uint32_array *y, double_matrix_t *p_y, uint32_array *x_cols, double lambda); #endif diff --git a/src/logistic_regression_trainer.c b/src/logistic_regression_trainer.c index b2df4533..f2be12f0 100644 --- a/src/logistic_regression_trainer.c +++ b/src/logistic_regression_trainer.c @@ -13,7 +13,7 @@ void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) { } if (self->weights != NULL) { - matrix_destroy(self->weights); + double_matrix_destroy(self->weights); } if (self->last_updated != NULL) { @@ -29,7 +29,7 @@ void logistic_regression_trainer_destroy(logistic_regression_trainer_t *self) { } if (self->gradient != NULL) { - matrix_destroy(self->gradient); + double_matrix_destroy(self->gradient); } free(self); @@ -48,9 +48,9 @@ logistic_regression_trainer_t *logistic_regression_trainer_init(trie_t *feature_ trainer->label_ids = label_ids; trainer->num_labels = kh_size(label_ids); - trainer->weights = matrix_new_zeros(trainer->num_features, trainer->num_labels); + trainer->weights = double_matrix_new_zeros(trainer->num_features, trainer->num_labels); - 
trainer->gradient = matrix_new_zeros(trainer->num_features, trainer->num_labels); + trainer->gradient = double_matrix_new_zeros(trainer->num_features, trainer->num_labels); trainer->unique_columns = kh_init(int_set); trainer->batch_columns = uint32_array_new_size(trainer->num_features); @@ -70,14 +70,14 @@ exit_trainer_created: } -static matrix_t *model_expectation(sparse_matrix_t *x, matrix_t *theta) { - matrix_t *p_y = matrix_new_zeros(x->m, theta->n); +static double_matrix_t *model_expectation(sparse_matrix_t *x, double_matrix_t *theta) { + double_matrix_t *p_y = double_matrix_new_zeros(x->m, theta->n); if (p_y == NULL) return NULL; if(logistic_regression_model_expectation(theta, x, p_y)) { return p_y; } else { - matrix_destroy(p_y); + double_matrix_destroy(p_y); return NULL; } } @@ -88,11 +88,11 @@ double logistic_regression_trainer_batch_cost(logistic_regression_trainer_t *sel sparse_matrix_t *x = feature_matrix(self->feature_ids, features); uint32_array *y = label_vector(self->label_ids, labels); - matrix_t *p_y = matrix_new_zeros(x->m, n); + double_matrix_t *p_y = double_matrix_new_zeros(x->m, n); double cost = logistic_regression_cost_function(self->weights, x, y, p_y, self->lambda); - matrix_destroy(p_y); + double_matrix_destroy(p_y); uint32_array_destroy(y); sparse_matrix_destroy(x); return cost; @@ -103,12 +103,12 @@ bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self size_t n = self->weights->n; // Optimize - matrix_t *gradient = self->gradient; + double_matrix_t *gradient = self->gradient; sparse_matrix_t *x = feature_matrix(self->feature_ids, features); uint32_array *y = label_vector(self->label_ids, labels); - matrix_t *p_y = matrix_new_zeros(x->m, n); + double_matrix_t *p_y = double_matrix_new_zeros(x->m, n); bool ret = false; @@ -135,7 +135,7 @@ bool logistic_regression_trainer_train_batch(logistic_regression_trainer_t *self self->iters++; exit_matrices_created: - matrix_destroy(p_y); + double_matrix_destroy(p_y); uint32_array_destroy(y); sparse_matrix_destroy(x); return ret; diff --git a/src/logistic_regression_trainer.h b/src/logistic_regression_trainer.h index bc647ad7..fc7f0a8e 100644 --- a/src/logistic_regression_trainer.h +++ b/src/logistic_regression_trainer.h @@ -26,8 +26,8 @@ typedef struct logistic_regression_trainer { size_t num_features; // Number of features khash_t(str_uint32) *label_ids; // Hashtable mapping labels to array indices size_t num_labels; // Number of labels - matrix_t *weights; // Matrix of logistic regression weights - matrix_t *gradient; // Gradient matrix to be reused + double_matrix_t *weights; // Matrix of logistic regression weights + double_matrix_t *gradient; // Gradient matrix to be reused khash_t(int_set) *unique_columns; // Unique columns set uint32_array *batch_columns; // Unique columns as array uint32_array *last_updated; // Array of length N indicating the last time each feature was updated diff --git a/src/matrix.c b/src/matrix.c deleted file mode 100644 index 61b3fa25..00000000 --- a/src/matrix.c +++ /dev/null @@ -1,320 +0,0 @@ -#include "matrix.h" -#include "file_utils.h" - -matrix_t *matrix_new(size_t m, size_t n) { - matrix_t *matrix = malloc(sizeof(matrix_t)); - - if (matrix == NULL) { - return NULL; - } - - matrix->m = m; - matrix->n = n; - - matrix->values = malloc(sizeof(double) * m * n); - if (matrix->values == NULL) { - free(matrix); - return NULL; - } - - return matrix; - -} - -bool matrix_resize(matrix_t *self, size_t m, size_t n) { - if (self == NULL) return false; - - // If the new size 
is smaller, don't reallocate, just ignore the extra values - if (m * n > (self->m * self->n)) { - self->values = realloc(self->values, sizeof(double) * m * n); - if (self->values == NULL) { - return false; - } - } - - self->m = m; - self->n = n; - - // Set all values to 0 - matrix_zero(self); - - return true; -} - -matrix_t *matrix_new_copy(matrix_t *self) { - matrix_t *cpy = matrix_new(self->m, self->n); - size_t num_values = self->m * self->n; - memcpy(cpy->values, self->values, num_values); - - return cpy; -} - -bool matrix_copy(matrix_t *self, matrix_t *other) { - if (self->m != other->m || self->n != other->n) { - return false; - } - size_t num_values = self->n * self->n; - - memcpy(other->values, self->values, num_values * sizeof(double)); - return true; -} - -inline void matrix_init_values(matrix_t *self, double *values) { - size_t num_values = self->m * self->n; - memcpy(self->values, values, num_values * sizeof(double)); -} - -inline void matrix_zero(matrix_t *self) { - memset(self->values, 0, self->m * self->n * sizeof(double)); -} - -inline void matrix_set(matrix_t *self, double value) { - double_array_set(self->values, self->m * self->n, value); -} - -inline void matrix_set_row(matrix_t *self, size_t index, double *row) { - size_t offset = index * self->n; - double *values = self->values; - size_t n = self->n; - memcpy(values + offset, row, n * sizeof(double)); -} - -inline void matrix_set_scalar(matrix_t *self, size_t row_index, size_t col_index, double value) { - size_t offset = row_index * self->n + col_index; - self->values[offset] = value; -} - -inline void matrix_add_scalar(matrix_t *self, size_t row_index, size_t col_index, double value) { - size_t offset = row_index * self->n + col_index; - self->values[offset] += value; -} - -inline void matrix_sub_scalar(matrix_t *self, size_t row_index, size_t col_index, double value) { - size_t offset = row_index * self->n + col_index; - self->values[offset] -= value; -} - -inline void matrix_mul_scalar(matrix_t *self, size_t row_index, size_t col_index, double value) { - size_t offset = row_index * self->n + col_index; - self->values[offset] *= value; -} - -inline void matrix_div_scalar(matrix_t *self, size_t row_index, size_t col_index, double value) { - size_t offset = row_index * self->n + col_index; - self->values[offset] /= value; -} - -inline double matrix_get(matrix_t *self, size_t row_index, size_t col_index) { - size_t index = row_index * self->n + col_index; - return self->values[index]; -} - -inline double *matrix_get_row(matrix_t *self, size_t row_index) { - size_t index = row_index * self->n; - return self->values + index; -} - - -inline matrix_t *matrix_new_value(size_t m, size_t n, double value) { - matrix_t *matrix = matrix_new(m, n); - matrix_set(matrix, value); - return matrix; -} - -inline matrix_t *matrix_new_zeros(size_t m, size_t n) { - return matrix_new_value(m, n, 0.0); -} - -inline matrix_t *matrix_new_ones(size_t m, size_t n) { - return matrix_new_value(m, n, 1.0); -} - -matrix_t *matrix_new_values(size_t m, size_t n, double *values) { - matrix_t *matrix = matrix_new(m, n); - memcpy(matrix->values, values, m * n * sizeof(double)); - return matrix; -} - - -inline void matrix_div(matrix_t *self, double value) { - double_array_div(self->values, value, self->m * self->n); -} - -inline bool matrix_div_matrix(matrix_t *self, matrix_t *other) { - if (self->m != other->m || self->n != other->n) return false; - double_array_div_array(self->values, other->values, self->m * self->n); - return true; -} - -inline bool 
matrix_div_matrix_times_scalar(matrix_t *self, matrix_t *other, double v) { - if (self->m != other->m || self->n != other->n) return false; - double_array_div_array_times_scalar(self->values, other->values, v, self->m * self->n); - return true; -} - - -inline void matrix_mul(matrix_t *self, double value) { - double_array_mul(self->values, value, self->m * self->n); -} - -inline bool matrix_mul_matrix(matrix_t *self, matrix_t *other) { - if (self->m != other->m || self->n != other->n) return false; - double_array_mul_array(self->values, other->values, self->m * self->n); - return true; -} - -inline bool matrix_mul_matrix_times_scalar(matrix_t *self, matrix_t *other, double v) { - if (self->m != other->m || self->n != other->n) return false; - double_array_mul_array_times_scalar(self->values, other->values, v, self->m * self->n); - return true; -} - -inline void matrix_add(matrix_t *self, double value) { - double_array_add(self->values, self->m * self->n, value); -} - - -inline bool matrix_add_matrix(matrix_t *self, matrix_t *other) { - if (self->m != other->m || self->n != other->n) return false; - double_array_add_array(self->values, other->values, self->m * self->n); - return true; -} - -inline bool matrix_add_matrix_times_scalar(matrix_t *self, matrix_t *other, double v) { - if (self->m != other->m || self->n != other->n) return false; - double_array_add_array_times_scalar(self->values, other->values, v, self->m * self->n); - return true; -} - -inline void matrix_sub(matrix_t *self, double value) { - double_array_sub(self->values, value, self->m * self->n); -} - -inline bool matrix_sub_matrix(matrix_t *self, matrix_t *other) { - if (self->m != other->m || self->n != other->n) return false; - double_array_sub_array(self->values, other->values, self->m * self->n); - return true; -} - -inline bool matrix_sub_matrix_times_scalar(matrix_t *self, matrix_t *other, double v) { - if (self->m != other->m || self->n != other->n) return false; - double_array_sub_array_times_scalar(self->values, other->values, v, self->m * self->n); - return true; -} - - -inline void matrix_log(matrix_t *self) { - double_array_log(self->values, self->m * self->n); -} - -inline void matrix_exp(matrix_t *self) { - double_array_exp(self->values, self->m * self->n); -} - -void matrix_dot_vector(matrix_t *self, double *vec, double *result) { - double *values = self->values; - size_t n = self->n; - for (int i = 0; i < self->m; i++) { - for (int j = 0; j < n; j++) { - result[i] += values[n * i + j] * vec[j]; - } - } -} - -inline bool matrix_dot_matrix(matrix_t *m1, matrix_t *m2, matrix_t *result) { - if (m1->n != m2->m || m1->m != result->m || m2->n != result->n) { - return false; - } - - size_t m1_rows = m1->m; - size_t m1_cols = m1->n; - size_t m2_rows = m2->m; - size_t m2_cols = m2->n; - - double *m1_values = m1->values; - double *m2_values = m2->values; - double *result_values = result->values; - - for (int i = 0; i < m1_rows; i++) { - for (int j = 0; j < m2_cols; j++) { - size_t result_index = m2_cols * i + j; - result_values[result_index] = 0.0; - for (int k = 0; k < m2_rows; k++) { - result_values[result_index] += m1_values[m1_cols * i + k] * m2_values[m2_cols * k + j]; - } - } - } - - return true; -} - -matrix_t *matrix_read(FILE *f) { - matrix_t *mat = malloc(sizeof(matrix_t)); - if (mat == NULL) return NULL; - - mat->values = NULL; - - uint64_t m = 0; - uint64_t n = 0; - - if (!file_read_uint64(f, &m) || - !file_read_uint64(f, &n)) { - goto exit_matrix_allocated; - } - - mat->m = (size_t)m; - mat->n = (size_t)n; - 
- size_t len_data = mat->m * mat->n; - - double *data = malloc(len_data * sizeof(double)); - if (data == NULL) { - printf("data alloc\n"); - goto exit_matrix_allocated; - } - - if (!file_read_double_array(f, data, len_data)) { - free(data); - goto exit_matrix_allocated; - } - - mat->values = data; - - return mat; - -exit_matrix_allocated: - matrix_destroy(mat); - return NULL; -} - -bool matrix_write(matrix_t *self, FILE *f) { - if (self == NULL || self->values == NULL) { - return false; - } - - if (!file_write_uint64(f, (uint64_t)self->m) || - !file_write_uint64(f, (uint64_t)self->n)) { - return false; - } - - uint64_t len_data = (uint64_t)self->m * (uint64_t)self->n; - - for (uint64_t i = 0; i < len_data; i++) { - if (!file_write_double(f, self->values[i])) { - return false; - } - } - - return true; -} - - -void matrix_destroy(matrix_t *self) { - if (self == NULL) return; - - if (self->values != NULL) { - free(self->values); - } - - free(self); -} diff --git a/src/matrix.h b/src/matrix.h index 7bb14b97..7ec7df6a 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -6,60 +6,374 @@ #include <stdio.h> #include <stdlib.h> +#include <string.h> + #include "collections.h" +#include "file_utils.h" +#include "vector.h" +#include "vector_math.h" -typedef struct matrix { - size_t m; - size_t n; - double *values; -} matrix_t; +#ifdef HAVE_BLAS +#include <cblas.h> +#else +#warning "No CBLAS" +#endif -matrix_t *matrix_new(size_t m, size_t n); -matrix_t *matrix_new_value(size_t m, size_t n, double value); -matrix_t *matrix_new_values(size_t m, size_t n, double *values); -matrix_t *matrix_new_zeros(size_t m, size_t n); -matrix_t *matrix_new_ones(size_t m, size_t n); +#define MATRIX_INIT(name, type, type_name, array_type) \ + typedef struct { \ + size_t m, n; \ + type *values; \ + } name##_t; \ + \ + static name##_t *name##_new(size_t m, size_t n) { \ + name##_t *matrix = malloc(sizeof(name##_t)); \ + \ + if (matrix == NULL) { \ + return NULL; \ + } \ + \ + matrix->m = m; \ + matrix->n = n; \ + \ + matrix->values = malloc(sizeof(type) * m * n); \ + if (matrix->values == NULL) { \ + free(matrix); \ + return NULL; \ + } \ + \ + return matrix; \ + \ + } \ + \ + static void name##_destroy(name##_t *self) { \ + if (self == NULL) return; \ + \ + if (self->values != NULL) { \ + free(self->values); \ + } \ + \ + free(self); \ + } \ + \ + static inline void name##_zero(name##_t *self) { \ + memset(self->values, 0, self->m * self->n * sizeof(type)); \ + } \ + \ + \ + static inline bool name##_resize(name##_t *self, size_t m, size_t n) { \ + if (self == NULL) return false; \ + \ + if (m * n > (self->m * self->n)) { \ + type *new_values = realloc(self->values, sizeof(type) * m * n); \ + if (new_values == NULL) { \ + return false; \ + } \ + self->values = new_values; \ + } \ + \ + self->m = m; \ + self->n = n; \ + \ + name##_zero(self); \ + \ + return true; \ + } \ + \ + static inline name##_t *name##_new_copy(name##_t *self) { \ + name##_t *cpy = name##_new(self->m, self->n); \ + size_t num_values = self->m * self->n; \ + memcpy(cpy->values, self->values, num_values * sizeof(type)); \ + \ + return cpy; \ + } \ + \ + static inline bool name##_copy(name##_t *self, name##_t *other) { \ + if (self->m != other->m || self->n != other->n) { \ + return false; \ + } \ + size_t num_values = self->m * self->n; \ + \ + memcpy(other->values, self->values, num_values * sizeof(type)); \ + return true; \ + } \ + \ + static inline void name##_init_values(name##_t *self, type *values) { \ + size_t num_values = self->m * self->n; \ + memcpy(self->values, values, num_values * sizeof(type)); \ + } \ + \ + static inline void
name##_set(name##_t *self, type value) { \ + array_type##_set(self->values, value, self->m * self->n); \ + } \ + \ + static inline void name##_set_row(name##_t *self, size_t index, type *row) { \ + size_t offset = index * self->n; \ + type *values = self->values; \ + size_t n = self->n; \ + memcpy(values + offset, row, n * sizeof(type)); \ + } \ + \ + static inline void name##_set_scalar(name##_t *self, size_t row_index, size_t col_index, type value) { \ + size_t offset = row_index * self->n + col_index; \ + self->values[offset] = value; \ + } \ + \ + static inline void name##_add_scalar(name##_t *self, size_t row_index, size_t col_index, type value) { \ + size_t offset = row_index * self->n + col_index; \ + self->values[offset] += value; \ + } \ + \ + static inline void name##_sub_scalar(name##_t *self, size_t row_index, size_t col_index, type value) { \ + size_t offset = row_index * self->n + col_index; \ + self->values[offset] -= value; \ + } \ + \ + static inline void name##_mul_scalar(name##_t *self, size_t row_index, size_t col_index, type value) { \ + size_t offset = row_index * self->n + col_index; \ + self->values[offset] *= value; \ + } \ + \ + static inline void name##_div_scalar(name##_t *self, size_t row_index, size_t col_index, type value) { \ + size_t offset = row_index * self->n + col_index; \ + self->values[offset] /= value; \ + } \ + \ + static inline type name##_get(name##_t *self, size_t row_index, size_t col_index) { \ + size_t index = row_index * self->n + col_index; \ + return self->values[index]; \ + } \ + \ + static inline type *name##_get_row(name##_t *self, size_t row_index) { \ + size_t index = row_index * self->n; \ + return self->values + index; \ + } \ + \ + static inline name##_t *name##_new_value(size_t m, size_t n, type value) { \ + name##_t *matrix = name##_new(m, n); \ + name##_set(matrix, value); \ + return matrix; \ + } \ + \ + static inline name##_t *name##_new_zeros(size_t m, size_t n) { \ + name##_t *matrix = name##_new(m, n); \ + name##_zero(matrix); \ + return matrix; \ + } \ + \ + static inline name##_t *name##_new_ones(size_t m, size_t n) { \ + return name##_new_value(m, n, (type)1); \ + } \ + \ + static inline name##_t *name##_new_values(size_t m, size_t n, type *values) { \ + name##_t *matrix = name##_new(m, n); \ + memcpy(matrix->values, values, m * n * sizeof(type)); \ + return matrix; \ + } \ + \ + static inline void name##_div(name##_t *self, type value) { \ + array_type##_div(self->values, value, self->m * self->n); \ + } \ + \ + static inline bool name##_div_matrix(name##_t *self, name##_t *other) { \ + if (self->m != other->m || self->n != other->n) return false; \ + array_type##_div_array(self->values, other->values, self->m * self->n); \ + return true; \ + } \ + \ + static inline bool name##_div_matrix_times_scalar(name##_t *self, name##_t *other, type v) { \ + if (self->m != other->m || self->n != other->n) return false; \ + array_type##_div_array_times_scalar(self->values, other->values, v, self->m * self->n); \ + return true; \ + } \ + \ + static inline void name##_mul(name##_t *self, type value) { \ + array_type##_mul(self->values, value, self->m * self->n); \ + } \ + \ + static inline bool name##_mul_matrix(name##_t *self, name##_t *other) { \ + if (self->m != other->m || self->n != other->n) return false; \ + array_type##_mul_array(self->values, other->values, self->m * self->n); \ + return true; \ + } \ + \ + static inline bool name##_mul_matrix_times_scalar(name##_t *self, name##_t *other, type v) { \ + if (self->m != other->m 
|| self->n != other->n) return false; \ + array_type##_mul_array_times_scalar(self->values, other->values, v, self->m * self->n); \ + return true; \ + } \ + \ + static inline void name##_add(name##_t *self, type value) { \ + array_type##_add(self->values, self->m * self->n, value); \ + } \ + \ + \ + static inline bool name##_add_matrix(name##_t *self, name##_t *other) { \ + if (self->m != other->m || self->n != other->n) return false; \ + array_type##_add_array(self->values, other->values, self->m * self->n); \ + return true; \ + } \ + \ + static inline bool name##_add_matrix_times_scalar(name##_t *self, name##_t *other, type v) { \ + if (self->m != other->m || self->n != other->n) return false; \ + array_type##_add_array_times_scalar(self->values, other->values, v, self->m * self->n); \ + return true; \ + } \ + \ + static inline void name##_sub(name##_t *self, type value) { \ + array_type##_sub(self->values, value, self->m * self->n); \ + } \ + \ + static inline bool name##_sub_matrix(name##_t *self, name##_t *other) { \ + if (self->m != other->m || self->n != other->n) return false; \ + array_type##_sub_array(self->values, other->values, self->m * self->n); \ + return true; \ + } \ + \ + static inline bool name##_sub_matrix_times_scalar(name##_t *self, name##_t *other, type v) { \ + if (self->m != other->m || self->n != other->n) return false; \ + array_type##_sub_array_times_scalar(self->values, other->values, v, self->m * self->n); \ + return true; \ + } \ + \ + static name##_t *name##_read(FILE *f) { \ + name##_t *mat = malloc(sizeof(name##_t)); \ + if (mat == NULL) return NULL; \ + \ + mat->values = NULL; \ + \ + uint64_t m = 0; \ + uint64_t n = 0; \ + \ + if (!file_read_uint64(f, &m) || \ + !file_read_uint64(f, &n)) { \ + goto exit_##name##_allocated; \ + } \ + \ + mat->m = (size_t)m; \ + mat->n = (size_t)n; \ + \ + size_t len_data = mat->m * mat->n; \ + \ + type *data = malloc(len_data * sizeof(type)); \ + if (data == NULL) { \ + log_error("error in data malloc\n"); \ + goto exit_##name##_allocated; \ + } \ + \ + if (!file_read_##array_type(f, data, len_data)) { \ + free(data); \ + goto exit_##name##_allocated; \ + } \ + \ + mat->values = data; \ + \ + return mat; \ + \ + exit_##name##_allocated: \ + name##_destroy(mat); \ + return NULL; \ + } \ + \ + static bool name##_write(name##_t *self, FILE *f) { \ + if (self == NULL || self->values == NULL) { \ + return false; \ + } \ + \ + if (!file_write_uint64(f, (uint64_t)self->m) || \ + !file_write_uint64(f, (uint64_t)self->n)) { \ + return false; \ + } \ + \ + uint64_t len_data = (uint64_t)self->m * (uint64_t)self->n; \ + \ + for (uint64_t i = 0; i < len_data; i++) { \ + if (!file_write_##type_name(f, self->values[i])) { \ + return false; \ + } \ + } \ + \ + return true; \ + } -bool matrix_resize(matrix_t *self, size_t m, size_t n); -matrix_t *matrix_new_copy(matrix_t *self); -bool matrix_copy(matrix_t *self, matrix_t *other); +#define MATRIX_INIT_FLOAT_BASE(name, type, type_name, array_type) \ + MATRIX_INIT(name, type, type_name, array_type) \ + \ + static inline void name##_log(name##_t *self) { \ + array_type##_log(self->values, self->m * self->n); \ + } \ + \ + static inline void name##_exp(name##_t *self) { \ + array_type##_exp(self->values, self->m * self->n); \ + } \ + \ + static inline void name##_dot_vector(name##_t *self, type *vec, type *result) { \ + type *values = self->values; \ + size_t m = self->m; \ + size_t n = self->n; \ + for (size_t i = 0; i < m; i++) { \ + for (size_t j = 0; j < n; j++) { \ + result[i] += values[n 
* i + j] * vec[j]; \ + } \ + } \ + } -void matrix_init_values(matrix_t *self, double *values); -void matrix_set(matrix_t *self, double value); -void matrix_zero(matrix_t *self); -void matrix_set_row(matrix_t *self, size_t index, double *row); -void matrix_set_scalar(matrix_t *self, size_t row_index, size_t col_index, double value); -void matrix_add_scalar(matrix_t *self, size_t row_index, size_t col_index, double value); -void matrix_sub_scalar(matrix_t *self, size_t row_index, size_t col_index, double value); -void matrix_mul_scalar(matrix_t *self, size_t row_index, size_t col_index, double value); -void matrix_div_scalar(matrix_t *self, size_t row_index, size_t col_index, double value); -double matrix_get(matrix_t *self, size_t row_index, size_t col_index); -double *matrix_get_row(matrix_t *self, size_t row_index); +#ifdef HAVE_BLAS +#define MATRIX_INIT_FLOAT(name, type, type_name, array_type, blas_prefix) \ + MATRIX_INIT_FLOAT_BASE(name, type, type_name, array_type) \ + \ + static inline bool name##_dot_matrix(name##_t *m1, name##_t *m2, name##_t *result) { \ + if (m1->n != m2->m || m1->m != result->m || m2->n != result->n) { \ + return false; \ + } \ + \ + log_debug("doing CBLAS\n"); \ + cblas_##blas_prefix##gemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, \ + m1->m, m2->n, m1->n, 1.0, \ + m1->values, m1->n, \ + m2->values, m2->n, 0.0, \ + result->values, result->n \ + ); \ + \ + return true; \ + } -void matrix_add(matrix_t *self, double value); -bool matrix_add_matrix(matrix_t *self, matrix_t *other); -bool matrix_add_matrix_times_scalar(matrix_t *self, matrix_t *other, double v); -void matrix_sub(matrix_t *self, double value); -bool matrix_sub_matrix(matrix_t *self, matrix_t *other); -bool matrix_sub_matrix_times_scalar(matrix_t *self, matrix_t *other, double v); -void matrix_mul(matrix_t *self, double value); -bool matrix_mul_matrix(matrix_t *self, matrix_t *other); -bool matrix_mul_matrix_times_scalar(matrix_t *self, matrix_t *other, double v); -void matrix_div(matrix_t *self, double value); -bool matrix_div_matrix(matrix_t *self, matrix_t *other); -bool matrix_div_matrix_times_scalar(matrix_t *self, matrix_t *other, double v); +#else +#define MATRIX_INIT_FLOAT(name, type, type_name, array_type, blas_prefix) \ + MATRIX_INIT_FLOAT_BASE(name, type, type_name, array_type) \ + \ + static inline bool name##_dot_matrix(name##_t *m1, name##_t *m2, name##_t *result) { \ + if (m1->n != m2->m || m1->m != result->m || m2->n != result->n) { \ + return false; \ + } \ + \ + size_t m1_rows = m1->m; \ + size_t m1_cols = m1->n; \ + size_t m2_rows = m2->m; \ + size_t m2_cols = m2->n; \ + \ + type *m1_values = m1->values; \ + type *m2_values = m2->values; \ + type *result_values = result->values; \ + \ + for (size_t i = 0; i < m1_rows; i++) { \ + for (size_t j = 0; j < m2_cols; j++) { \ + size_t result_index = m2_cols * i + j; \ + result_values[result_index] = 0.0; \ + for (size_t k = 0; k < m2_rows; k++) { \ + result_values[result_index] += m1_values[m1_cols * i + k] * m2_values[m2_cols * k + j]; \ + } \ + } \ + } \ + \ + return true; \ + } +#endif -void matrix_log(matrix_t *self); -void matrix_exp(matrix_t *self); +MATRIX_INIT(uint32_matrix, uint32_t, uint32, uint32_array) -void matrix_dot_vector(matrix_t *self, double *vec, double *result); -bool matrix_dot_matrix(matrix_t *m1, matrix_t *m2, matrix_t *result); +MATRIX_INIT_FLOAT(float_matrix, float, float, float_array,s) +MATRIX_INIT_FLOAT(double_matrix, double, double, double_array,d) -matrix_t *matrix_read(FILE *f); -bool matrix_write(matrix_t 
*self, FILE *f); - -void matrix_destroy(matrix_t *self); #endif diff --git a/src/sparse_matrix.c b/src/sparse_matrix.c index dc9fa7fc..6a5911f8 100644 --- a/src/sparse_matrix.c +++ b/src/sparse_matrix.c @@ -231,7 +231,7 @@ int sparse_matrix_sum_rows(sparse_matrix_t *self, uint32_t *rows, size_t m, doub -int sparse_matrix_dot_dense(sparse_matrix_t *self, matrix_t *matrix, matrix_t *result) { +int sparse_matrix_dot_dense(sparse_matrix_t *self, double_matrix_t *matrix, double_matrix_t *result) { if (self->n != matrix->m || self->m != result->m || matrix->n != result->n) { return -1; } @@ -267,7 +267,7 @@ int sparse_matrix_dot_dense(sparse_matrix_t *self, matrix_t *matrix, matrix_t *r -int sparse_matrix_dot_sparse(sparse_matrix_t *self, sparse_matrix_t *other, matrix_t *result) { +int sparse_matrix_dot_sparse(sparse_matrix_t *self, sparse_matrix_t *other, double_matrix_t *result) { if (self->n != other->m || self->m != result->m || other->n != result->n) { return -1; } diff --git a/src/sparse_matrix.h b/src/sparse_matrix.h index 18377038..bc23c068 100644 --- a/src/sparse_matrix.h +++ b/src/sparse_matrix.h @@ -78,8 +78,8 @@ int sparse_matrix_rows_sum_cols(sparse_matrix_t *self, uint32_t *rows, size_t m, int sparse_matrix_sum_all_rows(sparse_matrix_t *self, double *result, size_t n); int sparse_matrix_sum_rows(sparse_matrix_t *self, uint32_t *rows, size_t m, double *result, size_t n); -int sparse_matrix_dot_dense(sparse_matrix_t *self, matrix_t *matrix, matrix_t *result); -int sparse_matrix_dot_sparse(sparse_matrix_t *self, sparse_matrix_t *other, matrix_t *result); +int sparse_matrix_dot_dense(sparse_matrix_t *self, double_matrix_t *matrix, double_matrix_t *result); +int sparse_matrix_dot_sparse(sparse_matrix_t *self, sparse_matrix_t *other, double_matrix_t *result); bool sparse_matrix_write(sparse_matrix_t *self, FILE *f); sparse_matrix_t *sparse_matrix_read(FILE *f); diff --git a/src/sparse_matrix_utils.c b/src/sparse_matrix_utils.c index fa87ef0d..dbbc0b62 100644 --- a/src/sparse_matrix_utils.c +++ b/src/sparse_matrix_utils.c @@ -38,7 +38,9 @@ bool sparse_matrix_add_unique_columns(sparse_matrix_t *matrix, khash_t(int_set) } uint32_array_clear(array); - uint32_array_resize(array, kh_size(unique_columns)); + if (!uint32_array_resize(array, kh_size(unique_columns))) { + return false; + } khint_t k; diff --git a/src/sparse_matrix_utils.h b/src/sparse_matrix_utils.h index c19c3b21..a6f44558 100644 --- a/src/sparse_matrix_utils.h +++ b/src/sparse_matrix_utils.h @@ -6,7 +6,7 @@ #include "sparse_matrix.h" #include "matrix.h" -sparse_matrix_t *sparse_matrix_new_from_matrix(matrix_t *matrix); +sparse_matrix_t *sparse_matrix_new_from_matrix(double_matrix_t *matrix); uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix); bool sparse_matrix_add_unique_columns(sparse_matrix_t *matrix, khash_t(int_set) *unique_columns, uint32_array *array); diff --git a/src/stochastic_gradient_descent.c b/src/stochastic_gradient_descent.c index b9439e86..c3e98d2a 100644 --- a/src/stochastic_gradient_descent.c +++ b/src/stochastic_gradient_descent.c @@ -1,6 +1,6 @@ #include "stochastic_gradient_descent.h" -bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gamma) { +bool stochastic_gradient_descent(double_matrix_t *theta, double_matrix_t *gradient, double gamma) { if (gradient->m != theta->m || gradient->n != theta->n) { return false; } @@ -8,10 +8,10 @@ bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gam size_t m = gradient->m; size_t n = gradient->n; - 
return matrix_sub_matrix_times_scalar(theta, gradient, gamma); + return double_matrix_sub_matrix_times_scalar(theta, gradient, gamma); } -bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma) { +bool stochastic_gradient_descent_sparse(double_matrix_t *theta, double_matrix_t *gradient, uint32_array *update_indices, double gamma) { if (gradient->m != theta->m || gradient->n != theta->n) { return false; } @@ -90,7 +90,7 @@ static inline void regularize_row(double *theta_i, size_t n, double lambda, uint double_array_mul(theta_i, update, n); } -bool stochastic_gradient_descent_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0) { +bool stochastic_gradient_descent_regularize_weights(double_matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0) { if (lambda > 0.0) { uint32_t *updates = last_updated->a; @@ -101,7 +101,7 @@ bool stochastic_gradient_descent_regularize_weights(matrix_t *theta, uint32_arra for (size_t i = 0; i < batch_rows; i++) { uint32_t row = rows[i]; - double *theta_i = matrix_get_row(theta, row); + double *theta_i = double_matrix_get_row(theta, row); uint32_t last_updated = updates[row]; double gamma_t = stochastic_gradient_descent_gamma_t(gamma_0, lambda, t - last_updated); @@ -114,14 +114,14 @@ bool stochastic_gradient_descent_regularize_weights(matrix_t *theta, uint32_arra return true; } -inline bool stochastic_gradient_descent_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0) { +inline bool stochastic_gradient_descent_finalize_weights(double_matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0) { if (lambda > 0.0) { uint32_t *updates = last_updated->a; size_t m = theta->m; size_t n = theta->n; for (size_t i = 0; i < m; i++) { - double *theta_i = matrix_get_row(theta, i); + double *theta_i = double_matrix_get_row(theta, i); uint32_t last_updated = updates[i]; double gamma_t = stochastic_gradient_descent_gamma_t(gamma_0, lambda, t - last_updated); diff --git a/src/stochastic_gradient_descent.h b/src/stochastic_gradient_descent.h index e0e35287..6077e46f 100644 --- a/src/stochastic_gradient_descent.h +++ b/src/stochastic_gradient_descent.h @@ -16,10 +16,10 @@ gamma_t = gamma_0(1 + gamma_0 * lambda * t)^-1 #include "matrix.h" -bool stochastic_gradient_descent(matrix_t *theta, matrix_t *gradient, double gamma); -bool stochastic_gradient_descent_sparse(matrix_t *theta, matrix_t *gradient, uint32_array *update_indices, double gamma); -bool stochastic_gradient_descent_regularize_weights(matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0); -bool stochastic_gradient_descent_finalize_weights(matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0); +bool stochastic_gradient_descent(double_matrix_t *theta, double_matrix_t *gradient, double gamma); +bool stochastic_gradient_descent_sparse(double_matrix_t *theta, double_matrix_t *gradient, uint32_array *update_indices, double gamma); +bool stochastic_gradient_descent_regularize_weights(double_matrix_t *theta, uint32_array *update_indices, uint32_array *last_updated, uint32_t t, double lambda, double gamma_0); +bool stochastic_gradient_descent_finalize_weights(double_matrix_t *theta, uint32_array *last_updated, uint32_t t, double lambda, double 
gamma_0); double stochastic_gradient_descent_gamma_t(double gamma_0, double lambda, uint32_t t);
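
Reviewer note: a minimal usage sketch (not part of the patch) of the macro-generated API this change introduces. double_matrix_t and the double_matrix_* functions below are all defined by the MATRIX_INIT_FLOAT(double_matrix, ...) instantiation in src/matrix.h; with HAVE_BLAS defined at build time, double_matrix_dot_matrix dispatches to cblas_dgemm, otherwise it falls back to the naive triple loop, with the same result either way. The test file itself is hypothetical.

#include <assert.h>
#include <stdio.h>
#include "matrix.h"

int main(void) {
    /* A is 2x3, B is 3x2, C = A.B is 2x2, all row-major */
    double a_values[] = {1.0, 2.0, 3.0,
                         4.0, 5.0, 6.0};
    double b_values[] = {7.0,  8.0,
                         9.0,  10.0,
                         11.0, 12.0};

    double_matrix_t *a = double_matrix_new_values(2, 3, a_values);
    double_matrix_t *b = double_matrix_new_values(3, 2, b_values);
    double_matrix_t *c = double_matrix_new_zeros(2, 2);
    assert(a != NULL && b != NULL && c != NULL);

    /* cblas_dgemm under HAVE_BLAS, naive loop otherwise */
    assert(double_matrix_dot_matrix(a, b, c));

    /* C[0][0] = 1*7 + 2*9 + 3*11 = 58 */
    printf("%.1f %.1f\n%.1f %.1f\n",
           double_matrix_get(c, 0, 0), double_matrix_get(c, 0, 1),
           double_matrix_get(c, 1, 0), double_matrix_get(c, 1, 1));

    double_matrix_destroy(a);
    double_matrix_destroy(b);
    double_matrix_destroy(c);
    return 0;
}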
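
A second sketch, also illustrative only, for the new float serialization path: file_write_float stores each float's IEEE-754 bit pattern big-endian as a uint32, and file_read_float_array reads them back in bulk, so a float_matrix round-trips through the generated float_matrix_write / float_matrix_read (two uint64 dimensions followed by m*n packed floats). The path "matrix.bin" is just an example; the chosen values are exactly representable in binary, so the equality check is exact.

#include <assert.h>
#include <stdio.h>
#include "matrix.h"

int main(void) {
    float values[] = {1.5f, -2.25f, 0.0f, 3.75f};
    float_matrix_t *m = float_matrix_new_values(2, 2, values);
    assert(m != NULL);

    /* Serialize: dims as uint64, then per-element file_write_float */
    FILE *f = fopen("matrix.bin", "wb");
    assert(f != NULL && float_matrix_write(m, f));
    fclose(f);

    /* Deserialize via file_read_float_array */
    f = fopen("matrix.bin", "rb");
    assert(f != NULL);
    float_matrix_t *copy = float_matrix_read(f);
    fclose(f);

    assert(copy != NULL && copy->m == 2 && copy->n == 2);
    assert(float_matrix_get(copy, 1, 1) == 3.75f);

    float_matrix_destroy(m);
    float_matrix_destroy(copy);
    return 0;
}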