From 96e1ca5e896c4a5913971f7a6e09115dc567c939 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 2 Apr 2017 13:48:46 -0400 Subject: [PATCH] [utils] sparse_matrix_add_unique_columns_alias adds the actual column indices to the hashtable/array and aliases those in the table from 0 to N-1 (where N is the number of unique columns in this batch). This way it's compatible with smaller matrices of batch weights. --- src/sparse_matrix_utils.c | 72 +++++++++++++++++++++++++++++++-------- src/sparse_matrix_utils.h | 4 ++- 2 files changed, 60 insertions(+), 16 deletions(-) diff --git a/src/sparse_matrix_utils.c b/src/sparse_matrix_utils.c index 614d76fd..53fcaf97 100644 --- a/src/sparse_matrix_utils.c +++ b/src/sparse_matrix_utils.c @@ -19,38 +19,80 @@ sparse_matrix_t *sparse_matrix_new_from_matrix(double_matrix_t *matrix) { } -bool sparse_matrix_add_unique_columns(sparse_matrix_t *matrix, khash_t(int_set) *unique_columns, uint32_array *array) { +bool sparse_matrix_add_unique_columns(sparse_matrix_t *matrix, khash_t(int_uint32) *unique_columns, uint32_array *array) { size_t n = matrix->indices->n; uint32_t *indices = matrix->indices->a; - kh_clear(int_set, unique_columns); + kh_clear(int_uint32, unique_columns); size_t i; + khiter_t k; for (i = 0; i < n; i++) { uint32_t col = indices[i]; - int ret; - kh_put(int_set, unique_columns, (khint_t)col, &ret); - if (ret < 0) { + int ret = 0; + k = kh_get(int_uint32, unique_columns, col); + if (k == kh_end(unique_columns)) { + uint32_t next_id = (uint32_t)kh_size(unique_columns); + + k = kh_put(int_uint32, unique_columns, col, &ret); + if (ret < 0) { + return false; + } + kh_value(unique_columns, k) = next_id; + } + + } + + uint32_array_clear(array); + if (!uint32_array_resize_fixed(array, kh_size(unique_columns))) { + return false; + } + + khint_t key; + + uint32_t *batch = array->a; + uint32_t col_id; + + kh_foreach(unique_columns, key, col_id, { + batch[col_id] = (uint32_t)key; + }) + + return true; +} + +bool 
sparse_matrix_alias_columns(sparse_matrix_t *matrix, khash_t(int_uint32) *unique_columns) { + size_t n = matrix->indices->n; + uint32_t *indices = matrix->indices->a; + + size_t i; + khiter_t k; + uint32_t col_id; + + for (i = 0; i < n; i++) { + uint32_t col = indices[i]; + + int ret = 0; + k = kh_get(int_uint32, unique_columns, col); + if (k != kh_end(unique_columns)) { + col_id = kh_value(unique_columns, k); + indices[i] = col_id; + } else { return false; } } - uint32_array_clear(array); - if (!uint32_array_resize(array, kh_size(unique_columns))) { - return false; - } - - khint_t k; - - kh_foreach_key(unique_columns, k, { - uint32_array_push(array, (uint32_t)k); - }) + matrix->n = kh_size(unique_columns); return true; } +inline bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khash_t(int_uint32) *unique_columns, uint32_array *array) { + return sparse_matrix_add_unique_columns(matrix, unique_columns, array) && + sparse_matrix_alias_columns(matrix, unique_columns); +} + uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix) { khash_t(int_set) *unique_columns = kh_init(int_set); uint32_array *ret = uint32_array_new(); diff --git a/src/sparse_matrix_utils.h b/src/sparse_matrix_utils.h index a6f44558..da9b3bd0 100644 --- a/src/sparse_matrix_utils.h +++ b/src/sparse_matrix_utils.h @@ -8,6 +8,8 @@ sparse_matrix_t *sparse_matrix_new_from_matrix(double_matrix_t *matrix); uint32_array *sparse_matrix_unique_columns(sparse_matrix_t *matrix); -bool sparse_matrix_add_unique_columns(sparse_matrix_t *matrix, khash_t(int_set) *unique_columns, uint32_array *array); +bool sparse_matrix_add_unique_columns(sparse_matrix_t *matrix, khash_t(int_uint32) *unique_columns, uint32_array *array); +bool sparse_matrix_alias_columns(sparse_matrix_t *matrix, khash_t(int_uint32) *unique_columns); +bool sparse_matrix_add_unique_columns_alias(sparse_matrix_t *matrix, khash_t(int_uint32) *unique_columns, uint32_array *array); #endif \ No newline at end of file