[features] Functions for dealing with minibatches

This commit is contained in:
Al
2016-01-08 00:48:11 -05:00
parent 06638d2885
commit ba8fc716df
2 changed files with 241 additions and 0 deletions

215
src/minibatch.c Normal file
View File

@@ -0,0 +1,215 @@
#include "minibatch.h"
#include "float_utils.h"
#define BIAS_FEATURE_ID 0
bool count_features_minibatch(khash_t(str_double) *feature_counts, feature_count_array *minibatch, bool unique) {
const char *feature;
uint32_t feature_id;
double count;
size_t i;
size_t m = minibatch->n;
for (i = 0; i < minibatch->n; i++) {
khash_t(str_double) *counts = minibatch->a[i];
kh_foreach(counts, feature, count, {
// If unique is true, count features once per example
double value = unique ? 1.0 : count;
if (!feature_counts_add(feature_counts, (char *)feature, value)) {
return false;
}
})
}
return true;
}
bool count_labels_minibatch(khash_t(str_uint32) *label_ids, cstring_array *labels) {
uint32_t i;
char *label;
cstring_array_foreach(labels, i, label, {
khiter_t k = kh_get(str_uint32, label_ids, label);
if (k != kh_end(label_ids)) {
kh_value(label_ids, k)++;
} else {
int ret = 0;
k = kh_put(str_uint32, label_ids, strdup(label), &ret);
if (ret < 0) {
return false;
}
kh_value(label_ids, k) = 1;
}
})
return true;
}
trie_t *select_features_threshold(khash_t(str_double) *feature_counts, double threshold) {
const char *feature;
double count;
int ret = 0;
// First feature is the bias unit, so start from 1
uint32_t feature_id = 1;
khash_t(str_uint32) *feature_ids = kh_init(str_uint32);
size_t n = kh_size(feature_counts);
bool reversed = true;
char **sorted_keys = str_double_hash_sort_keys_by_value(feature_counts, reversed);
log_info("Sort done\n");
for (size_t i = 0; i < n; i++) {
char *key = sorted_keys[i];
khiter_t k = kh_get(str_double, feature_counts, key);
if (k == kh_end(feature_counts)) {
goto exit_destroy_feature_ids;
}
if (strlen(key) == 0) continue;
count = kh_value(feature_counts, k);
if (count < threshold && !double_equals(count, threshold)) continue;
// feature_ids is a local hash, don't need to strdup the key on put
k = kh_put(str_uint32, feature_ids, key, &ret);
if (ret < 0) {
goto exit_destroy_feature_ids;
}
kh_value(feature_ids, k) = feature_id++;
}
trie_t *trie = trie_new_from_hash(feature_ids);
free(sorted_keys);
kh_destroy(str_uint32, feature_ids);
return trie;
exit_destroy_feature_ids:
free(sorted_keys);
kh_destroy(str_uint32, feature_ids);
return NULL;
}
khash_t(str_uint32) *select_labels_threshold(khash_t(str_uint32) *label_counts, uint32_t threshold) {
const char *label;
uint32_t count;
int ret = 0;
uint32_t label_id = 0;
khash_t(str_uint32) *label_ids = kh_init(str_uint32);
size_t n = kh_size(label_counts);
bool reversed = true;
char **sorted_keys = str_uint32_hash_sort_keys_by_value(label_counts, reversed);
for (size_t i = 0; i < n; i++) {
char *label = sorted_keys[i];
khiter_t k = kh_get(str_uint32, label_counts, label);
if (k == kh_end(label_counts)) {
goto exit_destroy_label_ids;
}
count = kh_value(label_counts, k);
if (count < threshold) continue;
k = kh_put(str_uint32, label_ids, label, &ret);
if (ret < 0) {
goto exit_destroy_label_ids;
}
kh_value(label_ids, k) = label_id++;
}
free(sorted_keys);
return label_ids;
exit_destroy_label_ids:
free(sorted_keys);
kh_destroy(str_uint32, label_ids);
return NULL;
}
sparse_matrix_t *feature_matrix(trie_t *feature_ids, feature_count_array *feature_counts) {
const char *feature;
uint32_t feature_id;
double count;
size_t i;
size_t m = feature_counts->n;
// Add one feature for bias unit
size_t n = trie_num_keys(feature_ids) + 1;
sparse_matrix_t *matrix = sparse_matrix_new_shape(m, n);
for (i = 0; i < m; i++) {
khash_t(str_double) *counts = feature_counts->a[i];
sparse_matrix_append(matrix, BIAS_FEATURE_ID, 1.0);
kh_foreach(counts, feature, count, {
if (!trie_get_data(feature_ids, (char *)feature, &feature_id)) {
continue;
}
sparse_matrix_append(matrix, feature_id, count);
})
sparse_matrix_finalize_row(matrix);
}
return matrix;
}
sparse_matrix_t *feature_vector(trie_t *feature_ids, khash_t(str_double) *feature_counts) {
const char *feature;
uint32_t feature_id;
double count;
size_t m = 1;
// Add one feature for bias unit
size_t n = trie_num_keys(feature_ids) + 1;
sparse_matrix_t *matrix = sparse_matrix_new_shape(m, n);
sparse_matrix_append(matrix, BIAS_FEATURE_ID, 1.0);
kh_foreach(feature_counts, feature, count, {
if (!trie_get_data(feature_ids, (char *)feature, &feature_id)) {
continue;
}
sparse_matrix_append(matrix, feature_id, count);
})
sparse_matrix_finalize_row(matrix);
return matrix;
}
uint32_array *label_vector(khash_t(str_uint32) *label_ids, cstring_array *labels) {
uint32_t i;
char *label;
uint32_t label_id;
uint32_array *array = uint32_array_new_size(cstring_array_num_strings(labels));
cstring_array_foreach(labels, i, label, {
khiter_t k = kh_get(str_uint32, label_ids, label);
if (k != kh_end(label_ids)) {
label_id = kh_value(label_ids, k);
uint32_array_push(array, label_id);
}
});
return array;
}

26
src/minibatch.h Normal file
View File

@@ -0,0 +1,26 @@
#ifndef MINIBATCH_H
#define MINIBATCH_H
#include <stdio.h>
#include <stdlib.h>
#include "collections.h"
#include "features.h"
#include "sparse_matrix.h"
#include "trie.h"
#include "trie_utils.h"
#include "vector_math.h"
bool count_features_minibatch(khash_t(str_double) *feature_counts, feature_count_array *minibatch, bool unique);
bool count_labels_minibatch(khash_t(str_uint32) *label_counts, cstring_array *labels);
trie_t *select_features_threshold(khash_t(str_double) *feature_counts, double threshold);
khash_t(str_uint32) *select_labels_threshold(khash_t(str_uint32) *label_counts, uint32_t threshold);
sparse_matrix_t *feature_matrix(trie_t *feature_ids, feature_count_array *feature_counts);
sparse_matrix_t *feature_vector(trie_t *feature_ids, khash_t(str_double) *feature_counts);
uint32_array *label_vector(khash_t(str_uint32) *label_ids, cstring_array *labels);
#endif