diff --git a/src/address_parser_train.c b/src/address_parser_train.c index 2931741a..1818c8c7 100644 --- a/src/address_parser_train.c +++ b/src/address_parser_train.c @@ -29,6 +29,7 @@ KHASH_MAP_INIT_STR(phrase_types, address_parser_types_t) // Training #define DEFAULT_ITERATIONS 5 +#define DEFAULT_MIN_UPDATES 5 #define MIN_VOCAB_COUNT 5 #define MIN_PHRASE_COUNT 1 @@ -692,7 +693,6 @@ address_parser_t *address_parser_init(char *filename) { parser->model = NULL; - size_t num_classes = kh_size(class_counts); log_info("num_classes = %zu\n", num_classes); parser->num_classes = num_classes; @@ -1037,8 +1037,8 @@ exit_epoch_training_started: return true; } -bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_iterations) { - averaged_perceptron_trainer_t *trainer = averaged_perceptron_trainer_new(); +bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_iterations, size_t min_updates) { + averaged_perceptron_trainer_t *trainer = averaged_perceptron_trainer_new(min_updates); for (uint32_t iter = 0; iter < num_iterations; iter++) { log_info("Doing epoch %d\n", iter); @@ -1073,7 +1073,8 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i typedef enum { ADDRESS_PARSER_TRAIN_POSITIONAL_ARG, - ADDRESS_PARSER_TRAIN_ARG_ITERATIONS + ADDRESS_PARSER_TRAIN_ARG_ITERATIONS, + ADDRESS_PARSER_TRAIN_ARG_MIN_UPDATES } address_parser_train_keyword_arg_t; #define USAGE "Usage: ./address_parser_train filename output_dir [--iterations number]\n" @@ -1093,9 +1094,11 @@ int main(int argc, char **argv) { address_parser_train_keyword_arg_t kwarg = ADDRESS_PARSER_TRAIN_POSITIONAL_ARG; size_t num_iterations = DEFAULT_ITERATIONS; + uint64_t min_updates = DEFAULT_MIN_UPDATES; size_t position = 0; ssize_t arg_iterations; + uint64_t arg_min_updates; char *filename = NULL; char *output_dir = NULL; @@ -1108,12 +1111,24 @@ int main(int argc, char **argv) { continue; } + if (string_equals(arg, "--min-updates")) { + kwarg = ADDRESS_PARSER_TRAIN_ARG_MIN_UPDATES; + continue; + } + if (kwarg == ADDRESS_PARSER_TRAIN_ARG_ITERATIONS) { if (sscanf(arg, "%zd", &arg_iterations) != 1 || arg_iterations < 0) { log_error("Bad arg for --iterations: %s\n", arg); exit(EXIT_FAILURE); } num_iterations = (size_t)arg_iterations; + } else if (kwarg == ADDRESS_PARSER_TRAIN_ARG_MIN_UPDATES) { + if (sscanf(arg, "%llu", &arg_min_updates) != 1) { + log_error("Bad arg for --min-updates: %s\n", arg); + exit(EXIT_FAILURE); + } + min_updates = arg_min_updates; + log_info("min_updates = %llu\n", min_updates); } else if (position == 0) { filename = arg; position++; @@ -1154,7 +1169,7 @@ int main(int argc, char **argv) { log_info("Finished initialization\n"); - if (!address_parser_train(parser, filename, num_iterations)) { + if (!address_parser_train(parser, filename, num_iterations, min_updates)) { log_error("Error in training\n"); exit(EXIT_FAILURE); } diff --git a/src/averaged_perceptron_trainer.c b/src/averaged_perceptron_trainer.c index f0c3406c..a82c3721 100644 --- a/src/averaged_perceptron_trainer.c +++ b/src/averaged_perceptron_trainer.c @@ -35,6 +35,10 @@ void averaged_perceptron_trainer_destroy(averaged_perceptron_trainer_t *self) { kh_destroy(feature_class_weights, self->weights); } + if (self->update_counts != NULL) { + uint64_array_destroy(self->update_counts); + } + if (self->scores != NULL) { double_array_destroy(self->scores); } @@ -107,6 +111,7 @@ bool averaged_perceptron_trainer_get_feature_id(averaged_perceptron_trainer_t *s kh_value(features, k) = new_id; *feature_id = new_id; + uint64_array_push(self->update_counts, 0); self->num_features++; return true; } @@ -117,34 +122,85 @@ bool averaged_perceptron_trainer_get_feature_id(averaged_perceptron_trainer_t *s averaged_perceptron_t *averaged_perceptron_trainer_finalize(averaged_perceptron_trainer_t *self) { if (self == NULL || self->num_classes == 0) return NULL; - sparse_matrix_t *averaged_weights = sparse_matrix_new(); - uint32_t class_id; class_weight_t weight; uint64_t updates = self->num_updates; khash_t(class_weights) *weights; - for (uint32_t feature_id = 0; feature_id < self->num_features; feature_id++) { - khiter_t k; + char **feature_keys = malloc(sizeof(char *) * self->num_features); + uint32_t feature_id; + const char *feature; + kh_foreach(self->features, feature, feature_id, { + if (feature_id >= self->num_features) { + free(feature_keys); + return NULL; + } + feature_keys[feature_id] = (char *)feature; + }) + + sparse_matrix_t *averaged_weights = sparse_matrix_new(); + + uint32_t next_feature_id = 0; + khiter_t k; + + uint64_t *update_counts = self->update_counts->a; + + log_info("Finalizing trainer, num_features=%u\n", self->num_features); + + log_info("Pruning weights with < min_updates = %llu\n", self->min_updates); + + for (feature_id = 0; feature_id < self->num_features; feature_id++) { k = kh_get(feature_class_weights, self->weights, feature_id); if (k == kh_end(self->weights)) { sparse_matrix_destroy(averaged_weights); + free(feature_keys); return NULL; } weights = kh_value(self->weights, k); uint32_t class_id; - kh_foreach(weights, class_id, weight, { - weight.total += (updates - weight.last_updated) * weight.value; - double value = weight.total / updates; - sparse_matrix_append(averaged_weights, class_id, value); - }) + uint64_t update_count = update_counts[feature_id]; + bool keep_feature = update_count >= self->min_updates; + + uint32_t new_feature_id = next_feature_id; + + if (keep_feature) { + kh_foreach(weights, class_id, weight, { + weight.total += (updates - weight.last_updated) * weight.value; + double value = weight.total / updates; + sparse_matrix_append(averaged_weights, class_id, value); + }) + + sparse_matrix_finalize_row(averaged_weights); + next_feature_id++; + } + + + if (!keep_feature || new_feature_id != feature_id) { + feature = feature_keys[feature_id]; + k = kh_get(str_uint32, self->features, feature); + if (k != kh_end(self->features)) { + if (keep_feature) { + kh_value(self->features, k) = new_feature_id; + } else { + kh_del(str_uint32, self->features, k); + } + } else { + log_error("Error in kh_get on self->features\n"); + averaged_perceptron_trainer_destroy(self); + return NULL; + } + } - sparse_matrix_finalize_row(averaged_weights); } + free(feature_keys); + + self->num_features = kh_size(self->features); + log_info("After pruning, num_features=%u\n", self->num_features); + averaged_perceptron_t *perceptron = malloc(sizeof(averaged_perceptron_t)); perceptron->weights = averaged_weights; @@ -231,6 +287,9 @@ static inline bool averaged_perceptron_trainer_update_feature(averaged_perceptro return false; } + uint64_t *update_counts = self->update_counts->a; + update_counts[feature_id]++; + return true; } @@ -320,7 +379,7 @@ bool averaged_perceptron_trainer_update_counts(averaged_perceptron_trainer_t *se return true; } -bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) { +bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) { // Keep two tags of history in training char *prev = NULL; char *prev2 = NULL; @@ -395,7 +454,7 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se } -averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) { +averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(uint64_t min_updates) { averaged_perceptron_trainer_t *self = calloc(1, sizeof(averaged_perceptron_trainer_t)); if (self == NULL) return NULL; @@ -406,6 +465,8 @@ averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) { self->num_errors = 0; self->iterations = 0; + self->min_updates = min_updates; + self->features = kh_init(str_uint32); if (self->features == NULL) { goto exit_trainer_created; @@ -427,7 +488,15 @@ averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) { goto exit_trainer_created; } + self->update_counts = uint64_array_new(); + if (self->update_counts == NULL) { + goto exit_trainer_created; + } + self->scores = double_array_new(); + if (self->scores == NULL) { + goto exit_trainer_created; + } return self; diff --git a/src/averaged_perceptron_trainer.h b/src/averaged_perceptron_trainer.h index a4c22e7b..2832e9aa 100644 --- a/src/averaged_perceptron_trainer.h +++ b/src/averaged_perceptron_trainer.h @@ -60,15 +60,17 @@ typedef struct averaged_perceptron_trainer { uint64_t num_updates; uint64_t num_errors; uint32_t iterations; + uint64_t min_updates; khash_t(str_uint32) *features; khash_t(str_uint32) *classes; cstring_array *class_strings; // {feature_id => {class_id => class_weight_t}} khash_t(feature_class_weights) *weights; + uint64_array *update_counts; double_array *scores; } averaged_perceptron_trainer_t; -averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void); +averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(uint64_t min_updates); uint32_t averaged_perceptron_trainer_predict(averaged_perceptron_trainer_t *self, cstring_array *features);