[parser] learning a sparser averaged perceptron model for the parser using the following method:

- store a vector of update counts for each feature in the model
- when the model updates after making a mistake, increment the update
  counters for the observed features in that example
- after the model is finished training, keep only the features that
  participated in a minimum number of updates

This method is described in greater detail in this paper from Yoav
Goldberg: https://www.cs.bgu.ac.il/~yoavg/publications/acl2011sparse.pdf

The authors there report a 4x size reduction at only a trivial cost in
terms of accuracy. So far the trials on libpostal indicate roughly the
same, though at lower training set sizes the accuracy cost is greater.

This method is more effective than simple feature pruning as feature
pruning methods are usually based on the frequency of the feature
in the training set, and infrequent features can still be important.
However, the perceptron's early iterations make many updates on
irrelevant features simply because the weights for the more relevant
features aren't tuned yet. The number of updates a feature participates
in can be seen as a measure of its relevance to classifying examples.

This commit introduces the --min-updates option to address_parser_train
(default=5); pruning can effectively be disabled by passing
"--min-updates 0" or "--min-updates 1".
This commit is contained in:
Al
2017-03-06 21:56:10 -05:00
parent 5c1c1ae0f2
commit 95015990ab
3 changed files with 104 additions and 18 deletions

View File

@@ -29,6 +29,7 @@ KHASH_MAP_INIT_STR(phrase_types, address_parser_types_t)
// Training // Training
#define DEFAULT_ITERATIONS 5 #define DEFAULT_ITERATIONS 5
#define DEFAULT_MIN_UPDATES 5
#define MIN_VOCAB_COUNT 5 #define MIN_VOCAB_COUNT 5
#define MIN_PHRASE_COUNT 1 #define MIN_PHRASE_COUNT 1
@@ -692,7 +693,6 @@ address_parser_t *address_parser_init(char *filename) {
parser->model = NULL; parser->model = NULL;
size_t num_classes = kh_size(class_counts); size_t num_classes = kh_size(class_counts);
log_info("num_classes = %zu\n", num_classes); log_info("num_classes = %zu\n", num_classes);
parser->num_classes = num_classes; parser->num_classes = num_classes;
@@ -1037,8 +1037,8 @@ exit_epoch_training_started:
return true; return true;
} }
bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_iterations) { bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_iterations, size_t min_updates) {
averaged_perceptron_trainer_t *trainer = averaged_perceptron_trainer_new(); averaged_perceptron_trainer_t *trainer = averaged_perceptron_trainer_new(min_updates);
for (uint32_t iter = 0; iter < num_iterations; iter++) { for (uint32_t iter = 0; iter < num_iterations; iter++) {
log_info("Doing epoch %d\n", iter); log_info("Doing epoch %d\n", iter);
@@ -1073,7 +1073,8 @@ bool address_parser_train(address_parser_t *self, char *filename, uint32_t num_i
typedef enum { typedef enum {
ADDRESS_PARSER_TRAIN_POSITIONAL_ARG, ADDRESS_PARSER_TRAIN_POSITIONAL_ARG,
ADDRESS_PARSER_TRAIN_ARG_ITERATIONS ADDRESS_PARSER_TRAIN_ARG_ITERATIONS,
ADDRESS_PARSER_TRAIN_ARG_MIN_UPDATES
} address_parser_train_keyword_arg_t; } address_parser_train_keyword_arg_t;
#define USAGE "Usage: ./address_parser_train filename output_dir [--iterations number]\n" #define USAGE "Usage: ./address_parser_train filename output_dir [--iterations number]\n"
@@ -1093,9 +1094,11 @@ int main(int argc, char **argv) {
address_parser_train_keyword_arg_t kwarg = ADDRESS_PARSER_TRAIN_POSITIONAL_ARG; address_parser_train_keyword_arg_t kwarg = ADDRESS_PARSER_TRAIN_POSITIONAL_ARG;
size_t num_iterations = DEFAULT_ITERATIONS; size_t num_iterations = DEFAULT_ITERATIONS;
uint64_t min_updates = DEFAULT_MIN_UPDATES;
size_t position = 0; size_t position = 0;
ssize_t arg_iterations; ssize_t arg_iterations;
uint64_t arg_min_updates;
char *filename = NULL; char *filename = NULL;
char *output_dir = NULL; char *output_dir = NULL;
@@ -1108,12 +1111,24 @@ int main(int argc, char **argv) {
continue; continue;
} }
if (string_equals(arg, "--min-updates")) {
kwarg = ADDRESS_PARSER_TRAIN_ARG_MIN_UPDATES;
continue;
}
if (kwarg == ADDRESS_PARSER_TRAIN_ARG_ITERATIONS) { if (kwarg == ADDRESS_PARSER_TRAIN_ARG_ITERATIONS) {
if (sscanf(arg, "%zd", &arg_iterations) != 1 || arg_iterations < 0) { if (sscanf(arg, "%zd", &arg_iterations) != 1 || arg_iterations < 0) {
log_error("Bad arg for --iterations: %s\n", arg); log_error("Bad arg for --iterations: %s\n", arg);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
num_iterations = (size_t)arg_iterations; num_iterations = (size_t)arg_iterations;
} else if (kwarg == ADDRESS_PARSER_TRAIN_ARG_MIN_UPDATES) {
if (sscanf(arg, "%llu", &arg_min_updates) != 1) {
log_error("Bad arg for --min-updates: %s\n", arg);
exit(EXIT_FAILURE);
}
min_updates = arg_min_updates;
log_info("min_updates = %llu\n", min_updates);
} else if (position == 0) { } else if (position == 0) {
filename = arg; filename = arg;
position++; position++;
@@ -1154,7 +1169,7 @@ int main(int argc, char **argv) {
log_info("Finished initialization\n"); log_info("Finished initialization\n");
if (!address_parser_train(parser, filename, num_iterations)) { if (!address_parser_train(parser, filename, num_iterations, min_updates)) {
log_error("Error in training\n"); log_error("Error in training\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }

View File

@@ -35,6 +35,10 @@ void averaged_perceptron_trainer_destroy(averaged_perceptron_trainer_t *self) {
kh_destroy(feature_class_weights, self->weights); kh_destroy(feature_class_weights, self->weights);
} }
if (self->update_counts != NULL) {
uint64_array_destroy(self->update_counts);
}
if (self->scores != NULL) { if (self->scores != NULL) {
double_array_destroy(self->scores); double_array_destroy(self->scores);
} }
@@ -107,6 +111,7 @@ bool averaged_perceptron_trainer_get_feature_id(averaged_perceptron_trainer_t *s
kh_value(features, k) = new_id; kh_value(features, k) = new_id;
*feature_id = new_id; *feature_id = new_id;
uint64_array_push(self->update_counts, 0);
self->num_features++; self->num_features++;
return true; return true;
} }
@@ -117,25 +122,51 @@ bool averaged_perceptron_trainer_get_feature_id(averaged_perceptron_trainer_t *s
averaged_perceptron_t *averaged_perceptron_trainer_finalize(averaged_perceptron_trainer_t *self) { averaged_perceptron_t *averaged_perceptron_trainer_finalize(averaged_perceptron_trainer_t *self) {
if (self == NULL || self->num_classes == 0) return NULL; if (self == NULL || self->num_classes == 0) return NULL;
sparse_matrix_t *averaged_weights = sparse_matrix_new();
uint32_t class_id; uint32_t class_id;
class_weight_t weight; class_weight_t weight;
uint64_t updates = self->num_updates; uint64_t updates = self->num_updates;
khash_t(class_weights) *weights; khash_t(class_weights) *weights;
for (uint32_t feature_id = 0; feature_id < self->num_features; feature_id++) { char **feature_keys = malloc(sizeof(char *) * self->num_features);
uint32_t feature_id;
const char *feature;
kh_foreach(self->features, feature, feature_id, {
if (feature_id >= self->num_features) {
free(feature_keys);
return NULL;
}
feature_keys[feature_id] = (char *)feature;
})
sparse_matrix_t *averaged_weights = sparse_matrix_new();
uint32_t next_feature_id = 0;
khiter_t k; khiter_t k;
uint64_t *update_counts = self->update_counts->a;
log_info("Finalizing trainer, num_features=%u\n", self->num_features);
log_info("Pruning weights with < min_updates = %llu\n", self->min_updates);
for (feature_id = 0; feature_id < self->num_features; feature_id++) {
k = kh_get(feature_class_weights, self->weights, feature_id); k = kh_get(feature_class_weights, self->weights, feature_id);
if (k == kh_end(self->weights)) { if (k == kh_end(self->weights)) {
sparse_matrix_destroy(averaged_weights); sparse_matrix_destroy(averaged_weights);
free(feature_keys);
return NULL; return NULL;
} }
weights = kh_value(self->weights, k); weights = kh_value(self->weights, k);
uint32_t class_id; uint32_t class_id;
uint64_t update_count = update_counts[feature_id];
bool keep_feature = update_count >= self->min_updates;
uint32_t new_feature_id = next_feature_id;
if (keep_feature) {
kh_foreach(weights, class_id, weight, { kh_foreach(weights, class_id, weight, {
weight.total += (updates - weight.last_updated) * weight.value; weight.total += (updates - weight.last_updated) * weight.value;
double value = weight.total / updates; double value = weight.total / updates;
@@ -143,8 +174,33 @@ averaged_perceptron_t *averaged_perceptron_trainer_finalize(averaged_perceptron_
}) })
sparse_matrix_finalize_row(averaged_weights); sparse_matrix_finalize_row(averaged_weights);
next_feature_id++;
} }
if (!keep_feature || new_feature_id != feature_id) {
feature = feature_keys[feature_id];
k = kh_get(str_uint32, self->features, feature);
if (k != kh_end(self->features)) {
if (keep_feature) {
kh_value(self->features, k) = new_feature_id;
} else {
kh_del(str_uint32, self->features, k);
}
} else {
log_error("Error in kh_get on self->features\n");
averaged_perceptron_trainer_destroy(self);
return NULL;
}
}
}
free(feature_keys);
self->num_features = kh_size(self->features);
log_info("After pruning, num_features=%u\n", self->num_features);
averaged_perceptron_t *perceptron = malloc(sizeof(averaged_perceptron_t)); averaged_perceptron_t *perceptron = malloc(sizeof(averaged_perceptron_t));
perceptron->weights = averaged_weights; perceptron->weights = averaged_weights;
@@ -231,6 +287,9 @@ static inline bool averaged_perceptron_trainer_update_feature(averaged_perceptro
return false; return false;
} }
uint64_t *update_counts = self->update_counts->a;
update_counts[feature_id]++;
return true; return true;
} }
@@ -320,7 +379,7 @@ bool averaged_perceptron_trainer_update_counts(averaged_perceptron_trainer_t *se
return true; return true;
} }
bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) { bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) {
// Keep two tags of history in training // Keep two tags of history in training
char *prev = NULL; char *prev = NULL;
char *prev2 = NULL; char *prev2 = NULL;
@@ -395,7 +454,7 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
} }
averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) { averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(uint64_t min_updates) {
averaged_perceptron_trainer_t *self = calloc(1, sizeof(averaged_perceptron_trainer_t)); averaged_perceptron_trainer_t *self = calloc(1, sizeof(averaged_perceptron_trainer_t));
if (self == NULL) return NULL; if (self == NULL) return NULL;
@@ -406,6 +465,8 @@ averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) {
self->num_errors = 0; self->num_errors = 0;
self->iterations = 0; self->iterations = 0;
self->min_updates = min_updates;
self->features = kh_init(str_uint32); self->features = kh_init(str_uint32);
if (self->features == NULL) { if (self->features == NULL) {
goto exit_trainer_created; goto exit_trainer_created;
@@ -427,7 +488,15 @@ averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void) {
goto exit_trainer_created; goto exit_trainer_created;
} }
self->update_counts = uint64_array_new();
if (self->update_counts == NULL) {
goto exit_trainer_created;
}
self->scores = double_array_new(); self->scores = double_array_new();
if (self->scores == NULL) {
goto exit_trainer_created;
}
return self; return self;

View File

@@ -60,15 +60,17 @@ typedef struct averaged_perceptron_trainer {
uint64_t num_updates; uint64_t num_updates;
uint64_t num_errors; uint64_t num_errors;
uint32_t iterations; uint32_t iterations;
uint64_t min_updates;
khash_t(str_uint32) *features; khash_t(str_uint32) *features;
khash_t(str_uint32) *classes; khash_t(str_uint32) *classes;
cstring_array *class_strings; cstring_array *class_strings;
// {feature_id => {class_id => class_weight_t}} // {feature_id => {class_id => class_weight_t}}
khash_t(feature_class_weights) *weights; khash_t(feature_class_weights) *weights;
uint64_array *update_counts;
double_array *scores; double_array *scores;
} averaged_perceptron_trainer_t; } averaged_perceptron_trainer_t;
averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(void); averaged_perceptron_trainer_t *averaged_perceptron_trainer_new(uint64_t min_updates);
uint32_t averaged_perceptron_trainer_predict(averaged_perceptron_trainer_t *self, cstring_array *features); uint32_t averaged_perceptron_trainer_predict(averaged_perceptron_trainer_t *self, cstring_array *features);