[parser] Use separate arrays for features requiring tag history and make the tagger responsible for those features, so the feature function does not need prev and prev2 passed in explicitly (i.e. the feature function does not have to be run multiple times when using global best-sequence prediction)

Al committed 2017-02-19 14:21:32 -08:00
parent ae85e3c0a0
commit 8ea5405c20
8 changed files with 94 additions and 42 deletions
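The commit message above describes moving tag-history features out of the per-token feature function. As a rough, standalone illustration of why that helps with global best-sequence prediction (a toy model, not libpostal's actual API; every type, field, and feature string below is hypothetical), the sketch extracts token-level features once and keeps history-dependent features as templates that a decoder can expand against any (prev, prev2) hypothesis without re-running feature extraction:

/*
 * Toy model of the split (not libpostal's real API; all names are hypothetical).
 * Token-level features are extracted once; history-dependent features are kept
 * as templates that are expanded with whatever (prev, prev2) tags the decoder
 * is currently considering, so feature extraction never has to be re-run.
 */
#include <stdio.h>

#define MAX_FEATURES 16
#define FEATURE_LEN 64

typedef struct {
    char base[MAX_FEATURES][FEATURE_LEN];            /* depend only on the token            */
    size_t num_base;
    char prev_templates[MAX_FEATURES][FEATURE_LEN];  /* expanded as "<template>|<prev>"     */
    size_t num_prev;
    char prev2_templates[MAX_FEATURES][FEATURE_LEN]; /* expanded as "<tmpl>|<prev2>|<prev>" */
    size_t num_prev2;
} token_features_t;

/* Runs once per token: note there are no prev/prev2 arguments. */
static void extract_features(const char *token, token_features_t *f) {
    f->num_base = f->num_prev = f->num_prev2 = 0;
    snprintf(f->base[f->num_base++], FEATURE_LEN, "word=%s", token);
    snprintf(f->prev_templates[f->num_prev++], FEATURE_LEN, "word=%s prev tag", token);
    snprintf(f->prev2_templates[f->num_prev2++], FEATURE_LEN, "prev two tags");
}

int main(void) {
    token_features_t f;
    extract_features("Broadway", &f);

    /* A best-sequence decoder can now try several tag-history hypotheses
     * by expanding the templates, without recomputing the base features. */
    const char *prev2 = "START";
    const char *prev_hypotheses[] = {"road", "house_number"};
    for (size_t h = 0; h < 2; h++) {
        for (size_t i = 0; i < f.num_prev; i++)
            printf("%s|%s\n", f.prev_templates[i], prev_hypotheses[h]);
        for (size_t i = 0; i < f.num_prev2; i++)
            printf("%s|%s|%s\n", f.prev2_templates[i], prev2, prev_hypotheses[h]);
    }
    return 0;
}

With the old signature, prev and prev2 were arguments to the feature function itself, so trying a different tag history meant recomputing all of a token's features; with the split arrays, only the cheap template expansion is repeated per hypothesis.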


@@ -320,7 +320,7 @@ bool averaged_perceptron_trainer_update_counts(averaged_perceptron_trainer_t *se
     return true;
 }
 
-bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) {
+bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) {
     // Keep two tags of history in training
     char *prev = START;
     char *prev2 = START2;
@@ -353,7 +353,7 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
             prev2 = START;
         }
 
-        if (!feature_function(tagger, context, tokenized, i, prev, prev2)) {
+        if (!feature_function(tagger, context, tokenized, i)) {
             log_error("Could not add address parser features\n");
             return false;
         }
@@ -365,6 +365,17 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
             return false;
         }
 
+        uint32_t fidx;
+        const char *feature;
+
+        cstring_array_foreach(prev_tag_features, fidx, feature, {
+            feature_array_add(features, 2, (char *)feature, prev);
+        })
+
+        cstring_array_foreach(prev2_tag_features, fidx, feature, {
+            feature_array_add(features, 3, (char *)feature, prev2, prev);
+        })
+
         uint32_t guess = averaged_perceptron_trainer_predict(self, features);
         // Online error-driven learning, only needs to update weights when it gets a wrong answer, making training fast
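For context on the "online error-driven learning" comment at the end of the hunk: an averaged perceptron only adjusts weights when its prediction is wrong, rewarding the features of the true class and penalizing those of the guessed class, while keeping running totals so the final weights are averaged over training. The sketch below is a minimal, self-contained version of that update rule using dense arrays; it is not libpostal's implementation (which uses sparse, lazily averaged weights), and all names in it are hypothetical:

/*
 * Minimal sketch of error-driven averaged perceptron training with dense
 * arrays (not libpostal's implementation; all names are hypothetical).
 */
#include <stddef.h>

#define NUM_FEATURES 256
#define NUM_CLASSES  20

typedef struct {
    double weights[NUM_FEATURES][NUM_CLASSES];
    double totals[NUM_FEATURES][NUM_CLASSES];  /* summed after every example, for averaging */
    unsigned long examples;
} perceptron_t;

/* Score each class as the sum of its weights over the active features. */
static size_t perceptron_predict(const perceptron_t *p, const size_t *feature_ids, size_t n) {
    size_t best_class = 0;
    double best_score = 0.0;
    for (size_t c = 0; c < NUM_CLASSES; c++) {
        double score = 0.0;
        for (size_t i = 0; i < n; i++)
            score += p->weights[feature_ids[i]][c];
        if (c == 0 || score > best_score) {
            best_score = score;
            best_class = c;
        }
    }
    return best_class;
}

/* One training example: weights change only when the guess is wrong. */
static void perceptron_train_one(perceptron_t *p, const size_t *feature_ids, size_t n, size_t truth) {
    size_t guess = perceptron_predict(p, feature_ids, n);
    if (guess != truth) {
        for (size_t i = 0; i < n; i++) {
            p->weights[feature_ids[i]][truth] += 1.0;
            p->weights[feature_ids[i]][guess] -= 1.0;
        }
    }
    /* Accumulate for averaging; real implementations do this lazily. */
    for (size_t f = 0; f < NUM_FEATURES; f++) {
        for (size_t c = 0; c < NUM_CLASSES; c++) {
            p->totals[f][c] += p->weights[f][c];
        }
    }
    p->examples++;
}

int main(void) {
    static perceptron_t p;                 /* zero-initialized */
    size_t features[] = {3, 17, 42};
    perceptron_train_one(&p, features, 3, 5 /* true class id */);
    return 0;
}

The averaged weights (totals[f][c] / examples) are what the final tagger would use; averaging smooths out the noisy per-example updates.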