[parser] Use separate arrays for the features that require tag history, and make the tagger responsible for completing those features, so that the feature function no longer needs prev and prev2 passed in explicitly (i.e. the feature function does not need to be run multiple times when using global best-sequence prediction)

This commit is contained in:
Al
2017-02-19 14:21:32 -08:00
parent ae85e3c0a0
commit 8ea5405c20
8 changed files with 94 additions and 42 deletions

View File

@@ -406,6 +406,14 @@ void address_parser_context_destroy(address_parser_context_t *self) {
cstring_array_destroy(self->features); cstring_array_destroy(self->features);
} }
if (self->prev_tag_features != NULL) {
cstring_array_destroy(self->prev_tag_features);
}
if (self->prev2_tag_features != NULL) {
cstring_array_destroy(self->prev2_tag_features);
}
if (self->tokenized_str != NULL) { if (self->tokenized_str != NULL) {
tokenized_string_destroy(self->tokenized_str); tokenized_string_destroy(self->tokenized_str);
} }
@@ -558,6 +566,16 @@ address_parser_context_t *address_parser_context_new(void) {
goto exit_address_parser_context_allocated; goto exit_address_parser_context_allocated;
} }
context->prev_tag_features = cstring_array_new();
if (context->prev_tag_features == NULL) {
goto exit_address_parser_context_allocated;
}
context->prev2_tag_features = cstring_array_new();
if (context->prev2_tag_features == NULL) {
goto exit_address_parser_context_allocated;
}
context->tokenized_str = tokenized_string_new(); context->tokenized_str = tokenized_string_new();
if (context->tokenized_str == NULL) { if (context->tokenized_str == NULL) {
goto exit_address_parser_context_allocated; goto exit_address_parser_context_allocated;
@@ -999,13 +1017,15 @@ char *prev2: the predicted tag at index i - 2
*/ */
bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx, char *prev, char *prev2) { bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) {
if (self == NULL || ctx == NULL) return false; if (self == NULL || ctx == NULL) return false;
address_parser_t *parser = (address_parser_t *)self; address_parser_t *parser = (address_parser_t *)self;
address_parser_context_t *context = (address_parser_context_t *)ctx; address_parser_context_t *context = (address_parser_context_t *)ctx;
cstring_array *features = context->features; cstring_array *features = context->features;
cstring_array *prev_tag_features = context->prev_tag_features;
cstring_array *prev2_tag_features = context->prev2_tag_features;
char *language = context->language; char *language = context->language;
char *country = context->country; char *country = context->country;
@@ -1020,6 +1040,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
uint32_array *separators = context->separators; uint32_array *separators = context->separators;
cstring_array_clear(features); cstring_array_clear(features);
cstring_array_clear(prev_tag_features);
cstring_array_clear(prev2_tag_features);
token_array *tokens = tokenized->tokens; token_array *tokens = tokenized->tokens;
@@ -1366,7 +1388,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
if (idx == 0) { if (idx == 0) {
feature_array_add(features, 2, "first word", word); feature_array_add(features, 2, "first word", word);
//feature_array_add(features, 3, "prev tag=START+word+next word", word, next_word); //feature_array_add(features, 3, "first word+next word", word, next_word);
} }
} else if (component_phrase_string != NULL) { } else if (component_phrase_string != NULL) {
@@ -1375,16 +1397,15 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
word = phrase_string; word = phrase_string;
} }
if (prev != NULL && last_index == idx - 1) { if (last_index == idx - 1) {
// Previous tag and current word // Previous tag and current word
feature_array_add(features, 3, "prev tag+word", prev, word); feature_array_add(prev_tag_features, 2, "prev tag+word", word);
feature_array_add(features, 2, "prev tag", prev); feature_array_add(prev_tag_features, 1, "prev tag");
if (prev2 != NULL) {
// Previous two tags and current word // Previous two tags and current word
feature_array_add(features, 4, "prev2 tag+prev tag+word", prev2, prev, word); feature_array_add(prev2_tag_features, 2, "prev2 tag+prev tag+word", word);
feature_array_add(features, 3, "prev2 tag+prev tag", prev2, prev); feature_array_add(prev2_tag_features, 1, "prev2 tag+prev tag");
}
} }
if (last_index >= 0) { if (last_index >= 0) {
@@ -1405,7 +1426,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
if (last_index == idx - 1) { if (last_index == idx - 1) {
feature_array_add(features, 3, "prev tag+prev word", prev, prev_word); feature_array_add(prev_tag_features, 2, "prev tag+prev word", prev_word);
} }
// Previous word and current word // Previous word and current word
@@ -1542,19 +1563,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
} }
} }
if (parser->options.print_features) {
uint32_t fidx;
char *feature;
printf("{ ");
size_t num_features = cstring_array_num_strings(features);
cstring_array_foreach(context->features, fidx, feature, {
printf("%s", feature);
if (fidx < num_features - 1) printf(", ");
})
printf(" }\n");
}
return true; return true;
} }
@@ -1682,9 +1690,23 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c
char *prev_label = NULL; char *prev_label = NULL;
if (averaged_perceptron_tagger_predict(model, parser, context, context->features, token_labels, &address_parser_features, tokenized_str)) { if (averaged_perceptron_tagger_predict(model, parser, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, &address_parser_features, tokenized_str)) {
response = address_parser_response_new(); response = address_parser_response_new();
if (parser->options.print_features) {
uint32_t fidx;
char *feature;
printf("{ ");
size_t num_features = cstring_array_num_strings(context->features);
cstring_array_foreach(context->features, fidx, feature, {
printf("%s", feature);
if (fidx < num_features - 1) printf(", ");
})
printf(" }\n");
}
size_t num_strings = cstring_array_num_strings(tokenized_str->strings); size_t num_strings = cstring_array_num_strings(tokenized_str->strings);
cstring_array *labels = cstring_array_new_size(num_strings); cstring_array *labels = cstring_array_new_size(num_strings);

View File

@@ -129,6 +129,8 @@ typedef struct address_parser_context {
char *language; char *language;
char *country; char *country;
cstring_array *features; cstring_array *features;
cstring_array *prev_tag_features;
cstring_array *prev2_tag_features;
// Temporary strings used at each token during feature extraction // Temporary strings used at each token during feature extraction
char_array *phrase; char_array *phrase;
char_array *context_phrase; char_array *context_phrase;
@@ -211,7 +213,7 @@ void address_parser_context_destroy(address_parser_context_t *self);
void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country); void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country);
// Feature function // Feature function
bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, char *prev2); bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i);
// I/O methods // I/O methods

View File

@@ -80,7 +80,7 @@ bool address_parser_test(address_parser_t *parser, char *filename, address_parse
size_t starting_errors = result->num_errors; size_t starting_errors = result->num_errors;
if (averaged_perceptron_tagger_predict(parser->model, parser, context, context->features, token_labels, &address_parser_features, data_set->tokenized_str)) { if (averaged_perceptron_tagger_predict(parser->model, parser, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, &address_parser_features, data_set->tokenized_str)) {
uint32_t i; uint32_t i;
char *predicted; char *predicted;
cstring_array_foreach(token_labels, i, predicted, { cstring_array_foreach(token_labels, i, predicted, {

View File

@@ -4,6 +4,7 @@
#include "averaged_perceptron_trainer.h" #include "averaged_perceptron_trainer.h"
#include "collections.h" #include "collections.h"
#include "constants.h" #include "constants.h"
#include "cooccurrences.h"
#include "file_utils.h" #include "file_utils.h"
#include "geodb.h" #include "geodb.h"
#include "shuffle.h" #include "shuffle.h"
@@ -702,9 +703,13 @@ address_parser_t *address_parser_init(char *filename) {
} }
}) })
size_t hash_size;
const char *context_token;
bool sort_reverse = true;
log_info("Creating phrase_types trie\n"); log_info("Creating phrase_types trie\n");
bool sort_reverse = true; sort_reverse = true;
char **phrase_keys = str_uint32_hash_sort_keys_by_value(phrase_counts, sort_reverse); char **phrase_keys = str_uint32_hash_sort_keys_by_value(phrase_counts, sort_reverse);
if (phrase_keys == NULL) { if (phrase_keys == NULL) {
log_error("phrase_keys == NULL\n"); log_error("phrase_keys == NULL\n");
@@ -713,7 +718,7 @@ address_parser_t *address_parser_init(char *filename) {
goto exit_hashes_allocated; goto exit_hashes_allocated;
} }
size_t hash_size = kh_size(phrase_counts); hash_size = kh_size(phrase_counts);
address_parser_types_array *phrase_types_array = address_parser_types_array_new_size(hash_size); address_parser_types_array *phrase_types_array = address_parser_types_array_new_size(hash_size);
for (size_t idx = 0; idx < hash_size; idx++) { for (size_t idx = 0; idx < hash_size; idx++) {
@@ -828,7 +833,6 @@ address_parser_t *address_parser_init(char *filename) {
} }
khash_t(str_set) *context_phrases; khash_t(str_set) *context_phrases;
const char *context_token;
uint32_t postal_code_id; uint32_t postal_code_id;
uint32_t context_phrase_id; uint32_t context_phrase_id;
@@ -970,7 +974,7 @@ bool address_parser_train_epoch(address_parser_t *self, averaged_perceptron_trai
address_parser_context_fill(context, self, data_set->tokenized_str, language, country); address_parser_context_fill(context, self, data_set->tokenized_str, language, country);
bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, &address_parser_features, data_set->tokenized_str, data_set->labels); bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, context->prev_tag_features, context->prev2_tag_features, &address_parser_features, data_set->tokenized_str, data_set->labels);
if (!example_success) { if (!example_success) {
log_error("Error training example\n"); log_error("Error training example\n");

View File

@@ -2,11 +2,11 @@
#include "log/log.h" #include "log/log.h"
bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) { bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) {
// Keep two tags of history in training // Keep two tags of history in training
char *prev = START; char *prev = NULL;
char *prev2 = START2; char *prev2 = NULL;
uint32_t prev_id = 0; uint32_t prev_id = 0;
uint32_t prev2_id = 0; uint32_t prev2_id = 0;
@@ -22,17 +22,26 @@ bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagg
if (i > 1) { if (i > 1) {
prev2 = cstring_array_get_string(model->classes, prev2_id); prev2 = cstring_array_get_string(model->classes, prev2_id);
} else if (i == 1) {
prev2 = START;
} }
log_debug("prev=%s, prev2=%s\n", prev, prev2); log_debug("prev=%s, prev2=%s\n", prev, prev2);
if (!feature_function(tagger, context, tokenized, i, prev, prev2)) { if (!feature_function(tagger, context, tokenized, i)) {
log_error("Could not add address parser features\n"); log_error("Could not add address parser features\n");
return false; return false;
} }
uint32_t fidx;
const char *feature;
cstring_array_foreach(prev_tag_features, fidx, feature, {
feature_array_add(features, 2, (char *)feature, prev);
})
cstring_array_foreach(prev2_tag_features, fidx, feature, {
feature_array_add(features, 3, (char *)feature, prev2, prev);
})
uint32_t guess = averaged_perceptron_predict(model, features); uint32_t guess = averaged_perceptron_predict(model, features);
char *predicted = cstring_array_get_string(model->classes, guess); char *predicted = cstring_array_get_string(model->classes, guess);

View File

@@ -18,14 +18,15 @@ the current value.
#include <string.h> #include <string.h>
#include "averaged_perceptron.h" #include "averaged_perceptron.h"
#include "features.h"
#include "tokens.h" #include "tokens.h"
#define START "START" #define START "START"
#define START2 "START2" #define START2 "START2"
// Arguments: tagger, context, tokenized str, index, i-1 tag, i-2 tag // Arguments: tagger, context, tokenized str, index
typedef bool (*ap_tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t, char *, char *); typedef bool (*ap_tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t);
bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized); bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized);
#endif #endif

View File

@@ -320,7 +320,7 @@ bool averaged_perceptron_trainer_update_counts(averaged_perceptron_trainer_t *se
return true; return true;
} }
bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) { bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) {
// Keep two tags of history in training // Keep two tags of history in training
char *prev = START; char *prev = START;
char *prev2 = START2; char *prev2 = START2;
@@ -353,7 +353,7 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
prev2 = START; prev2 = START;
} }
if (!feature_function(tagger, context, tokenized, i, prev, prev2)) { if (!feature_function(tagger, context, tokenized, i)) {
log_error("Could not add address parser features\n"); log_error("Could not add address parser features\n");
return false; return false;
} }
@@ -365,6 +365,17 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
return false; return false;
} }
uint32_t fidx;
const char *feature;
cstring_array_foreach(prev_tag_features, fidx, feature, {
feature_array_add(features, 2, (char *)feature, prev);
})
cstring_array_foreach(prev2_tag_features, fidx, feature, {
feature_array_add(features, 3, (char *)feature, prev2, prev);
})
uint32_t guess = averaged_perceptron_trainer_predict(self, features); uint32_t guess = averaged_perceptron_trainer_predict(self, features);
// Online error-driven learning, only needs to update weights when it gets a wrong answer, making training fast // Online error-driven learning, only needs to update weights when it gets a wrong answer, making training fast

View File

@@ -36,6 +36,7 @@ Link: http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf
#include "averaged_perceptron.h" #include "averaged_perceptron.h"
#include "averaged_perceptron_tagger.h" #include "averaged_perceptron_tagger.h"
#include "collections.h" #include "collections.h"
#include "features.h"
#include "string_utils.h" #include "string_utils.h"
#include "tokens.h" #include "tokens.h"
#include "trie.h" #include "trie.h"
@@ -75,6 +76,8 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se
void *tagger, void *tagger,
void *context, void *context,
cstring_array *features, cstring_array *features,
cstring_array *prev_tag_features,
cstring_array *prev2_tag_features,
ap_tagger_feature_function feature_function, ap_tagger_feature_function feature_function,
tokenized_string_t *tokenized, tokenized_string_t *tokenized,
cstring_array *labels cstring_array *labels