diff --git a/src/address_parser.c b/src/address_parser.c index be12c362..608dc33c 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -406,6 +406,14 @@ void address_parser_context_destroy(address_parser_context_t *self) { cstring_array_destroy(self->features); } + if (self->prev_tag_features != NULL) { + cstring_array_destroy(self->prev_tag_features); + } + + if (self->prev2_tag_features != NULL) { + cstring_array_destroy(self->prev2_tag_features); + } + if (self->tokenized_str != NULL) { tokenized_string_destroy(self->tokenized_str); } @@ -558,6 +566,16 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } + context->prev_tag_features = cstring_array_new(); + if (context->prev_tag_features == NULL) { + goto exit_address_parser_context_allocated; + } + + context->prev2_tag_features = cstring_array_new(); + if (context->prev2_tag_features == NULL) { + goto exit_address_parser_context_allocated; + } + context->tokenized_str = tokenized_string_new(); if (context->tokenized_str == NULL) { goto exit_address_parser_context_allocated; @@ -999,13 +1017,15 @@ char *prev2: the predicted tag at index i - 2 */ -bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx, char *prev, char *prev2) { +bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) { if (self == NULL || ctx == NULL) return false; address_parser_t *parser = (address_parser_t *)self; address_parser_context_t *context = (address_parser_context_t *)ctx; cstring_array *features = context->features; + cstring_array *prev_tag_features = context->prev_tag_features; + cstring_array *prev2_tag_features = context->prev2_tag_features; char *language = context->language; char *country = context->country; @@ -1020,6 +1040,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize uint32_array *separators = context->separators; 
cstring_array_clear(features); + cstring_array_clear(prev_tag_features); + cstring_array_clear(prev2_tag_features); token_array *tokens = tokenized->tokens; @@ -1366,7 +1388,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (idx == 0) { feature_array_add(features, 2, "first word", word); - //feature_array_add(features, 3, "prev tag=START+word+next word", word, next_word); + //feature_array_add(features, 3, "first word+next word", word, next_word); } } else if (component_phrase_string != NULL) { @@ -1375,16 +1397,15 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize word = phrase_string; } - if (prev != NULL && last_index == idx - 1) { + if (last_index == idx - 1) { // Previous tag and current word - feature_array_add(features, 3, "prev tag+word", prev, word); - feature_array_add(features, 2, "prev tag", prev); + feature_array_add(prev_tag_features, 2, "prev tag+word", word); + feature_array_add(prev_tag_features, 1, "prev tag"); - if (prev2 != NULL) { - // Previous two tags and current word - feature_array_add(features, 4, "prev2 tag+prev tag+word", prev2, prev, word); - feature_array_add(features, 3, "prev2 tag+prev tag", prev2, prev); - } + + // Previous two tags and current word + feature_array_add(prev2_tag_features, 2, "prev2 tag+prev tag+word", word); + feature_array_add(prev2_tag_features, 1, "prev2 tag+prev tag"); } if (last_index >= 0) { @@ -1405,7 +1426,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (last_index == idx - 1) { - feature_array_add(features, 3, "prev tag+prev word", prev, prev_word); + feature_array_add(prev_tag_features, 2, "prev tag+prev word", prev_word); } // Previous word and current word @@ -1542,19 +1563,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } } - if (parser->options.print_features) { - uint32_t fidx; - char *feature; - - printf("{ "); - size_t num_features = 
cstring_array_num_strings(features); - cstring_array_foreach(context->features, fidx, feature, { - printf("%s", feature); - if (fidx < num_features - 1) printf(", "); - }) - printf(" }\n"); - } - return true; } @@ -1682,9 +1690,23 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c char *prev_label = NULL; - if (averaged_perceptron_tagger_predict(model, parser, context, context->features, token_labels, &address_parser_features, tokenized_str)) { + if (averaged_perceptron_tagger_predict(model, parser, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, &address_parser_features, tokenized_str)) { response = address_parser_response_new(); + if (parser->options.print_features) { + uint32_t fidx; + char *feature; + + printf("{ "); + size_t num_features = cstring_array_num_strings(context->features); + cstring_array_foreach(context->features, fidx, feature, { + printf("%s", feature); + if (fidx < num_features - 1) printf(", "); + }) + printf(" }\n"); + } + + size_t num_strings = cstring_array_num_strings(tokenized_str->strings); cstring_array *labels = cstring_array_new_size(num_strings); diff --git a/src/address_parser.h b/src/address_parser.h index 33e92a4a..b3df7837 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -129,6 +129,8 @@ typedef struct address_parser_context { char *language; char *country; cstring_array *features; + cstring_array *prev_tag_features; + cstring_array *prev2_tag_features; // Temporary strings used at each token during feature extraction char_array *phrase; char_array *context_phrase; @@ -211,7 +213,7 @@ void address_parser_context_destroy(address_parser_context_t *self); void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country); // Feature function -bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i, char *prev, 
char *prev2); +bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i); // I/O methods diff --git a/src/address_parser_test.c b/src/address_parser_test.c index 35cf1093..0a44ff40 100644 --- a/src/address_parser_test.c +++ b/src/address_parser_test.c @@ -80,7 +80,7 @@ bool address_parser_test(address_parser_t *parser, char *filename, address_parse size_t starting_errors = result->num_errors; - if (averaged_perceptron_tagger_predict(parser->model, parser, context, context->features, token_labels, &address_parser_features, data_set->tokenized_str)) { + if (averaged_perceptron_tagger_predict(parser->model, parser, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, &address_parser_features, data_set->tokenized_str)) { uint32_t i; char *predicted; cstring_array_foreach(token_labels, i, predicted, { diff --git a/src/address_parser_train.c b/src/address_parser_train.c index 371c2518..71fcbb20 100644 --- a/src/address_parser_train.c +++ b/src/address_parser_train.c @@ -4,6 +4,7 @@ #include "averaged_perceptron_trainer.h" #include "collections.h" #include "constants.h" +#include "cooccurrences.h" #include "file_utils.h" #include "geodb.h" #include "shuffle.h" @@ -702,9 +703,13 @@ address_parser_t *address_parser_init(char *filename) { } }) + size_t hash_size; + const char *context_token; + bool sort_reverse = true; + log_info("Creating phrase_types trie\n"); - bool sort_reverse = true; + sort_reverse = true; char **phrase_keys = str_uint32_hash_sort_keys_by_value(phrase_counts, sort_reverse); if (phrase_keys == NULL) { log_error("phrase_keys == NULL\n"); @@ -713,7 +718,7 @@ address_parser_t *address_parser_init(char *filename) { goto exit_hashes_allocated; } - size_t hash_size = kh_size(phrase_counts); + hash_size = kh_size(phrase_counts); address_parser_types_array *phrase_types_array = address_parser_types_array_new_size(hash_size); for (size_t idx = 0; idx < hash_size; idx++) { @@ -828,7 
+833,6 @@ address_parser_t *address_parser_init(char *filename) { } khash_t(str_set) *context_phrases; - const char *context_token; uint32_t postal_code_id; uint32_t context_phrase_id; @@ -970,7 +974,7 @@ bool address_parser_train_epoch(address_parser_t *self, averaged_perceptron_trai address_parser_context_fill(context, self, data_set->tokenized_str, language, country); - bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, &address_parser_features, data_set->tokenized_str, data_set->labels); + bool example_success = averaged_perceptron_trainer_train_example(trainer, self, context, context->features, context->prev_tag_features, context->prev2_tag_features, &address_parser_features, data_set->tokenized_str, data_set->labels); if (!example_success) { log_error("Error training example\n"); diff --git a/src/averaged_perceptron_tagger.c b/src/averaged_perceptron_tagger.c index 83c7324a..781b64e0 100644 --- a/src/averaged_perceptron_tagger.c +++ b/src/averaged_perceptron_tagger.c @@ -2,11 +2,11 @@ #include "log/log.h" -bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) { +bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) { // Keep two tags of history in training char *prev = START; char *prev2 = START2; uint32_t prev_id = 0; uint32_t prev2_id = 0; @@ -22,17 +22,28 @@ bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagg if (i > 1) { prev2 = cstring_array_get_string(model->classes, prev2_id); } else if (i == 1) { prev2 = START; } 
log_debug("prev=%s, prev2=%s\n", prev, prev2); - if (!feature_function(tagger, context, tokenized, i, prev, prev2)) { + if (!feature_function(tagger, context, tokenized, i)) { log_error("Could not add address parser features\n"); return false; } + uint32_t fidx; + const char *feature; + + cstring_array_foreach(prev_tag_features, fidx, feature, { + feature_array_add(features, 2, (char *)feature, prev); + }) + + cstring_array_foreach(prev2_tag_features, fidx, feature, { + feature_array_add(features, 3, (char *)feature, prev2, prev); + }) + uint32_t guess = averaged_perceptron_predict(model, features); char *predicted = cstring_array_get_string(model->classes, guess); diff --git a/src/averaged_perceptron_tagger.h b/src/averaged_perceptron_tagger.h index bd9bcfe4..eae2e40a 100644 --- a/src/averaged_perceptron_tagger.h +++ b/src/averaged_perceptron_tagger.h @@ -18,14 +18,15 @@ the current value. #include #include "averaged_perceptron.h" +#include "features.h" #include "tokens.h" #define START "START" #define START2 "START2" -// Arguments: tagger, context, tokenized str, index, i-1 tag, i-2 tag -typedef bool (*ap_tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t, char *, char *); +// Arguments: tagger, context, tokenized str, index +typedef bool (*ap_tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t); -bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized); +bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized); #endif \ No newline at end of file diff --git a/src/averaged_perceptron_trainer.c b/src/averaged_perceptron_trainer.c 
index f2ad4352..aa89a68c 100644 --- a/src/averaged_perceptron_trainer.c +++ b/src/averaged_perceptron_trainer.c @@ -320,7 +320,7 @@ bool averaged_perceptron_trainer_update_counts(averaged_perceptron_trainer_t *se return true; } -bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) { +bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *self, void *tagger, void *context, cstring_array *features, cstring_array *prev_tag_features, cstring_array *prev2_tag_features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels) { // Keep two tags of history in training char *prev = START; char *prev2 = START2; @@ -353,7 +353,7 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se prev2 = START; } - if (!feature_function(tagger, context, tokenized, i, prev, prev2)) { + if (!feature_function(tagger, context, tokenized, i)) { log_error("Could not add address parser features\n"); return false; } @@ -365,6 +365,17 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se return false; } + uint32_t fidx; + const char *feature; + + cstring_array_foreach(prev_tag_features, fidx, feature, { + feature_array_add(features, 2, (char *)feature, prev); + }) + + cstring_array_foreach(prev2_tag_features, fidx, feature, { + feature_array_add(features, 3, (char *)feature, prev2, prev); + }) + uint32_t guess = averaged_perceptron_trainer_predict(self, features); // Online error-driven learning, only needs to update weights when it gets a wrong answer, making training fast diff --git a/src/averaged_perceptron_trainer.h b/src/averaged_perceptron_trainer.h index a100f3df..aad0488b 100644 --- a/src/averaged_perceptron_trainer.h +++ b/src/averaged_perceptron_trainer.h @@ -36,6 +36,7 @@ Link: 
http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf #include "averaged_perceptron.h" #include "averaged_perceptron_tagger.h" #include "collections.h" +#include "features.h" #include "string_utils.h" #include "tokens.h" #include "trie.h" @@ -75,6 +76,8 @@ bool averaged_perceptron_trainer_train_example(averaged_perceptron_trainer_t *se void *tagger, void *context, cstring_array *features, + cstring_array *prev_tag_features, + cstring_array *prev2_tag_features, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized, cstring_array *labels