diff --git a/src/averaged_perceptron_tagger.c b/src/averaged_perceptron_tagger.c
new file mode 100644
index 00000000..840a10d1
--- /dev/null
+++ b/src/averaged_perceptron_tagger.c
@@ -0,0 +1,62 @@
+#include "averaged_perceptron_tagger.h"
+
+/*
+averaged_perceptron_tagger_predict
+----------------------------------
+
+Greedily tags the tokens of `tokenized` from left to right, appending
+one predicted class name per token to `labels`.
+
+Before each prediction `features` is cleared and `feature_function` is
+called with the token index and the tags predicted for tokens i-1 and
+i-2 (START and START2 until that much history exists). The callback is
+presumably what fills `features` (via tagger/context); the model then
+scores `features` to pick the class for token i.
+
+Returns false as soon as `feature_function` fails, leaving any labels
+already appended in place; returns true otherwise.
+*/
+bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) {
+
+    // Two tags of history are fed to the feature function
+    char *prev = START;
+    char *prev2 = START2;
+
+    // Class ids (indices into model->classes) of the last two predictions
+    uint32_t prev_id = 0;
+    uint32_t prev2_id = 0;
+
+    size_t num_tokens = tokenized->tokens->n;
+
+    for (uint32_t i = 0; i < num_tokens; i++) {
+        cstring_array_clear(features);
+
+        // prev_id and prev2_id are class ids, so they must be resolved
+        // through model->classes. labels is appended to once per token,
+        // so its indices are not class ids.
+        if (i > 0) {
+            prev = cstring_array_get_string(model->classes, prev_id);
+        }
+
+        if (i > 1) {
+            prev2 = cstring_array_get_string(model->classes, prev2_id);
+        }
+
+        if (!feature_function(tagger, context, tokenized, i, prev, prev2)) {
+            log_error("Could not add address parser features\n");
+            return false;
+        }
+
+        uint32_t guess = averaged_perceptron_predict(model, features);
+        char *predicted = cstring_array_get_string(model->classes, guess);
+
+        cstring_array_add_string(labels, predicted);
+
+        prev2_id = prev_id;
+        prev_id = guess;
+
+    }
+
+    return true;
+
+}
diff --git a/src/averaged_perceptron_tagger.h b/src/averaged_perceptron_tagger.h
new file mode 100644
index 00000000..02a07413
--- /dev/null
+++ b/src/averaged_perceptron_tagger.h
@@ -0,0 +1,29 @@
+/*
+averaged_perceptron_tagger.h
+----------------------------
+
+An averaged perceptron tagger is a greedy sequence labeling
+algorithm which uses two tags of history.
+
+*/
+
+#ifndef AVERAGED_PERCEPTRON_TAGGER_H
+#define AVERAGED_PERCEPTRON_TAGGER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "averaged_perceptron.h"
+#include "tokens.h"
+
+#define START "START"    // i-1 tag before any tag has been predicted
+#define START2 "START2"  // i-2 tag before two tags have been predicted
+
+// Feature callback; returns false on failure. Arguments: tagger, context, tokenized str, index, i-1 tag, i-2 tag
+typedef bool (*ap_tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t, char *, char *);
+// Greedily tags each token, appending its predicted tag to labels; returns false if the feature callback fails
+bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized);
+
+#endif
\ No newline at end of file