[parser] Averaged perceptron tagger

This commit is contained in:
Al
2015-09-17 05:51:24 -04:00
parent 8a86f7ec64
commit 9e9131bda0
2 changed files with 73 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
#include "averaged_perceptron_tagger.h"
/*
Greedily label every token of `tokenized`, in order.

For each token the function clears `features`, calls `feature_function`
with the two previously predicted tags (START / START2 stand in while no
prediction exists yet), asks the model for a class id via
averaged_perceptron_predict(model, features), and appends that class's
string from model->classes to `labels`.

NOTE(review): `features` is not passed to `feature_function` directly;
presumably the callback fills it through `tagger`/`context` - confirm
against the callers.

Returns true once every token has been labeled. Returns false if a
required argument is NULL, if `feature_function` reports failure, or if
the predicted class id has no string in model->classes; labels appended
before the failure are left in `labels`.
*/
bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) {
    // Only the pointers dereferenced or called directly in this function are checked here
    if (model == NULL || feature_function == NULL || tokenized == NULL || tokenized->tokens == NULL) {
        log_error("Invalid arguments to averaged_perceptron_tagger_predict\n");
        return false;
    }

    // Two tags of history are fed back to the feature function during prediction
    char *prev = START;
    char *prev2 = START2;

    uint32_t prev_id = 0;
    uint32_t prev2_id = 0;

    size_t num_tokens = tokenized->tokens->n;

    for (uint32_t i = 0; i < num_tokens; i++) {
        cstring_array_clear(features);

        // prev_id / prev2_id are class ids produced by the model, so they have
        // to be resolved against model->classes. `labels` gets one entry per
        // token in token order, so a class id is not a valid index into it.
        if (i > 0) {
            prev = cstring_array_get_string(model->classes, prev_id);
        }

        if (i > 1) {
            prev2 = cstring_array_get_string(model->classes, prev2_id);
        }

        if (!feature_function(tagger, context, tokenized, i, prev, prev2)) {
            log_error("Could not add address parser features\n");
            return false;
        }

        uint32_t guess = averaged_perceptron_predict(model, features);
        char *predicted = cstring_array_get_string(model->classes, guess);

        // NOTE(review): assumes cstring_array_get_string yields NULL for an id
        // it cannot resolve - confirm; a NULL label must not be appended.
        if (predicted == NULL) {
            log_error("Predicted class id has no label\n");
            return false;
        }

        cstring_array_add_string(labels, predicted);

        // Shift the tag history by one position
        prev2_id = prev_id;
        prev_id = guess;
    }

    return true;
}

View File

@@ -0,0 +1,29 @@
/*
averaged_perceptron_tagger.h
----------------------------
An averaged perceptron tagger is a greedy sequence labeling
algorithm which uses two tags of history.
*/
#ifndef AVERAGED_PERCEPTRON_TAGGER_H
#define AVERAGED_PERCEPTRON_TAGGER_H
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include "averaged_perceptron.h"
#include "tokens.h"
// Placeholder tags handed to the feature function while no real tag history
// exists: START stands in for the i-1 tag at the first token, START2 stands in
// for the i-2 tag at the first two tokens.
#define START "START"
#define START2 "START2"
/*
Callback invoked once per token by averaged_perceptron_tagger_predict.

Arguments:
    tagger    - opaque pointer, forwarded unchanged from the predict call
    context   - opaque pointer, forwarded unchanged from the predict call
    tokenized - the tokenized string being tagged
    index     - index of the token to handle
    prev_tag  - tag predicted for token index-1 (START at the first token)
    prev2_tag - tag predicted for token index-2 (START2 at the first two tokens)

Returns true on success. Returning false makes the predict call stop and
return false.
*/
typedef bool (*ap_tagger_feature_function)(void *tagger, void *context, tokenized_string_t *tokenized, uint32_t index, char *prev_tag, char *prev2_tag);
/*
Greedily tag each token of `tokenized`, in order.

Per token: `features` is cleared, `feature_function` is called with
`tagger`, `context`, the token index and the two tags of history, the
model is queried with averaged_perceptron_predict(model, features), and
the predicted class string from model->classes is appended to `labels`.

NOTE(review): `features` is not passed to `feature_function` directly;
presumably the callback fills it through `tagger`/`context` - confirm.

Returns false if `feature_function` fails for any token; returns true
once every token has been labeled.
*/
bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized);
#endif