[parser] Averaged perceptron tagger
This commit is contained in:
44
src/averaged_perceptron_tagger.c
Normal file
44
src/averaged_perceptron_tagger.c
Normal file
@@ -0,0 +1,44 @@
|
||||
#include "averaged_perceptron_tagger.h"
|
||||
|
||||
|
||||
|
||||
/*
 * Greedily tag every token in `tokenized`, left to right.
 *
 * For each token index i:
 *   1. `features` is cleared,
 *   2. `feature_function` is called with the token index and the two
 *      previously predicted tags,
 *   3. `averaged_perceptron_predict` scores `features` and returns a class id,
 *   4. the class name for that id (from model->classes) is appended to `labels`.
 *
 * `labels` is only appended to, never cleared here, so on return it holds
 * one additional entry per token processed.
 *
 * NOTE(review): `feature_function` is not handed `features` directly;
 * presumably it reaches the same array through `tagger`/`context` -- confirm
 * against the caller.
 *
 * Returns true when every token was tagged. Returns false as soon as
 * `feature_function` fails; labels appended for earlier tokens are left in
 * place.
 */
bool averaged_perceptron_tagger_predict(averaged_perceptron_t *model, void *tagger, void *context, cstring_array *features, cstring_array *labels, ap_tagger_feature_function feature_function, tokenized_string_t *tokenized) {
    // Two tags of history are fed to the feature function. Until that many
    // predictions exist, the START / START2 placeholders stand in for them.
    char *prev = START;
    char *prev2 = START2;

    // Class ids (indices into model->classes) of the last two predictions.
    uint32_t prev_id = 0;
    uint32_t prev2_id = 0;

    size_t num_tokens = tokenized->tokens->n;

    for (uint32_t i = 0; i < num_tokens; i++) {
        cstring_array_clear(features);

        // prev_id / prev2_id are class ids, so they must be resolved through
        // model->classes. Indexing `labels` (one entry per token) with a class
        // id would fetch an unrelated token's label or read out of range.
        if (i > 0) {
            prev = cstring_array_get_string(model->classes, prev_id);
        }

        if (i > 1) {
            prev2 = cstring_array_get_string(model->classes, prev2_id);
        }

        if (!feature_function(tagger, context, tokenized, i, prev, prev2)) {
            log_error("Could not add address parser features\n");
            return false;
        }

        uint32_t guess = averaged_perceptron_predict(model, features);
        char *predicted = cstring_array_get_string(model->classes, guess);

        cstring_array_add_string(labels, predicted);

        // Shift the history window by one prediction.
        prev2_id = prev_id;
        prev_id = guess;
    }

    return true;
}
|
||||
29
src/averaged_perceptron_tagger.h
Normal file
29
src/averaged_perceptron_tagger.h
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
averaged_perceptron_tagger.h
|
||||
----------------------------
|
||||
|
||||
An averaged perceptron tagger is a greedy sequence labeling
|
||||
algorithm which uses two tags of history.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef AVERAGED_PERCEPTRON_TAGGER_H
|
||||
#define AVERAGED_PERCEPTRON_TAGGER_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "averaged_perceptron.h"
|
||||
#include "tokens.h"
|
||||
|
||||
// Placeholder passed as the previous (i-1) tag while tagging the first token,
// when no prediction exists yet.
#define START "START"
|
||||
// Placeholder passed as the second-previous (i-2) tag while tagging the first
// two tokens, when fewer than two predictions exist.
#define START2 "START2"
|
||||
|
||||
// Arguments: tagger, context, tokenized str, index, i-1 tag, i-2 tag
|
||||
typedef bool (*ap_tagger_feature_function)(void *, void *, tokenized_string_t *, uint32_t, char *, char *);
|
||||
|
||||
/*
 * Tag each token of `tokenized` in order, appending one predicted class name
 * per token to `labels`.
 *
 * `features` is cleared before every token and is the array scored by the
 * model. `feature_function` is called once per token with `tagger`, `context`,
 * the token index and the two previously predicted tags.
 *
 * Returns true on success, false if `feature_function` reports failure.
 */
bool averaged_perceptron_tagger_predict(
    averaged_perceptron_t *model,
    void *tagger,
    void *context,
    cstring_array *features,
    cstring_array *labels,
    ap_tagger_feature_function feature_function,
    tokenized_string_t *tokenized);
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user