Files
libpostal/src/crf_trainer_averaged_perceptron.h
Al 9afff5c9ed [parser/crf] adding an initial training algorithm for CRFs, the averaged
perceptron (FTW!)

Though it does not generate scores suitable for use as probabilities, and
might achieve slightly lower accuracy on some tasks than its
gradient-based counterparts like SGD (a possibility for libpostal)
or LBFGS (prohibitive on this much data), the averaged perceptron is
appealing for two reasons: speed and low memory usage i.e. we can still use
all the same tricks as in the greedy model like sparse construction of
the weight matrix. In this case we can go even sparser than in the
original because the state-transition features are separate from the
state features, and we need to be able to iterate over all of them
instead of simply creating new string keys in the feature space. The
solution to this is quite simple: we simply treat the weights for each
state-transition feature as if they have L * L output labels instead of
simply L. So instead of:

{
    "prev|road|word|DD": {1: 1.0, 2: -1.0}
    ...
}

We'd have:

{
    "word|DD": {(0, 1): 1.0, (0, 2): -1.0}
    ...
}

As usual we compress the features to a trie, and the weights to
compressed-sparse row (CSR) format sparse matrix after the weights have
been averaged. These representations are smaller, faster to load from
disk, and faster to use at runtime (contiguous arrays vs hashtables).

This also includes the min_updates variation from the greedy perceptron,
so features that participate in fewer than N updates are discarded at
the end (and also not used in scoring until they meet the threshold so
the model doesn't become dependent on features it doesn't really have).
This tends to discard irrelevant features, keeping the model small
without hurting accuracy much (within a tenth of a percent or so in my
tests on the greedy perceptron).
2017-03-10 01:28:31 -05:00

68 lines
2.4 KiB
C

#ifndef CRF_AVERAGED_PERCEPTRON_TRAINER_H
#define CRF_AVERAGED_PERCEPTRON_TRAINER_H
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

#include "averaged_perceptron_trainer.h"
#include "crf.h"
#include "crf_trainer.h"
#include "collections.h"
#include "string_utils.h"
#include "tokens.h"
#include "trie.h"
#include "trie_utils.h"
/*
 * Packs a (previous tag, current tag) pair into a single 64-bit value so a
 * tag bigram can be used directly as a khash int64 key (see the
 * prev_tag_class_weights map below).  Set the ids through the anonymous
 * struct members and read .value for the combined key.
 *
 * NOTE(review): which half of .value each id occupies depends on the
 * platform's endianness and bit-field layout, so the packed key is only
 * meaningful within a single process/architecture — do not serialize it.
 */
typedef union tag_bigram {
    uint64_t value;
    struct {
        uint32_t prev_class_id:32;  // tag assigned to the previous token
        uint32_t class_id:32;       // tag assigned to the current token
    };
} tag_bigram_t;
// {tag_bigram.value => class_weight_t}: weights keyed by a packed
// (prev_class_id, class_id) pair — this is how state-transition features get
// L * L output labels instead of L (class_weight_t comes from the averaged
// perceptron trainer header).
KHASH_MAP_INIT_INT64(prev_tag_class_weights, class_weight_t)
// {feature_id => {tag_bigram.value => class_weight_t}}: one transition-weight
// table per state-transition feature; the inner maps are owned by this map.
KHASH_MAP_INIT_INT(feature_prev_tag_class_weights, khash_t(prev_tag_class_weights) *)
/*
 * Mutable training state for the CRF averaged perceptron.  Weights are kept
 * in sparse hashtables during training and compressed (trie + CSR sparse
 * matrix) only at finalize time, after averaging.
 */
typedef struct crf_averaged_perceptron_trainer {
    crf_trainer_t *base_trainer;    // shared CRF trainer state (label set, feature ids — see crf_trainer.h)
    uint64_t num_updates;           // total perceptron updates performed, used as the averaging timestamp
    uint64_t num_errors;            // count of mispredictions seen during training
    uint32_t iterations;            // number of passes over the training data
    uint64_t min_updates;           // features with fewer updates than this are discarded at finalize
    // {feature_id => {class_id => class_weight_t}}
    khash_t(feature_class_weights) *weights;
    // {feature_id => {packed (prev_class_id, class_id) => class_weight_t}}
    // — weights for state-transition features (L * L outputs per feature)
    khash_t(feature_prev_tag_class_weights) *prev_tag_weights;
    // {packed (prev_class_id, class_id) => class_weight_t} — the bare
    // tag-to-tag transition weights, independent of any observed feature
    khash_t(prev_tag_class_weights) *trans_weights;
    uint64_array *update_counts;            // per-feature update counts, for the min_updates cutoff
    uint64_array *prev_tag_update_counts;   // same, for state-transition features
    // Per-sequence scratch: features for every token of the current example,
    // flattened, with CSR-style indptr arrays marking each token's slice.
    cstring_array *sequence_features;
    uint32_array *sequence_features_indptr;
    cstring_array *sequence_prev_tag_features;
    uint32_array *sequence_prev_tag_features_indptr;
    uint32_array *label_ids;        // gold label ids for the current sequence
    uint32_array *viterbi;          // predicted label ids for the current sequence — presumably Viterbi decode output; confirm in .c
} crf_averaged_perceptron_trainer_t;
/*
 * Allocates a new trainer for num_classes output labels.  Features seen in
 * fewer than min_updates updates are dropped when the model is finalized.
 * NOTE(review): presumably returns NULL on allocation failure — confirm
 * against the implementation.  Caller frees with
 * crf_averaged_perceptron_trainer_destroy.
 */
crf_averaged_perceptron_trainer_t *crf_averaged_perceptron_trainer_new(size_t num_classes, size_t min_updates);
/*
 * Predicts a label id for a single position given its features.
 * NOTE(review): exact decoding semantics (greedy vs. full Viterbi over the
 * sequence scratch arrays) are not visible from this header — see the .c.
 */
uint32_t crf_averaged_perceptron_trainer_predict(crf_averaged_perceptron_trainer_t *self, cstring_array *features);
/*
 * Runs one perceptron update on a single tokenized training example.
 *
 * tagger/context      opaque state forwarded to feature_function
 * features            scratch array for per-token state features
 * prev_tag_features   scratch array for state-transition features
 * feature_function    callback that extracts features for each token
 * tokenized           the tokenized input string
 * labels              gold-standard label strings, one per token
 *
 * Returns false on failure (e.g. label/token mismatch or OOM — confirm in .c).
 */
bool crf_averaged_perceptron_trainer_train_example(crf_averaged_perceptron_trainer_t *self,
                                                   void *tagger,
                                                   void *context,
                                                   cstring_array *features,
                                                   cstring_array *prev_tag_features,
                                                   tagger_feature_function feature_function,
                                                   tokenized_string_t *tokenized,
                                                   cstring_array *labels
                                                  );
/*
 * Averages the accumulated weights, drops features below min_updates, and
 * compresses the model (feature trie + CSR sparse weight matrices) into a
 * crf_t.  NOTE(review): this likely consumes/destroys the trainer — confirm
 * ownership in the implementation before calling destroy afterwards.
 */
crf_t *crf_averaged_perceptron_trainer_finalize(crf_averaged_perceptron_trainer_t *self);
/*
 * Frees the trainer and all hashtables/arrays it owns.
 */
void crf_averaged_perceptron_trainer_destroy(crf_averaged_perceptron_trainer_t *self);
#endif