[parser] Adding a polymorphic (as much as C does polymorphism) model type for the parser, allowing it to handle either the greedy averaged perceptron or a CRF. During training, saving, and loading, a parser trained with a CRF uses a different filename, a scheme that remains backward-compatible with models previously trained in parser-data. Making the necessary modifications to address_parser.c, address_parser_train.c, and address_parser_test.c. Also adding an option in address_parser_test to print individual errors in addition to the confusion matrix.

This commit is contained in:
Al
2017-03-10 19:19:40 -05:00
parent 1bd4689c5f
commit 8deb1716cb
4 changed files with 281 additions and 61 deletions

View File

@@ -46,11 +46,13 @@ with the general error-driven averaged perceptron.
#include <stdint.h>
#include <stdbool.h>
#include "averaged_perceptron.h"
#include "averaged_perceptron_tagger.h"
#include "libpostal.h"
#include "libpostal_config.h"
#include "averaged_perceptron.h"
#include "averaged_perceptron_tagger.h"
#include "collections.h"
#include "crf.h"
#include "normalize.h"
#include "string_utils.h"
@@ -178,6 +180,11 @@ typedef union postal_code_context_value {
#define POSTAL_CODE_CONTEXT(pc, ad) ((postal_code_context_value_t){.postcode = (pc), .admin = (ad) })
/* Tag identifying which kind of model the address parser is using. */
typedef enum address_parser_model_type {
    ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON = 0, /* greedy averaged perceptron */
    ADDRESS_PARSER_TYPE_CRF = 1                         /* conditional random field (CRF) */
} address_parser_model_type_t;
typedef struct parser_options {
uint64_t rare_word_threshold;
bool print_features;
@@ -187,7 +194,11 @@ typedef struct parser_options {
typedef struct address_parser {
parser_options_t options;
size_t num_classes;
averaged_perceptron_t *model;
address_parser_model_type_t model_type;
union {
averaged_perceptron_t *ap;
crf_t *crf;
} model;
trie_t *vocab;
trie_t *phrases;
address_parser_types_array *phrase_types;
@@ -208,6 +219,8 @@ void address_parser_destroy(address_parser_t *self);
/* Presumably returns a normalized form of str.
   NOTE(review): ownership of the returned pointer is not visible here — confirm against the definition. */
char *address_parser_normalize_string(char *str);
/* Presumably normalizes the given token of str and stores the result in array.
   NOTE(review): confirm against the definition. */
void address_parser_normalize_token(cstring_array *array, char *str, token_t token);
/* Prediction entry point: takes the parser, a context, an output array for token labels,
   a feature function and the tokenized input; returns a bool.
   NOTE(review): presumably the bool signals success and the call dispatches on the
   parser's model type (perceptron vs. CRF) — confirm against the definition. */
bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str);
/* Creates a new parser context. NOTE(review): presumably heap-allocated and paired with
   address_parser_context_destroy — confirm. */
address_parser_context_t *address_parser_context_new(void);
/* Destroys a parser context. */
void address_parser_context_destroy(address_parser_context_t *self);