[parsing] Initial commit of the address parser, training/testing, feature function, I/O

This commit is contained in:
Al
2015-11-30 14:48:13 -05:00
parent e62eb1e697
commit 89677d94a3
7 changed files with 1602 additions and 4 deletions

40
src/address_parser_io.h Normal file
View File

@@ -0,0 +1,40 @@
#ifndef ADDRESS_PARSER_IO_H
#define ADDRESS_PARSER_IO_H
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include "address_parser.h"
#include "collections.h"
#include "file_utils.h"
#include "scanner.h"
#include "string_utils.h"
/*
 * Special language-code string literals.
 * NOTE(review): judging only by the macro names, "xxx" presumably marks a
 * record whose language is ambiguous and "unk" one whose language is unknown;
 * how they are compared or emitted is not visible in this header -- confirm
 * against the callers.
 */
#define AMBIGUOUS_LANGUAGE "xxx"
#define UNKNOWN_LANGUAGE "unk"
/*
 * Positions of the fields in one training-data record, in the order
 * language, country, address.
 * NOTE(review): presumably used as column indices into a split input line --
 * confirm against the reader implementation.
 */
enum address_parser_training_data_fields {
    ADDRESS_PARSER_FIELD_LANGUAGE = 0,
    ADDRESS_PARSER_FIELD_COUNTRY = 1,
    ADDRESS_PARSER_FIELD_ADDRESS = 2,
    /* Trailing sentinel: equals the number of field positions listed above. */
    ADDRESS_PARSER_FILE_NUM_TOKENS = 3
};
/*
 * State bundle for reading an address-parser data set: one stdio stream plus
 * the collections and buffers associated with it.
 * NOTE(review): the per-member semantics below are inferred from member names
 * and types only; which members are refreshed per record and who owns them is
 * not visible in this header -- confirm against the implementation.
 */
struct address_parser_data_set {
    FILE *f;                            /* stdio stream for the data file */
    token_array *tokens;                /* token collection */
    tokenized_string_t *tokenized_str;  /* tokenized-string object */
    cstring_array *labels;              /* string array; presumably one label per token -- confirm */
    uint32_array *separators;           /* uint32 array; presumably per-token separator info -- confirm */
    char_array *language;               /* character buffer; presumably the record's language field -- confirm */
    char_array *country;                /* character buffer; presumably the record's country field -- confirm */
};

typedef struct address_parser_data_set address_parser_data_set_t;
/*
 * Creates a data set from the file named by `filename` and returns a pointer
 * to it.
 * NOTE(review): the definition is not in this header. Presumably it opens the
 * file, allocates the struct members, and returns NULL on failure; the result
 * is presumably released with address_parser_data_set_destroy() -- confirm.
 */
address_parser_data_set_t *address_parser_data_set_init(char *filename);
/*
 * Tokenizes one `input` string, taking `tokens`, `separators` and `labels`
 * as caller-supplied collections.
 * NOTE(review): presumably these three are outputs filled from `input`, and
 * the bool result signals success; whether `input` is modified is not visible
 * here -- confirm against the definition.
 */
bool address_parser_data_set_tokenize_line(char *input, token_array *tokens, uint32_array *separators, cstring_array *labels);
/*
 * Steps `data_set` forward.
 * NOTE(review): presumably reads the next record from data_set->f into the
 * struct members and returns false at end of input or on a malformed record
 * -- confirm against the definition.
 */
bool address_parser_data_set_next(address_parser_data_set_t *data_set);
/*
 * Tears down `self`.
 * NOTE(review): presumably closes the stream and frees the members and the
 * struct itself; NULL-safety is not visible here -- confirm.
 */
void address_parser_data_set_destroy(address_parser_data_set_t *self);
#endif