[parsing] Initial commit of the address parser, training/testing, feature function, I/O
This commit is contained in:
40
src/address_parser_io.h
Normal file
40
src/address_parser_io.h
Normal file
@@ -0,0 +1,40 @@
|
||||
#ifndef ADDRESS_PARSER_IO_H
|
||||
#define ADDRESS_PARSER_IO_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "address_parser.h"
|
||||
#include "collections.h"
|
||||
#include "file_utils.h"
|
||||
#include "scanner.h"
|
||||
#include "string_utils.h"
|
||||
|
||||
/*
 * Special language codes.
 * NOTE(review): presumably these are placeholder values that can appear in
 * the language field of a training example instead of a real language code;
 * confirm against the code that reads/compares them.
 */
#define AMBIGUOUS_LANGUAGE "xxx"  /* presumably: more than one language could apply */
#define UNKNOWN_LANGUAGE "unk"    /* presumably: language could not be determined */
|
||||
|
||||
/*
 * Indices of the fields that make up one record of address parser training
 * data. Values are assigned implicitly, starting at 0, in declaration order.
 * NOTE(review): presumably each input line is split into exactly these
 * fields, in this order; confirm against the reader implementation.
 */
enum address_parser_training_data_fields {
    ADDRESS_PARSER_FIELD_LANGUAGE,  /* 0: language field */
    ADDRESS_PARSER_FIELD_COUNTRY,   /* 1: country field */
    ADDRESS_PARSER_FIELD_ADDRESS,   /* 2: address field */
    /* Not a field: its value (3) equals the number of fields listed above. */
    ADDRESS_PARSER_FILE_NUM_TOKENS
};
|
||||
|
||||
typedef struct address_parser_data_set {
|
||||
FILE *f;
|
||||
token_array *tokens;
|
||||
tokenized_string_t *tokenized_str;
|
||||
cstring_array *labels;
|
||||
uint32_array *separators;
|
||||
char_array *language;
|
||||
char_array *country;
|
||||
} address_parser_data_set_t;
|
||||
|
||||
|
||||
address_parser_data_set_t *address_parser_data_set_init(char *filename);
|
||||
bool address_parser_data_set_tokenize_line(char *input, token_array *tokens, uint32_array *separators, cstring_array *labels);
|
||||
bool address_parser_data_set_next(address_parser_data_set_t *data_set);
|
||||
void address_parser_data_set_destroy(address_parser_data_set_t *self);
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user