From 08f39d6b80e495bc3ff59ca3f816d216695a6667 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 28 Jul 2016 17:13:49 -0400 Subject: [PATCH] [parser] Adding address_parser_rewind to make multiple passes through the file when compiling the phrase tries --- src/address_parser_io.c | 40 +++++++++++++++++++++++----------------- src/address_parser_io.h | 5 +++-- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/address_parser_io.c b/src/address_parser_io.c index 3bbc7284..a09bf783 100644 --- a/src/address_parser_io.c +++ b/src/address_parser_io.c @@ -18,11 +18,16 @@ address_parser_data_set_t *address_parser_data_set_init(char *filename) { return data_set; } +bool address_parser_data_set_rewind(address_parser_data_set_t *self) { + if (self == NULL || self->f == NULL) return false; -bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set, char *input) { - token_array *tokens = data_set->tokens; - uint32_array *separators = data_set->separators; - cstring_array *labels = data_set->labels; + return (fseek(self->f, 0, SEEK_SET) == 0); +} + +bool address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char *input) { + token_array *tokens = self->tokens; + uint32_array *separators = self->separators; + cstring_array *labels = self->labels; size_t count = 0; @@ -122,10 +127,10 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set, -bool address_parser_data_set_next(address_parser_data_set_t *data_set) { - if (data_set == NULL) return false; +bool address_parser_data_set_next(address_parser_data_set_t *self) { + if (self == NULL) return false; - char *line = file_getline(data_set->f); + char *line = file_getline(self->f); if (line == NULL) { return false; } @@ -138,6 +143,7 @@ bool address_parser_data_set_next(address_parser_data_set_t *data_set) { if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) { log_error("Token count did not match, expected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, 
token_count); + return false; } char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE); @@ -155,30 +161,30 @@ bool address_parser_data_set_next(address_parser_data_set_t *data_set) { log_debug("Normalized: %s\n", normalized); - token_array *tokens = data_set->tokens; - cstring_array *labels = data_set->labels; - uint32_array *separators = data_set->separators; + token_array *tokens = self->tokens; + cstring_array *labels = self->labels; + uint32_array *separators = self->separators; token_array_clear(tokens); cstring_array_clear(labels); uint32_array_clear(separators); size_t len = strlen(normalized); - char_array_clear(data_set->country); - char_array_add(data_set->country, country); + char_array_clear(self->country); + char_array_add(self->country, country); - char_array_clear(data_set->language); - char_array_add(data_set->language, language); + char_array_clear(self->language); + char_array_add(self->language, language); tokenized_string_t *tokenized_str = NULL; - if (address_parser_data_set_tokenize_line(data_set, normalized)) { + if (address_parser_data_set_tokenize_line(self, normalized)) { // Add tokens as discrete strings for easier use in feature functions bool copy_tokens = true; - tokenized_str = tokenized_string_from_tokens(normalized, data_set->tokens, copy_tokens); + tokenized_str = tokenized_string_from_tokens(normalized, self->tokens, copy_tokens); } - data_set->tokenized_str = tokenized_str; + self->tokenized_str = tokenized_str; free(normalized); cstring_array_destroy(fields); diff --git a/src/address_parser_io.h b/src/address_parser_io.h index 46cab33d..745107e3 100644 --- a/src/address_parser_io.h +++ b/src/address_parser_io.h @@ -33,8 +33,9 @@ typedef struct address_parser_data_set { address_parser_data_set_t *address_parser_data_set_init(char *filename); -bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_ser, char *input); -bool address_parser_data_set_next(address_parser_data_set_t 
*data_set); +bool address_parser_data_set_rewind(address_parser_data_set_t *self); +bool address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char *input); +bool address_parser_data_set_next(address_parser_data_set_t *self); void address_parser_data_set_destroy(address_parser_data_set_t *self); #endif \ No newline at end of file