[parser] Adding address_parser_rewind to make multiple passes through the file when compiling the phrase tries
This commit is contained in:
@@ -18,11 +18,16 @@ address_parser_data_set_t *address_parser_data_set_init(char *filename) {
|
|||||||
return data_set;
|
return data_set;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool address_parser_data_set_rewind(address_parser_data_set_t *self) {
|
||||||
|
if (self == NULL || self->f == NULL) return false;
|
||||||
|
|
||||||
bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set, char *input) {
|
return (fseek(self->f, 0, SEEK_SET) == 0);
|
||||||
token_array *tokens = data_set->tokens;
|
}
|
||||||
uint32_array *separators = data_set->separators;
|
|
||||||
cstring_array *labels = data_set->labels;
|
bool address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char *input) {
|
||||||
|
token_array *tokens = self->tokens;
|
||||||
|
uint32_array *separators = self->separators;
|
||||||
|
cstring_array *labels = self->labels;
|
||||||
|
|
||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
|
|
||||||
@@ -122,10 +127,10 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
bool address_parser_data_set_next(address_parser_data_set_t *data_set) {
|
bool address_parser_data_set_next(address_parser_data_set_t *self) {
|
||||||
if (data_set == NULL) return false;
|
if (self == NULL) return false;
|
||||||
|
|
||||||
char *line = file_getline(data_set->f);
|
char *line = file_getline(self->f);
|
||||||
if (line == NULL) {
|
if (line == NULL) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -138,6 +143,7 @@ bool address_parser_data_set_next(address_parser_data_set_t *data_set) {
|
|||||||
|
|
||||||
if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) {
|
if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) {
|
||||||
log_error("Token count did not match, expected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, token_count);
|
log_error("Token count did not match, expected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, token_count);
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE);
|
char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE);
|
||||||
@@ -155,30 +161,30 @@ bool address_parser_data_set_next(address_parser_data_set_t *data_set) {
|
|||||||
|
|
||||||
log_debug("Normalized: %s\n", normalized);
|
log_debug("Normalized: %s\n", normalized);
|
||||||
|
|
||||||
token_array *tokens = data_set->tokens;
|
token_array *tokens = self->tokens;
|
||||||
cstring_array *labels = data_set->labels;
|
cstring_array *labels = self->labels;
|
||||||
uint32_array *separators = data_set->separators;
|
uint32_array *separators = self->separators;
|
||||||
|
|
||||||
token_array_clear(tokens);
|
token_array_clear(tokens);
|
||||||
cstring_array_clear(labels);
|
cstring_array_clear(labels);
|
||||||
uint32_array_clear(separators);
|
uint32_array_clear(separators);
|
||||||
size_t len = strlen(normalized);
|
size_t len = strlen(normalized);
|
||||||
|
|
||||||
char_array_clear(data_set->country);
|
char_array_clear(self->country);
|
||||||
char_array_add(data_set->country, country);
|
char_array_add(self->country, country);
|
||||||
|
|
||||||
char_array_clear(data_set->language);
|
char_array_clear(self->language);
|
||||||
char_array_add(data_set->language, language);
|
char_array_add(self->language, language);
|
||||||
|
|
||||||
tokenized_string_t *tokenized_str = NULL;
|
tokenized_string_t *tokenized_str = NULL;
|
||||||
|
|
||||||
if (address_parser_data_set_tokenize_line(data_set, normalized)) {
|
if (address_parser_data_set_tokenize_line(self, normalized)) {
|
||||||
// Add tokens as discrete strings for easier use in feature functions
|
// Add tokens as discrete strings for easier use in feature functions
|
||||||
bool copy_tokens = true;
|
bool copy_tokens = true;
|
||||||
tokenized_str = tokenized_string_from_tokens(normalized, data_set->tokens, copy_tokens);
|
tokenized_str = tokenized_string_from_tokens(normalized, self->tokens, copy_tokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
data_set->tokenized_str = tokenized_str;
|
self->tokenized_str = tokenized_str;
|
||||||
|
|
||||||
free(normalized);
|
free(normalized);
|
||||||
cstring_array_destroy(fields);
|
cstring_array_destroy(fields);
|
||||||
|
|||||||
@@ -33,8 +33,9 @@ typedef struct address_parser_data_set {
|
|||||||
|
|
||||||
|
|
||||||
address_parser_data_set_t *address_parser_data_set_init(char *filename);
|
address_parser_data_set_t *address_parser_data_set_init(char *filename);
|
||||||
bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_ser, char *input);
|
bool address_parser_data_set_rewind(address_parser_data_set_t *self);
|
||||||
bool address_parser_data_set_next(address_parser_data_set_t *data_set);
|
bool address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char *input);
|
||||||
|
bool address_parser_data_set_next(address_parser_data_set_t *self);
|
||||||
void address_parser_data_set_destroy(address_parser_data_set_t *self);
|
void address_parser_data_set_destroy(address_parser_data_set_t *self);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
Reference in New Issue
Block a user