/*
Several copies of the same training example are generated:

1. with only lowercasing
2. with simple Latin-ASCII normalization (no umlauts, only transforms
   common to all languages)
3. with basic UTF-8 normalization (accent stripping)
4. with language-specific Latin-ASCII transliteration (e.g. ü => ue in German)

This applies both to the initial passes when building the phrase gazetteers
and to each iteration of training, so that only the most basic
normalizations such as lowercasing need to be done at runtime.

Since the copies are created in a deterministic order, this may have a
small effect on randomization. However, it should not lead to cycles: the
base examples are shuffled, so the random-permutation requirement of an
online/stochastic learning algorithm is still satisfied.
*/
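
/*
Illustrative example (not from the source; outputs are approximate): for a
German input such as "Münsterstraße", the generated copies would be roughly:

  münsterstraße     (lowercased only)
  munsterstrasse    (simple Latin-ASCII)
  munsterstraße     (accent stripping; ß has no combining-mark decomposition)
  muensterstrasse   (German transliteration: ü => ue, ß => ss)
*/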
#include "address_parser_io.h"

address_parser_data_set_t *address_parser_data_set_init(char *filename) {
    address_parser_data_set_t *data_set = malloc(sizeof(address_parser_data_set_t));
    if (data_set == NULL) return NULL;

    data_set->f = fopen(filename, "r");
    if (data_set->f == NULL) {
        free(data_set);
        return NULL;
    }

    data_set->tokens = token_array_new();
    data_set->tokenized_str = NULL;
    data_set->normalizations = cstring_array_new();
    data_set->norm = 0;
    data_set->labels = cstring_array_new();
    data_set->separators = uint32_array_new();
    data_set->language = char_array_new_size(MAX_LANGUAGE_LEN);
    data_set->country = char_array_new_size(MAX_COUNTRY_CODE_LEN);

    return data_set;
}

bool address_parser_data_set_rewind(address_parser_data_set_t *self) {
    if (self == NULL || self->f == NULL) return false;

    return (fseek(self->f, 0, SEEK_SET) == 0);
}

bool address_parser_all_normalizations(cstring_array *strings, char *str, char *language) {
    if (strings == NULL) return false;

    // Lowercased-only version, always added first
    char *lowercased = normalize_string_utf8(str, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS);
    if (lowercased == NULL) {
        return false;
    }

    cstring_array_add_string(strings, lowercased);

    // Simple Latin-ASCII normalization, added only if it differs from the
    // lowercased version
    char *latin_normalized = normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN);
    if (latin_normalized != NULL) {
        if (!string_equals(latin_normalized, lowercased)) {
            cstring_array_add_string(strings, latin_normalized);
        }
        free(latin_normalized);
    }

    char *trans_name = NULL;
    char *transliterated = NULL;
    char *transliterated_utf8_normalized = NULL;

    // Language-specific transliterations (e.g. ü => ue in German), skipping
    // the generic Latin-ASCII transliterator handled above
    foreach_transliterator(SCRIPT_LATIN, language, trans_name, {
        if (!string_equals(trans_name, LATIN_ASCII)) {
            transliterated = transliterate(trans_name, str, strlen(str));
            if (transliterated != NULL) {
                transliterated_utf8_normalized = normalize_string_utf8(transliterated, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8);
                if (transliterated_utf8_normalized != NULL) {
                    if (!string_equals(transliterated_utf8_normalized, lowercased)) {
                        cstring_array_add_string(strings, transliterated_utf8_normalized);
                    }
                    free(transliterated_utf8_normalized);
                    transliterated_utf8_normalized = NULL;
                } else {
                    cstring_array_add_string(strings, transliterated);
                }

                free(transliterated);
                transliterated = NULL;
            }
        }
    })

    // Basic UTF-8 normalization (accent stripping), added only if it
    // differs from the lowercased version
    char *utf8_normalized = normalize_string_utf8(str, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8);
    if (utf8_normalized != NULL) {
        if (!string_equals(utf8_normalized, lowercased)) {
            cstring_array_add_string(strings, utf8_normalized);
        }
        free(utf8_normalized);
    }

    free(lowercased);

    return true;
}
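
/*
Minimal usage sketch (illustrative, not part of the build). It relies only
on the cstring_array API used above; printf is shown purely for
demonstration and assumes <stdio.h>:

    cstring_array *strings = cstring_array_new();
    if (address_parser_all_normalizations(strings, "Münsterstraße", "de")) {
        uint32_t i;
        char *norm;
        cstring_array_foreach(strings, i, norm, {
            printf("normalization %u: %s\n", i, norm);
        })
    }
    cstring_array_destroy(strings);
*/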

bool address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char *input) {
    token_array *tokens = self->tokens;
    uint32_array *separators = self->separators;
    cstring_array *labels = self->labels;

    size_t count = 0;

    token_t token;

    uint32_t i = 0;
    char *str = NULL;

    cstring_array *pairs = cstring_array_split_ignore_consecutive(input, " ", 1, &count);

    char *label = NULL;

    // First populate token array
    cstring_array_foreach(pairs, i, str, {
        char *last_separator = strrchr(str, (int)'/');

        if (last_separator == NULL) {
            log_error("All tokens must be delimited with '/'\n");
            cstring_array_destroy(pairs);
            return false;
        }

        uint32_t last_separator_index = last_separator - str;

        label = str + last_separator_index + 1;

        // Field-separator pseudo-tokens update the separator flags of the
        // previous token instead of adding a new token
        if (strcmp(label, FIELD_SEPARATOR_LABEL) == 0) {
            uint32_array_pop(separators);
            uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD | ADDRESS_SEPARATOR_FIELD_INTERNAL);
            continue;
        } else if (strcmp(label, SEPARATOR_LABEL) == 0) {
            uint32_array_pop(separators);
            uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
            continue;
        }

        token.offset = pairs->indices->a[i];
        size_t expected_len = last_separator_index;

        scanner_t scanner = scanner_from_string(input + token.offset, expected_len);
        token.type = scan_token(&scanner);
        token.len = scanner.cursor - scanner.start;

        if (token.len == expected_len) {
            if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
                uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
                continue;
            } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
                // shouldn't happen, but just in case
                continue;
            } else {
                uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
            }

            cstring_array_add_string(labels, label);

            token_array_push(tokens, token);
        } else {
            /* If normalizing the string turned one token into several, e.g. ½ => 1/2,
               add all the sub-tokens with offset = (token.offset + sub_token.offset)
               and the same label as the parent.
            */
            token_array *sub_tokens = token_array_new();
            if (sub_tokens == NULL) {
                log_error("Error allocating sub-token array\n");
                cstring_array_destroy(pairs);
                return false;
            }

            tokenize_add_tokens(sub_tokens, input + token.offset, expected_len, false);
            for (size_t j = 0; j < sub_tokens->n; j++) {
                token_t sub_token = sub_tokens->a[j];
                // Add the offset of the parent token
                sub_token.offset = token.offset + sub_token.offset;

                if (ADDRESS_PARSER_IS_SEPARATOR(sub_token.type)) {
                    uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
                    continue;
                } else if (ADDRESS_PARSER_IS_IGNORABLE(sub_token.type)) {
                    continue;
                } else {
                    uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
                }

                cstring_array_add_string(labels, label);
                token_array_push(tokens, sub_token);
            }

            token_array_destroy(sub_tokens);
        }
    })

    cstring_array_destroy(pairs);

    return true;
}
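
/*
Illustrative input line (hypothetical labels and values) in the format this
function parses: space-separated tokens, each suffixed with '/' and its label:

    123/house_number main/road st/road brooklyn/city ny/state

This yields tokens {123, main, st, brooklyn, ny} with a parallel label
array and per-token separator flags.
*/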

bool address_parser_data_set_next(address_parser_data_set_t *self) {
    if (self == NULL) return false;

    cstring_array *fields = NULL;

    // When the current line's normalizations are exhausted, read the next
    // line from the training file
    if (self->norm == 0 || self->norm >= cstring_array_num_strings(self->normalizations)) {
        char *line = file_getline(self->f);
        if (line == NULL) {
            return false;
        }

        size_t token_count;

        fields = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count);

        free(line);

        if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) {
            log_error("Token count did not match, expected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, token_count);
            cstring_array_destroy(fields);
            return false;
        }

        char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE);
        char *country = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_COUNTRY);
        char *address = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_ADDRESS);

        char_array_clear(self->country);
        char_array_add(self->country, country);

        char_array_clear(self->language);
        char_array_add(self->language, language);

        log_debug("Doing: %s\n", address);

        cstring_array_clear(self->normalizations);

        if (!address_parser_all_normalizations(self->normalizations, address, language) || cstring_array_num_strings(self->normalizations) == 0) {
            log_error("Error during string normalization\n");
            cstring_array_destroy(fields);
            return false;
        }

        self->norm = 0;
    }

    char *normalized = cstring_array_get_string(self->normalizations, self->norm);

    token_array *tokens = self->tokens;
    cstring_array *labels = self->labels;
    uint32_array *separators = self->separators;

    token_array_clear(tokens);
    cstring_array_clear(labels);
    uint32_array_clear(separators);

    tokenized_string_t *tokenized_str = NULL;

    if (address_parser_data_set_tokenize_line(self, normalized)) {
        // Add tokens as discrete strings for easier use in feature functions
        bool copy_tokens = true;
        tokenized_str = tokenized_string_from_tokens(normalized, self->tokens, copy_tokens);
    }

    // Note: the caller owns the previous tokenized_str and is expected to
    // destroy it before the next call; it is not freed here
    self->tokenized_str = tokenized_str;

    self->norm++;

    if (fields != NULL) {
        cstring_array_destroy(fields);
    }

    return tokenized_str != NULL;
}
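
/*
Illustrative training line (tab-separated; hypothetical values), matching
the ADDRESS_PARSER_FIELD_* layout read above (language, country, then the
labeled address consumed by address_parser_data_set_tokenize_line):

    de<TAB>de<TAB>münsterstraße/road 15/house_number
*/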

void address_parser_data_set_destroy(address_parser_data_set_t *self) {
    if (self == NULL) return;

    if (self->f != NULL) {
        fclose(self->f);
    }

    if (self->tokens != NULL) {
        token_array_destroy(self->tokens);
    }

    if (self->normalizations != NULL) {
        cstring_array_destroy(self->normalizations);
    }

    if (self->labels != NULL) {
        cstring_array_destroy(self->labels);
    }

    if (self->separators != NULL) {
        uint32_array_destroy(self->separators);
    }

    if (self->language != NULL) {
        char_array_destroy(self->language);
    }

    if (self->country != NULL) {
        char_array_destroy(self->country);
    }

    free(self);
}
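
/*
End-to-end usage sketch (illustrative, not part of the build). "train.tsv"
is a hypothetical filename, and the sketch assumes the caller destroys each
tokenized string via tokenized_string_destroy, since neither
address_parser_data_set_next nor address_parser_data_set_destroy frees it:

    address_parser_data_set_t *data_set = address_parser_data_set_init("train.tsv");
    if (data_set != NULL) {
        while (address_parser_data_set_next(data_set)) {
            tokenized_string_t *tokenized_str = data_set->tokenized_str;
            // ... consume tokenized_str, data_set->labels, data_set->separators ...
            tokenized_string_destroy(tokenized_str);
            data_set->tokenized_str = NULL;
        }
        address_parser_data_set_destroy(data_set);
    }
*/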