From 521a094a472d52f414e7c3603e62ef8029222324 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 25 May 2016 15:25:34 -0400 Subject: [PATCH 1/8] [fix] Need to load transliteration module for Latin-ASCII normalization --- src/address_parser_train.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/address_parser_train.c b/src/address_parser_train.c index 7f0c6ee5..11b1cdee 100644 --- a/src/address_parser_train.c +++ b/src/address_parser_train.c @@ -7,6 +7,7 @@ #include "file_utils.h" #include "geodb.h" #include "shuffle.h" +#include "transliterate.h" #include "log/log.h" @@ -450,6 +451,14 @@ int main(int argc, char **argv) { log_info("address dictionary module loaded\n"); + // Needs to load for normalization + if (!transliteration_module_setup(NULL)) { + log_error("Could not load transliteration module\n"); + exit(EXIT_FAILURE); + } + + log_info("transliteration module loaded\n"); + if (!geodb_module_setup(NULL)) { log_error("Could not load geodb dictionaries\n"); exit(EXIT_FAILURE); From 5e07f5e8c50b220b5d802ccd7b22390842a03559 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 25 May 2016 15:47:57 -0400 Subject: [PATCH 2/8] [fix] tokenized_string_t should copy its source string --- src/tokens.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/tokens.c b/src/tokens.c index 3fd6a949..6c7ae94b 100644 --- a/src/tokens.c +++ b/src/tokens.c @@ -6,7 +6,6 @@ tokenized_string_t *tokenized_string_new(void) { self->str = NULL; self->strings = cstring_array_new(); self->tokens = token_array_new(); - return self; } @@ -21,7 +20,7 @@ tokenized_string_t *tokenized_string_new_size(size_t len, size_t num_tokens) { inline tokenized_string_t *tokenized_string_new_from_str_size(char *src, size_t len, size_t num_tokens) { tokenized_string_t *self = tokenized_string_new_size(len, num_tokens); - self->str = src; + self->str = strndup(src); return self; } @@ -38,7 +37,7 @@ void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_ tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens, bool copy_tokens) { tokenized_string_t *self = malloc(sizeof(tokenized_string_t)); - self->str = src; + self->str = stdup(src); self->strings = cstring_array_new_size(strlen(src) + tokens->n); if (copy_tokens) { self->tokens = token_array_new_copy(tokens, tokens->n); @@ -48,7 +47,7 @@ tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens, token_t token; - for (int i = 0; i < tokens->n; i++) { + for (size_t i = 0; i < tokens->n; i++) { token = tokens->a[i]; cstring_array_add_string_len(self->strings, src + token.offset, token.len); } From 6baa7087fe490b6153896883128403d159ef0935 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 25 May 2016 15:50:53 -0400 Subject: [PATCH 3/8] [fix] calls and NULL checks --- src/tokens.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/tokens.c b/src/tokens.c index 6c7ae94b..e85183f1 100644 --- a/src/tokens.c +++ b/src/tokens.c @@ -20,7 +20,11 @@ tokenized_string_t *tokenized_string_new_size(size_t len, size_t num_tokens) { inline tokenized_string_t *tokenized_string_new_from_str_size(char *src, size_t len, size_t num_tokens) { tokenized_string_t *self = tokenized_string_new_size(len, num_tokens); - self->str = strndup(src); + self->str = strndup(src, len); + if (self->str == NULL) { + tokenized_string_destroy(self); + return NULL; + } return self; } @@ -37,7 +41,11 @@ void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_ tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens, bool copy_tokens) { tokenized_string_t *self = malloc(sizeof(tokenized_string_t)); - self->str = stdup(src); + self->str = strdup(src); + if (self->str == NULL) { + tokenized_string_destroy(self); + return NULL; + } self->strings = cstring_array_new_size(strlen(src) + tokens->n); if (copy_tokens) { self->tokens = token_array_new_copy(tokens, tokens->n); From b1816e9b70057480907f72f4d8de11f7de410bd4 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 25 May 2016 17:07:20 -0400 Subject: [PATCH 4/8] [utils] Adding cstring_array_split_ignore_consecutive --- src/string_utils.c | 21 +++++++++++++++++++-- src/string_utils.h | 2 ++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/string_utils.c b/src/string_utils.c index 70b2f304..fa7b9474 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -829,17 +829,23 @@ inline int64_t cstring_array_token_length(cstring_array *self, uint32_t i) { } } -cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) { +static cstring_array *cstring_array_split_options(char *str, const char *separator, size_t separator_len, bool ignore_consecutive, size_t *count) { *count = 0; char_array *array = char_array_new_size(strlen(str)); + bool last_was_separator = false; + while (*str) { if ((separator_len == 1 && *str == separator[0]) || (memcmp(str, separator, separator_len) == 0)) { - char_array_push(array, '\0'); + if (!ignore_consecutive || !last_was_separator) { + char_array_push(array, '\0'); + } str += separator_len; + last_was_separator = true; } else { char_array_push(array, *str); str++; + last_was_separator = false; } } char_array_push(array, '\0'); @@ -850,6 +856,17 @@ cstring_array *cstring_array_split(char *str, const char *separator, size_t sepa return string_array; } + +cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) { + return cstring_array_split_options(str, separator, separator_len, true, count); +} + + +cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count) { + return cstring_array_split_options(str, separator, separator_len, false, count); +} + + cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count) { *count = 0; char *ptr = str; diff --git a/src/string_utils.h b/src/string_utils.h index 76c12a15..fac4f99e 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -180,6 +180,8 @@ char **cstring_array_to_strings(cstring_array *self); // Split on delimiter cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count); +// Split on delimiter, ignore multiple consecutive delimiters +cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count); // Split on delimiter by replacing (single character) separator with the NUL byte in the original string cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count); From ced8f9ae2729972860a4142d82d36d8ce8a1550e Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 25 May 2016 17:50:29 -0400 Subject: [PATCH 5/8] =?UTF-8?q?[parser]=20Ignore=20multiple=20spaces=20in?= =?UTF-8?q?=20parser=20input=20post-normalization.=20If=20normalizing=20th?= =?UTF-8?q?e=20string=20creates=20several=20distinct=20tokens=20(namely=20?= =?UTF-8?q?in=20Vulgar=20fractions=20e.g.=20=C2=BD=20=3D>=201/2),=20add=20?= =?UTF-8?q?all=20the=20sub-tokens=20with=20the=20same=20label=20as=20the?= =?UTF-8?q?=20parent?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/address_parser_io.c | 62 +++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/src/address_parser_io.c b/src/address_parser_io.c index 83452529..fc6ed9dd 100644 --- a/src/address_parser_io.c +++ b/src/address_parser_io.c @@ -31,7 +31,7 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set, uint32_t i = 0; char *str = NULL; - cstring_array *pairs = cstring_array_split(input, " ", 1, &count); + cstring_array *pairs = cstring_array_split_ignore_consecutive(input, " ", 1, &count); size_t num_pairs = cstring_array_num_strings(pairs); char *label = NULL; @@ -62,23 +62,57 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set, } token.offset = pairs->indices->a[i]; - token.len = last_separator_index; + size_t expected_len = last_separator_index; - scanner_t scanner = scanner_from_string(input + token.offset, token.len); + scanner_t scanner = scanner_from_string(input + token.offset, expected_len); token.type = scan_token(&scanner); - if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) { - uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL); - continue; - } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) { - // shouldn't happen but just in case - continue; - } else { - uint32_array_push(separators, ADDRESS_SEPARATOR_NONE); + token.len = scanner.cursor - scanner.start; + + if (token.len == expected_len) { + if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) { + uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL); + continue; + } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) { + // shouldn't happen but just in case + continue; + } else { + uint32_array_push(separators, ADDRESS_SEPARATOR_NONE); + } + + cstring_array_add_string(labels, label); + + token_array_push(tokens, token); + else { + /* If normalizing the string turned one token into several e.g. ½ => 1/2 + add all the tokens where offset = (token.offset + sub_token.offset) + with the same label as the parent. + */ + token_array *sub_tokens = token_array_new(); + if (sub_tokens == NULL) { + log_error("Error allocating sub-token array\n"); + return false; + } + tokenize_add_tokens(sub_tokens, input + token.offset, expected_len, false); + for (size_t j = 0; j < sub_tokens->n; j++) { + token_t sub_token = sub_tokens->a[j]; + // Add the offset of the parent "token" + sub_token.offset = token.offset + sub_token.offset; + + if (ADDRESS_PARSER_IS_SEPARATOR(sub_token.type)) { + uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL); + continue; + } else if (ADDRESS_PARSER_IS_IGNORABLE(sub_token.type)) { + continue; + } else { + uint32_array_push(separators, ADDRESS_SEPARATOR_NONE); + } + + cstring_array_add_string(labels, label); + token_array_push(tokens, sub_token); + } + } - cstring_array_add_string(labels, label); - - token_array_push(tokens, token); }) cstring_array_destroy(pairs); From a42d0e917a9ad3e808b70f96fa44dce660d34320 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 25 May 2016 17:52:00 -0400 Subject: [PATCH 6/8] [fix] brace --- src/address_parser_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/address_parser_io.c b/src/address_parser_io.c index fc6ed9dd..3bbc7284 100644 --- a/src/address_parser_io.c +++ b/src/address_parser_io.c @@ -82,7 +82,7 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set, cstring_array_add_string(labels, label); token_array_push(tokens, token); - else { + } else { /* If normalizing the string turned one token into several e.g. ½ => 1/2 add all the tokens where offset = (token.offset + sub_token.offset) with the same label as the parent. From 3939dd0ca67cdc3b9006dfcc114a7296688648fe Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 25 May 2016 17:58:30 -0400 Subject: [PATCH 7/8] [fix] cstring_array_split calls --- src/string_utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/string_utils.c b/src/string_utils.c index fa7b9474..52f94893 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -858,12 +858,12 @@ static cstring_array *cstring_array_split_options(char *str, const char *separat cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) { - return cstring_array_split_options(str, separator, separator_len, true, count); + return cstring_array_split_options(str, separator, separator_len, false, count); } cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count) { - return cstring_array_split_options(str, separator, separator_len, false, count); + return cstring_array_split_options(str, separator, separator_len, true, count); } From 8f1e69960fd1985b2ff570cb2427989e2f3c4ee5 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 25 May 2016 19:54:01 -0400 Subject: [PATCH 8/8] [fix] loading transliteration module in address_parser_test.c as well --- src/address_parser_test.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/address_parser_test.c b/src/address_parser_test.c index 57776381..8fb0fe01 100644 --- a/src/address_parser_test.c +++ b/src/address_parser_test.c @@ -142,6 +142,14 @@ int main(int argc, char **argv) { log_info("address dictionary module loaded\n"); + // Needs to load for normalization + if (!transliteration_module_setup(NULL)) { + log_error("Could not load transliteration module\n"); + exit(EXIT_FAILURE); + } + + log_info("transliteration module loaded\n"); + if (!geodb_module_setup(NULL)) { log_error("Could not load geodb dictionaries\n"); exit(EXIT_FAILURE);