From 994b2f18e4a9793c8ee62d002c267ab9dd6bfc36 Mon Sep 17 00:00:00 2001
From: Al
Date: Wed, 25 May 2016 17:50:29 -0400
Subject: [PATCH] [parser] Ignore multiple spaces in parser input
 post-normalization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If normalizing the string creates several distinct tokens (namely vulgar
fractions, e.g. ½ => 1/2), add all the sub-tokens with the same label as
the parent.
---
 src/address_parser_io.c | 62 +++++++++++++++++++++++++++++++----------
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git a/src/address_parser_io.c b/src/address_parser_io.c
index 83452529..fc6ed9dd 100644
--- a/src/address_parser_io.c
+++ b/src/address_parser_io.c
@@ -31,7 +31,7 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
     uint32_t i = 0;
     char *str = NULL;
 
-    cstring_array *pairs = cstring_array_split(input, " ", 1, &count);
+    cstring_array *pairs = cstring_array_split_ignore_consecutive(input, " ", 1, &count);
     size_t num_pairs = cstring_array_num_strings(pairs);
 
     char *label = NULL;
@@ -62,23 +62,57 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
         }
 
         token.offset = pairs->indices->a[i];
-        token.len = last_separator_index;
+        size_t expected_len = last_separator_index;
 
-        scanner_t scanner = scanner_from_string(input + token.offset, token.len);
+        scanner_t scanner = scanner_from_string(input + token.offset, expected_len);
         token.type = scan_token(&scanner);
-        if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
-            uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
-            continue;
-        } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
-            // shouldn't happen but just in case
-            continue;
-        } else {
-            uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
+        token.len = scanner.cursor - scanner.start;
+
+        if (token.len == expected_len) {
+            if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
+                uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
+                continue;
+            } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
+                // shouldn't happen but just in case
+                continue;
+            } else {
+                uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
+            }
+
+            cstring_array_add_string(labels, label);
+
+            token_array_push(tokens, token);
+        } else {
+            /* If normalizing the string turned one token into several e.g. ½ => 1/2
+               add all the tokens where offset = (token.offset + sub_token.offset)
+               with the same label as the parent.
+            */
+            token_array *sub_tokens = token_array_new();
+            if (sub_tokens == NULL) {
+                log_error("Error allocating sub-token array\n");
+                return false;
+            }
+            tokenize_add_tokens(sub_tokens, input + token.offset, expected_len, false);
+            for (size_t j = 0; j < sub_tokens->n; j++) {
+                token_t sub_token = sub_tokens->a[j];
+                // Add the offset of the parent "token"
+                sub_token.offset = token.offset + sub_token.offset;
+
+                if (ADDRESS_PARSER_IS_SEPARATOR(sub_token.type)) {
+                    uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
+                    continue;
+                } else if (ADDRESS_PARSER_IS_IGNORABLE(sub_token.type)) {
+                    continue;
+                } else {
+                    uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
+                }
+
+                cstring_array_add_string(labels, label);
+                token_array_push(tokens, sub_token);
+            }
+        }
-        }
-
-        cstring_array_add_string(labels, label);
-
-        token_array_push(tokens, token);
     })
 
     cstring_array_destroy(pairs);
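
Editor's note (commentary, not part of the patch): the else branch above re-tokenizes
the normalized span and shifts each sub-token's offset by the parent token's offset,
so the sub-tokens line up with the full input string while inheriting the parent's
label. Below is a minimal standalone C sketch of that offset remapping only;
simple_token_t, split_span, the "house_number" label and the example offsets are
simplified stand-ins invented for illustration, not libpostal's token_t,
scan_token or tokenize_add_tokens.

#include <ctype.h>
#include <stdio.h>

/* Simplified stand-in for a token: just an offset and a length. */
typedef struct {
    size_t offset;
    size_t len;
} simple_token_t;

/* Tiny tokenizer for the sketch: alphanumeric runs form one token,
   any other non-space byte (such as '/') is a single-character token. */
static size_t split_span(const char *s, size_t len, simple_token_t *out, size_t max_out) {
    size_t n = 0, i = 0;
    while (i < len && n < max_out) {
        if (isspace((unsigned char)s[i])) { i++; continue; }
        size_t start = i;
        if (isalnum((unsigned char)s[i])) {
            while (i < len && isalnum((unsigned char)s[i])) i++;
        } else {
            i++;
        }
        out[n].offset = start;
        out[n].len = i - start;
        n++;
    }
    return n;
}

int main(void) {
    /* After normalization, "½" has become the three-byte span "1/2" starting at
       offset 4 of the full input. Re-tokenizing that span yields several
       sub-tokens; adding the parent's offset places them back in the full
       string, and each one keeps the parent's label. */
    const char *input = "123 1/2 main st";
    size_t parent_offset = 4, parent_len = 3;
    const char *parent_label = "house_number"; /* illustrative label only */

    simple_token_t subs[8];
    size_t n = split_span(input + parent_offset, parent_len, subs, 8);
    for (size_t j = 0; j < n; j++) {
        /* Same idea as sub_token.offset = token.offset + sub_token.offset */
        subs[j].offset += parent_offset;
        printf("%-12s '%.*s' offset=%zu len=%zu\n",
               parent_label, (int)subs[j].len, input + subs[j].offset,
               subs[j].offset, subs[j].len);
    }
    return 0;
}

Running the sketch prints three sub-tokens ("1", "/", "2") at offsets 4, 5 and 6,
all labeled house_number, which mirrors how the patch pushes one label and one
separator entry per sub-token.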