diff --git a/src/address_parser.c b/src/address_parser.c index 2306d85c..e4a98f3c 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -17,8 +17,6 @@ static address_parser_t *parser = NULL; -//#define PRINT_ADDRESS_PARSER_FEATURES - typedef enum { ADDRESS_PARSER_NULL_PHRASE, ADDRESS_PARSER_DICTIONARY_PHRASE, @@ -29,7 +27,8 @@ typedef enum { static parser_options_t PARSER_DEFAULT_OPTIONS = { - .rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD + .rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD, + .print_features = false }; address_parser_t *address_parser_new_options(parser_options_t options) { @@ -873,7 +872,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize log_warn("expansion_value is NULL. word=%s, sentence=%s\n", word, tokenized->str); } - if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME)) { + if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME | ADDRESS_UNIT)) { phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase); add_word_feature = false; @@ -1146,22 +1145,18 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize //feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word); } - #ifndef PRINT_ADDRESS_PARSER_FEATURES - if (0) { - #endif + if (parser->options.print_features) { + uint32_t fidx; + char *feature; - uint32_t fidx; - char *feature; - - printf("{"); - cstring_array_foreach(features, fidx, feature, { - printf(" %s, ", feature); - }) - printf("}\n"); - - #ifndef PRINT_ADDRESS_PARSER_FEATURES + printf("{ "); + size_t num_features = cstring_array_num_strings(features); + cstring_array_foreach(context->features, fidx, feature, { + printf("%s", feature); + if (fidx < num_features - 1) printf(", "); + }) + printf(" }\n"); } - #endif return true; diff --git a/src/address_parser.h b/src/address_parser.h index 1bbc67cf..214e43b4 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -166,6 +166,7 @@ typedef struct address_parser_context { typedef struct parser_options { uint64_t rare_word_threshold; + bool print_features; } parser_options_t; // Can add other gazetteers as well diff --git a/src/address_parser_cli.c b/src/address_parser_cli.c index 65cb09d1..a6663549 100644 --- a/src/address_parser_cli.c +++ b/src/address_parser_cli.c @@ -64,6 +64,8 @@ int main(int argc, char **argv) { char *input = NULL; + address_parser_t *parser = get_address_parser(); + while((input = linenoise("> ")) != NULL) { if (input[0] != '\0') { @@ -101,6 +103,22 @@ int main(int argc, char **argv) { printf("Must specify country code\n"); } + cstring_array_destroy(command); + goto next_input; + } else if (string_starts_with(input, ".print_features")) { + size_t num_tokens = 0; + cstring_array *command = cstring_array_split(input, " ", 1, &num_tokens); + if (cstring_array_num_strings(command) > 1) { + char *flag = cstring_array_get_string(command, 1); + if (string_compare_case_insensitive(flag, "off") == 0) { + parser->options.print_features = false; + } else if (string_compare_case_insensitive(flag, "on") == 0) { + parser->options.print_features = true; + } + } else { + parser->options.print_features = true; + } + cstring_array_destroy(command); goto next_input; } else if (strlen(input) == 0) { @@ -116,16 +134,9 @@ int main(int argc, char **argv) { printf("{\n"); for (int i = 0; i < parsed->num_components; i++) { char *component = parsed->components[i]; - utf8proc_uint8_t *normalized = NULL; - utf8proc_map((utf8proc_uint8_t *)component, 0, &normalized, UTF8PROC_NULLTERM | UTF8PROC_COMPOSE); - if (normalized == NULL) { - log_error("Error parsing address\n"); - exit(EXIT_FAILURE); - } - char *json_string = json_encode_string((char *)normalized); + char *json_string = json_encode_string(component); printf(" \"%s\": %s%s\n", parsed->labels[i], json_string, i < parsed->num_components - 1 ? "," : ""); - free(normalized); free(json_string); } printf("}\n");