[parser/cli] adding .print_features option in address_parser client for debugging

This commit is contained in:
Al
2016-12-31 00:20:35 -05:00
parent bdb51a244e
commit db16e656ca
3 changed files with 33 additions and 26 deletions

View File

@@ -17,8 +17,6 @@
static address_parser_t *parser = NULL;
//#define PRINT_ADDRESS_PARSER_FEATURES
typedef enum {
ADDRESS_PARSER_NULL_PHRASE,
ADDRESS_PARSER_DICTIONARY_PHRASE,
@@ -29,7 +27,8 @@ typedef enum {
static parser_options_t PARSER_DEFAULT_OPTIONS = {
.rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD
.rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD,
.print_features = false
};
address_parser_t *address_parser_new_options(parser_options_t options) {
@@ -873,7 +872,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
log_warn("expansion_value is NULL. word=%s, sentence=%s\n", word, tokenized->str);
}
if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME)) {
if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME | ADDRESS_UNIT)) {
phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase);
add_word_feature = false;
@@ -1146,22 +1145,18 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
//feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word);
}
#ifndef PRINT_ADDRESS_PARSER_FEATURES
if (0) {
#endif
if (parser->options.print_features) {
uint32_t fidx;
char *feature;
uint32_t fidx;
char *feature;
printf("{");
cstring_array_foreach(features, fidx, feature, {
printf(" %s, ", feature);
})
printf("}\n");
#ifndef PRINT_ADDRESS_PARSER_FEATURES
printf("{ ");
size_t num_features = cstring_array_num_strings(features);
cstring_array_foreach(context->features, fidx, feature, {
printf("%s", feature);
if (fidx < num_features - 1) printf(", ");
})
printf(" }\n");
}
#endif
return true;

View File

@@ -166,6 +166,7 @@ typedef struct address_parser_context {
// Options stored on an address parser (read at runtime via parser->options).
typedef struct parser_options {
// NOTE(review): presumably a frequency cutoff below which a word is treated as rare;
// defaults to DEFAULT_RARE_WORD_THRESHOLD — confirm exact semantics against the training code.
uint64_t rare_word_threshold;
// Debugging aid: when true, address_parser_features prints the feature strings it
// extracted to stdout. Can be toggled from the address_parser client with
// ".print_features [on|off]".
bool print_features;
} parser_options_t;
// Can add other gazetteers as well

View File

@@ -64,6 +64,8 @@ int main(int argc, char **argv) {
char *input = NULL;
address_parser_t *parser = get_address_parser();
while((input = linenoise("> ")) != NULL) {
if (input[0] != '\0') {
@@ -101,6 +103,22 @@ int main(int argc, char **argv) {
printf("Must specify country code\n");
}
cstring_array_destroy(command);
goto next_input;
} else if (string_starts_with(input, ".print_features")) {
size_t num_tokens = 0;
cstring_array *command = cstring_array_split(input, " ", 1, &num_tokens);
if (cstring_array_num_strings(command) > 1) {
char *flag = cstring_array_get_string(command, 1);
if (string_compare_case_insensitive(flag, "off") == 0) {
parser->options.print_features = false;
} else if (string_compare_case_insensitive(flag, "on") == 0) {
parser->options.print_features = true;
}
} else {
parser->options.print_features = true;
}
cstring_array_destroy(command);
goto next_input;
} else if (strlen(input) == 0) {
@@ -116,16 +134,9 @@ int main(int argc, char **argv) {
printf("{\n");
for (int i = 0; i < parsed->num_components; i++) {
char *component = parsed->components[i];
utf8proc_uint8_t *normalized = NULL;
utf8proc_map((utf8proc_uint8_t *)component, 0, &normalized, UTF8PROC_NULLTERM | UTF8PROC_COMPOSE);
if (normalized == NULL) {
log_error("Error parsing address\n");
exit(EXIT_FAILURE);
}
char *json_string = json_encode_string((char *)normalized);
char *json_string = json_encode_string(component);
printf(" \"%s\": %s%s\n", parsed->labels[i], json_string, i < parsed->num_components - 1 ? "," : "");
free(normalized);
free(json_string);
}
printf("}\n");