From c276cf15291881fdaa1a3bb6f939ea7bdf237ccd Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 24 Nov 2017 15:36:50 -0500 Subject: [PATCH 1/4] [numex] adding a new type of left context for numeric expressions called concat_only_if_number (for something like "oh" which can be "Columbus, OH" or something like "Twenty-One Oh One") --- resources/numex/en.yaml | 1 + scripts/geodata/numbers/numex.py | 2 ++ src/numex.h | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/resources/numex/en.yaml b/resources/numex/en.yaml index cd11f79e..5e35fe06 100644 --- a/resources/numex/en.yaml +++ b/resources/numex/en.yaml @@ -9,6 +9,7 @@ name: "oh" value: 0 type: "cardinal" + left: "concat_only_if_number" - name: "one" value: 1 diff --git a/scripts/geodata/numbers/numex.py b/scripts/geodata/numbers/numex.py index b1d09b62..558a4e6b 100644 --- a/scripts/geodata/numbers/numex.py +++ b/scripts/geodata/numbers/numex.py @@ -47,11 +47,13 @@ category_map = { LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY' LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD' +LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER' LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE' left_context_map = { 'add': LEFT_CONTEXT_ADD, 'multiply': LEFT_CONTEXT_MULTIPLY, + 'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, None: LEFT_CONTEXT_NONE, } diff --git a/src/numex.h b/src/numex.h index d80f96e1..5b289f13 100644 --- a/src/numex.h +++ b/src/numex.h @@ -50,7 +50,8 @@ typedef enum { typedef enum { NUMEX_LEFT_CONTEXT_NONE, NUMEX_LEFT_CONTEXT_ADD, - NUMEX_LEFT_CONTEXT_MULTIPLY + NUMEX_LEFT_CONTEXT_MULTIPLY, + NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER } numex_left_context; typedef enum { From ef098fd2e79c1f915c0094dc2b9b7f379abc85bd Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 24 Nov 2017 15:42:50 -0500 Subject: [PATCH 2/4] [numex] implementing the numex concat_only_if_number left context, which helps in the case of e.g. 
Columbus, OH in #271 --- src/numex.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/numex.c b/src/numex.c index 107768fa..b8a0f0e7 100644 --- a/src/numex.c +++ b/src/numex.c @@ -709,6 +709,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { bool possible_complete_token = false; bool complete_token = false; + bool prev_rule_was_number = false; + log_debug("Converting numex for str=%s, lang=%s\n", str, lang); while (idx < len) { @@ -851,8 +853,27 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { number_finished = true; advance_index = false; state = start_state; + prev_rule_was_number = true; rule = prev_rule = NUMEX_NULL_RULE; prev_result_len = 0; + } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && !prev_rule_was_number) { + log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, no context\n"); + prev_rule = rule; + last_was_separator = false; + rule = NUMEX_NULL_RULE; + prev_result_len = result.len; + result = NULL_NUMEX_RESULT; + stopword_phrase = NULL_PHRASE; + state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN; + last_was_stopword = false; + continue; + } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && prev_rule_was_number) { + last_was_separator = false; + number_finished = true; + state = start_state; + last_was_stopword = false; + prev_rule_was_number = true; + log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, value = %" PRId64 "\n", result.value); } else if (rule.rule_type != NUMEX_STOPWORD) { result.value = rule.value; log_debug("Got number, result.value=%" PRId64 "\n", result.value); @@ -864,6 +885,9 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { continue; } + + prev_rule_was_number = prev_rule_was_number || prev_rule.rule_type != NUMEX_NULL; + if (rule.rule_type != NUMEX_STOPWORD) { prev_rule = rule; prev_result_len = result.len; @@ -895,7 +919,6 @@ numex_result_array 
*convert_numeric_expressions(char *str, char *lang) { if (prev_rule.rule_type != NUMEX_NULL) { number_finished = true; } - } if (!set_rule) { @@ -1137,7 +1160,6 @@ char *replace_numeric_expressions(char *str, char *lang) { char_array_append(replacement, ordinal_suffix); } } - start = result.start + result.len; } From d7f22544b4610fb1e534ec47501da85b2c5524ba Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 24 Nov 2017 15:44:37 -0500 Subject: [PATCH 3/4] [test] adding an expansion test for the Columbus, OH case --- test/test_expand.c | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_expand.c b/test/test_expand.c index f8781a01..6cf70566 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -82,6 +82,7 @@ TEST test_expansions(void) { CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"); CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de")); CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl")); From ebe7fc9be9af246392e70b695ba8447a8a360fd3 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 24 Nov 2017 16:11:07 -0500 Subject: [PATCH 4/4] [test] missing paren in Columbus, OH test. 
Adding test for "oh" as part of a number in Nineteen oh one W El Segundo Blvd --- test/test_expand.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_expand.c b/test/test_expand.c index 6cf70566..d97838ae 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -82,7 +82,8 @@ TEST test_expansions(void) { CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en")); - CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"); + CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de")); CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));