diff --git a/resources/numex/en.yaml b/resources/numex/en.yaml index cd11f79e..5e35fe06 100644 --- a/resources/numex/en.yaml +++ b/resources/numex/en.yaml @@ -9,6 +9,7 @@ name: "oh" value: 0 type: "cardinal" + left: "concat_only_if_number" - name: "one" value: 1 diff --git a/scripts/geodata/numbers/numex.py b/scripts/geodata/numbers/numex.py index b1d09b62..558a4e6b 100644 --- a/scripts/geodata/numbers/numex.py +++ b/scripts/geodata/numbers/numex.py @@ -47,11 +47,13 @@ category_map = { LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY' LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD' +LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER' LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE' left_context_map = { 'add': LEFT_CONTEXT_ADD, 'multiply': LEFT_CONTEXT_MULTIPLY, + 'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, None: LEFT_CONTEXT_NONE, } diff --git a/src/numex.c b/src/numex.c index 107768fa..b8a0f0e7 100644 --- a/src/numex.c +++ b/src/numex.c @@ -709,6 +709,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { bool possible_complete_token = false; bool complete_token = false; + bool prev_rule_was_number = false; + log_debug("Converting numex for str=%s, lang=%s\n", str, lang); while (idx < len) { @@ -851,8 +853,27 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { number_finished = true; advance_index = false; state = start_state; + prev_rule_was_number = true; rule = prev_rule = NUMEX_NULL_RULE; prev_result_len = 0; + } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && !prev_rule_was_number) { + log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, no context\n"); + prev_rule = rule; + last_was_separator = false; + rule = NUMEX_NULL_RULE; + prev_result_len = result.len; + result = NULL_NUMEX_RESULT; + stopword_phrase = NULL_PHRASE; + state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN; + last_was_stopword = false; + continue; + } else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && prev_rule_was_number) { + last_was_separator = false; + number_finished = true; + state = start_state; + last_was_stopword = false; + prev_rule_was_number = true; + log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, value = %" PRId64 "\n", result.value); } else if (rule.rule_type != NUMEX_STOPWORD) { result.value = rule.value; log_debug("Got number, result.value=%" PRId64 "\n", result.value); @@ -864,6 +885,9 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { continue; } + + prev_rule_was_number = prev_rule_was_number || prev_rule.rule_type != NUMEX_NULL; + if (rule.rule_type != NUMEX_STOPWORD) { prev_rule = rule; prev_result_len = result.len; @@ -895,7 +919,6 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { if (prev_rule.rule_type != NUMEX_NULL) { number_finished = true; } - } if (!set_rule) { @@ -1137,7 +1160,6 @@ char *replace_numeric_expressions(char *str, char *lang) { char_array_append(replacement, ordinal_suffix); } } - start = result.start + result.len; } diff --git a/src/numex.h b/src/numex.h index d80f96e1..5b289f13 100644 --- a/src/numex.h +++ b/src/numex.h @@ -50,7 +50,8 @@ typedef enum { typedef enum { NUMEX_LEFT_CONTEXT_NONE, NUMEX_LEFT_CONTEXT_ADD, - NUMEX_LEFT_CONTEXT_MULTIPLY + NUMEX_LEFT_CONTEXT_MULTIPLY, + NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER } numex_left_context; typedef enum { diff --git a/test/test_expand.c b/test/test_expand.c index f8781a01..d97838ae 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -82,6 +82,8 @@ TEST test_expansions(void) { CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en")); + CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en")); CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de")); CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));