Merge pull request #274 from openvenues/fix_oh_expansion
Context-sensitive expansion of words like "oh" inside vs. outside numeric expressions
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
name: "oh"
|
||||
value: 0
|
||||
type: "cardinal"
|
||||
left: "concat_only_if_number"
|
||||
-
|
||||
name: "one"
|
||||
value: 1
|
||||
|
||||
@@ -47,11 +47,13 @@ category_map = {
|
||||
|
||||
LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
|
||||
LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
|
||||
LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER'
|
||||
LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
|
||||
|
||||
left_context_map = {
|
||||
'add': LEFT_CONTEXT_ADD,
|
||||
'multiply': LEFT_CONTEXT_MULTIPLY,
|
||||
'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER,
|
||||
None: LEFT_CONTEXT_NONE,
|
||||
}
|
||||
|
||||
|
||||
26
src/numex.c
26
src/numex.c
@@ -709,6 +709,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
bool possible_complete_token = false;
|
||||
bool complete_token = false;
|
||||
|
||||
bool prev_rule_was_number = false;
|
||||
|
||||
log_debug("Converting numex for str=%s, lang=%s\n", str, lang);
|
||||
|
||||
while (idx < len) {
|
||||
@@ -851,8 +853,27 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
number_finished = true;
|
||||
advance_index = false;
|
||||
state = start_state;
|
||||
prev_rule_was_number = true;
|
||||
rule = prev_rule = NUMEX_NULL_RULE;
|
||||
prev_result_len = 0;
|
||||
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && !prev_rule_was_number) {
|
||||
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, no context\n");
|
||||
prev_rule = rule;
|
||||
last_was_separator = false;
|
||||
rule = NUMEX_NULL_RULE;
|
||||
prev_result_len = result.len;
|
||||
result = NULL_NUMEX_RESULT;
|
||||
stopword_phrase = NULL_PHRASE;
|
||||
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
|
||||
last_was_stopword = false;
|
||||
continue;
|
||||
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && prev_rule_was_number) {
|
||||
last_was_separator = false;
|
||||
number_finished = true;
|
||||
state = start_state;
|
||||
last_was_stopword = false;
|
||||
prev_rule_was_number = true;
|
||||
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, value = %" PRId64 "\n", result.value);
|
||||
} else if (rule.rule_type != NUMEX_STOPWORD) {
|
||||
result.value = rule.value;
|
||||
log_debug("Got number, result.value=%" PRId64 "\n", result.value);
|
||||
@@ -864,6 +885,9 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
prev_rule_was_number = prev_rule_was_number || prev_rule.rule_type != NUMEX_NULL;
|
||||
|
||||
if (rule.rule_type != NUMEX_STOPWORD) {
|
||||
prev_rule = rule;
|
||||
prev_result_len = result.len;
|
||||
@@ -895,7 +919,6 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
|
||||
if (prev_rule.rule_type != NUMEX_NULL) {
|
||||
number_finished = true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (!set_rule) {
|
||||
@@ -1137,7 +1160,6 @@ char *replace_numeric_expressions(char *str, char *lang) {
|
||||
char_array_append(replacement, ordinal_suffix);
|
||||
}
|
||||
}
|
||||
|
||||
start = result.start + result.len;
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,8 @@ typedef enum {
|
||||
typedef enum {
|
||||
NUMEX_LEFT_CONTEXT_NONE,
|
||||
NUMEX_LEFT_CONTEXT_ADD,
|
||||
NUMEX_LEFT_CONTEXT_MULTIPLY
|
||||
NUMEX_LEFT_CONTEXT_MULTIPLY,
|
||||
NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER
|
||||
} numex_left_context;
|
||||
|
||||
typedef enum {
|
||||
|
||||
@@ -82,6 +82,8 @@ TEST test_expansions(void) {
|
||||
CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de"));
|
||||
CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));
|
||||
|
||||
Reference in New Issue
Block a user