Merge pull request #274 from openvenues/fix_oh_expansion

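Context-sensitive expansion of words like "oh": inside a numeric expression it is read as the digit 0 (e.g. "Nineteen oh one" → 1901), while outside one it is left as an ordinary token (e.g. "OH" as the abbreviation for Ohio).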
This commit is contained in:
Al Barrentine
2017-11-24 17:13:24 -05:00
committed by GitHub
5 changed files with 31 additions and 3 deletions

View File

@@ -9,6 +9,7 @@
name: "oh"
value: 0
type: "cardinal"
left: "concat_only_if_number"
-
name: "one"
value: 1

View File

@@ -47,11 +47,13 @@ category_map = {
LEFT_CONTEXT_MULTIPLY = 'NUMEX_LEFT_CONTEXT_MULTIPLY'
LEFT_CONTEXT_ADD = 'NUMEX_LEFT_CONTEXT_ADD'
LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER = 'NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER'
LEFT_CONTEXT_NONE = 'NUMEX_LEFT_CONTEXT_NONE'
left_context_map = {
'add': LEFT_CONTEXT_ADD,
'multiply': LEFT_CONTEXT_MULTIPLY,
'concat_only_if_number': LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER,
None: LEFT_CONTEXT_NONE,
}

View File

@@ -709,6 +709,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
bool possible_complete_token = false;
bool complete_token = false;
bool prev_rule_was_number = false;
log_debug("Converting numex for str=%s, lang=%s\n", str, lang);
while (idx < len) {
@@ -851,8 +853,27 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
number_finished = true;
advance_index = false;
state = start_state;
prev_rule_was_number = true;
rule = prev_rule = NUMEX_NULL_RULE;
prev_result_len = 0;
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && !prev_rule_was_number) {
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, no context\n");
prev_rule = rule;
last_was_separator = false;
rule = NUMEX_NULL_RULE;
prev_result_len = result.len;
result = NULL_NUMEX_RESULT;
stopword_phrase = NULL_PHRASE;
state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN;
last_was_stopword = false;
continue;
} else if (rule.left_context_type == NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER && prev_rule_was_number) {
last_was_separator = false;
number_finished = true;
state = start_state;
last_was_stopword = false;
prev_rule_was_number = true;
log_debug("LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER, value = %" PRId64 "\n", result.value);
} else if (rule.rule_type != NUMEX_STOPWORD) {
result.value = rule.value;
log_debug("Got number, result.value=%" PRId64 "\n", result.value);
@@ -864,6 +885,9 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
continue;
}
prev_rule_was_number = prev_rule_was_number || prev_rule.rule_type != NUMEX_NULL;
if (rule.rule_type != NUMEX_STOPWORD) {
prev_rule = rule;
prev_result_len = result.len;
@@ -895,7 +919,6 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
if (prev_rule.rule_type != NUMEX_NULL) {
number_finished = true;
}
}
if (!set_rule) {
@@ -1137,7 +1160,6 @@ char *replace_numeric_expressions(char *str, char *lang) {
char_array_append(replacement, ordinal_suffix);
}
}
start = result.start + result.len;
}

View File

@@ -50,7 +50,8 @@ typedef enum {
typedef enum {
NUMEX_LEFT_CONTEXT_NONE,
NUMEX_LEFT_CONTEXT_ADD,
NUMEX_LEFT_CONTEXT_MULTIPLY
NUMEX_LEFT_CONTEXT_MULTIPLY,
NUMEX_LEFT_CONTEXT_CONCAT_ONLY_IF_NUMBER
} numex_left_context;
typedef enum {

View File

@@ -82,6 +82,8 @@ TEST test_expansions(void) {
CHECK_CALL(test_expansion_contains_with_languages("123 Main St. #2f", "123 main street number 2f", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("120 E 96th St", "120 east 96 street", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("120 E Ninety-sixth St", "120 east 96 street", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("4998 Vanderbilt Dr, Columbus, OH 43213", "4998 vanderbilt drive columbus ohio 43213", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Nineteen oh one W El Segundo Blvd", "1901 west el segundo boulevard", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("S St. NW", "s street northwest", options, 1, "en"));
CHECK_CALL(test_expansion_contains_with_languages("Marktstrasse", "markt strasse", options, 1, "de"));
CHECK_CALL(test_expansion_contains_with_languages("Hoofdstraat", "hoofdstraat", options, 1, "nl"));