[transliteration] Fixing edge case in transliteration where a naked character fails context matching but the set-wrapped version matches
This commit is contained in:
@@ -62,6 +62,7 @@ typedef struct {
|
|||||||
transliteration_state_type_t state;
|
transliteration_state_type_t state;
|
||||||
ssize_t phrase_start;
|
ssize_t phrase_start;
|
||||||
size_t phrase_len;
|
size_t phrase_len;
|
||||||
|
size_t char_len;
|
||||||
uint8_t advance_index:1;
|
uint8_t advance_index:1;
|
||||||
uint8_t advance_state:1;
|
uint8_t advance_state:1;
|
||||||
uint8_t in_set:1;
|
uint8_t in_set:1;
|
||||||
@@ -71,7 +72,7 @@ typedef struct {
|
|||||||
} transliteration_state_t;
|
} transliteration_state_t;
|
||||||
|
|
||||||
|
|
||||||
#define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){NULL_PREFIX_RESULT, TRANS_STATE_BEGIN, 0, 0, 1, 1, 0, 0, 0, 0}
|
#define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){NULL_PREFIX_RESULT, TRANS_STATE_BEGIN, 0, 0, 0, 1, 1, 0, 0, 0, 0}
|
||||||
|
|
||||||
|
|
||||||
static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result, char *str, size_t start_index) {
|
static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result, char *str, size_t start_index) {
|
||||||
@@ -111,13 +112,13 @@ typedef struct char_set_result {
|
|||||||
|
|
||||||
#define NULL_CHAR_SET_RESULT (char_set_result_t){NULL_PREFIX_RESULT, NO_CHAR_RESULT};
|
#define NULL_CHAR_SET_RESULT (char_set_result_t){NULL_PREFIX_RESULT, NO_CHAR_RESULT};
|
||||||
|
|
||||||
|
static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len, trie_prefix_result_t last_result, bool in_set, bool check_set_only) {
|
||||||
static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len, trie_prefix_result_t last_result, bool in_set) {
|
|
||||||
trie_prefix_result_t result = trie_get_prefix_from_index(trie, str, len, last_result.node_id, last_result.tail_pos);
|
trie_prefix_result_t result = trie_get_prefix_from_index(trie, str, len, last_result.node_id, last_result.tail_pos);
|
||||||
|
|
||||||
bool has_empty_transition = false;
|
bool has_empty_transition = false;
|
||||||
|
|
||||||
if (result.node_id != NULL_NODE_ID) {
|
|
||||||
|
if (!check_set_only && result.node_id != NULL_NODE_ID) {
|
||||||
last_result = result;
|
last_result = result;
|
||||||
result = trie_get_prefix_from_index(trie, REPEAT_CHAR, REPEAT_CHAR_LEN, last_result.node_id, last_result.tail_pos);
|
result = trie_get_prefix_from_index(trie, REPEAT_CHAR, REPEAT_CHAR_LEN, last_result.node_id, last_result.tail_pos);
|
||||||
if (result.node_id == NULL_NODE_ID) {
|
if (result.node_id == NULL_NODE_ID) {
|
||||||
@@ -188,26 +189,18 @@ static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len,
|
|||||||
return (char_set_result_t){result, CHAR_SET_REPEAT};
|
return (char_set_result_t){result, CHAR_SET_REPEAT};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return NULL_CHAR_SET_RESULT;
|
return NULL_CHAR_SET_RESULT;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static transliteration_state_t state_from_char_result(char_set_result_t char_result, size_t index, size_t len, transliteration_state_t prev_state) {
|
||||||
static transliteration_state_t state_transition(trie_t *trie, char *str, size_t index, size_t len, transliteration_state_t prev_state) {
|
|
||||||
transliteration_state_t state = TRANSLITERATION_DEFAULT_STATE;
|
transliteration_state_t state = TRANSLITERATION_DEFAULT_STATE;
|
||||||
|
|
||||||
log_debug("str = %s, index = %zu, char_len=%zu\n", str, index, len);
|
|
||||||
|
|
||||||
log_debug("prev_state.result.node_id=%d, prev_state.in_set=%d\n", prev_state.result.node_id, prev_state.in_set);
|
|
||||||
|
|
||||||
char_set_result_t char_result = next_prefix_or_set(trie, str + index, len, prev_state.result, prev_state.in_set);
|
|
||||||
|
|
||||||
log_debug("char_result.type = %d\n", char_result.type);
|
|
||||||
|
|
||||||
trie_prefix_result_t result = char_result.result;
|
trie_prefix_result_t result = char_result.result;
|
||||||
trie_prefix_result_t prev_result = prev_state.result;
|
trie_prefix_result_t prev_result = prev_state.result;
|
||||||
|
|
||||||
state.result = result;
|
state.result = result;
|
||||||
|
state.char_len = len;
|
||||||
state.in_set = (char_result.type == OPEN_CHAR_SET || (prev_state.in_set && char_result.type == SINGLE_CHAR_ONLY));
|
state.in_set = (char_result.type == OPEN_CHAR_SET || (prev_state.in_set && char_result.type == SINGLE_CHAR_ONLY));
|
||||||
state.repeat = (char_result.type == SINGLE_CHAR_REPEAT || char_result.type == CHAR_SET_REPEAT);
|
state.repeat = (char_result.type == SINGLE_CHAR_REPEAT || char_result.type == CHAR_SET_REPEAT);
|
||||||
state.empty_transition = (char_result.type == SINGLE_EMPTY_TRANSITION || char_result.type == CHAR_SET_EMPTY_TRANSITION);
|
state.empty_transition = (char_result.type == SINGLE_EMPTY_TRANSITION || char_result.type == CHAR_SET_EMPTY_TRANSITION);
|
||||||
@@ -221,6 +214,20 @@ static transliteration_state_t state_transition(trie_t *trie, char *str, size_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
return state;
|
return state;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static transliteration_state_t state_transition(trie_t *trie, char *str, size_t index, size_t len, transliteration_state_t prev_state) {
|
||||||
|
|
||||||
|
log_debug("str = %s, index = %zu, char_len=%zu\n", str, index, len);
|
||||||
|
|
||||||
|
log_debug("prev_state.result.node_id=%d, prev_state.in_set=%d\n", prev_state.result.node_id, prev_state.in_set);
|
||||||
|
|
||||||
|
char_set_result_t char_result = next_prefix_or_set(trie, str + index, len, prev_state.result, prev_state.in_set, false);
|
||||||
|
|
||||||
|
log_debug("char_result.type = %d\n", char_result.type);
|
||||||
|
|
||||||
|
return state_from_char_result(char_result, index, len, prev_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -735,13 +742,14 @@ char *transliterate(char *trans_name, char *str, size_t len) {
|
|||||||
start_state.result = step_result;
|
start_state.result = step_result;
|
||||||
|
|
||||||
transliteration_state_t prev_state = start_state;
|
transliteration_state_t prev_state = start_state;
|
||||||
|
transliteration_state_t prev2_state = start_state;
|
||||||
|
|
||||||
transliteration_state_t repeat_state_end;
|
transliteration_state_t repeat_state_end;
|
||||||
|
|
||||||
bool in_repeat = false;
|
bool in_repeat = false;
|
||||||
|
|
||||||
int32_t ch = 0;
|
int32_t ch = 0;
|
||||||
ssize_t char_len;
|
ssize_t char_len = 0;
|
||||||
uint8_t *ptr = (uint8_t *)str;
|
uint8_t *ptr = (uint8_t *)str;
|
||||||
uint64_t idx = 0;
|
uint64_t idx = 0;
|
||||||
|
|
||||||
@@ -783,8 +791,12 @@ char *transliterate(char *trans_name, char *str, size_t len) {
|
|||||||
bool context_no_match = false;
|
bool context_no_match = false;
|
||||||
bool empty_context_match = false;
|
bool empty_context_match = false;
|
||||||
|
|
||||||
|
bool is_last_char = idx + char_len == len;
|
||||||
|
|
||||||
transliteration_state_t match_candidate_state = state.state == TRANS_STATE_PARTIAL_MATCH ? state : prev_state;
|
transliteration_state_t match_candidate_state = state.state == TRANS_STATE_PARTIAL_MATCH ? state : prev_state;
|
||||||
|
if (state.state == TRANS_STATE_PARTIAL_MATCH) {
|
||||||
|
log_debug("state.state == TRANS_STATE_PARTIAL_MATCH\n");
|
||||||
|
}
|
||||||
|
|
||||||
context_result = context_match(trie, str, match_candidate_state);
|
context_result = context_match(trie, str, match_candidate_state);
|
||||||
|
|
||||||
@@ -803,10 +815,28 @@ char *transliterate(char *trans_name, char *str, size_t len) {
|
|||||||
} else {
|
} else {
|
||||||
log_debug("Checking for no-context match\n");
|
log_debug("Checking for no-context match\n");
|
||||||
set_match_if_any(trie, match_candidate_state, &match_state);
|
set_match_if_any(trie, match_candidate_state, &match_state);
|
||||||
|
if (match_state.state != TRANS_STATE_MATCH && !match_candidate_state.in_set) {
|
||||||
|
log_debug("Trying set for match candidate\n");
|
||||||
|
|
||||||
|
transliteration_state_t match_prev_state = !is_last_char ? prev2_state : prev_state;
|
||||||
|
|
||||||
|
char_set_result_t char_result = next_prefix_or_set(trie, str + idx - match_candidate_state.char_len, match_candidate_state.char_len, match_prev_state.result, false, true);
|
||||||
|
match_candidate_state = state_from_char_result(char_result, idx - match_candidate_state.char_len, match_candidate_state.char_len, match_prev_state);
|
||||||
|
if (match_candidate_state.state == TRANS_STATE_PARTIAL_MATCH) {
|
||||||
|
log_debug("Got partial match for set check\n");
|
||||||
|
set_match_if_any(trie, match_candidate_state, &match_state);
|
||||||
|
if (match_state.state != TRANS_STATE_MATCH) {
|
||||||
|
prev_state = match_candidate_state;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (match_state.state == TRANS_STATE_MATCH) {
|
if (match_state.state == TRANS_STATE_MATCH) {
|
||||||
log_debug("Match no context\n");
|
log_debug("Match no context\n");
|
||||||
replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
|
replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
log_debug("Tried context for %s at char '%.*s', no match\n", str, (int)char_len, ptr);
|
log_debug("Tried context for %s at char '%.*s', no match\n", str, (int)char_len, ptr);
|
||||||
context_no_match = true;
|
context_no_match = true;
|
||||||
}
|
}
|
||||||
@@ -814,8 +844,6 @@ char *transliterate(char *trans_name, char *str, size_t len) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if (replacement != NULL) {
|
if (replacement != NULL) {
|
||||||
char *replacement_string = cstring_array_get_string(trans_table->replacement_strings, replacement->string_index);
|
char *replacement_string = cstring_array_get_string(trans_table->replacement_strings, replacement->string_index);
|
||||||
char *revisit_string = NULL;
|
char *revisit_string = NULL;
|
||||||
@@ -917,6 +945,7 @@ char *transliterate(char *trans_name, char *str, size_t len) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (state.advance_state) {
|
if (state.advance_state) {
|
||||||
|
prev2_state = prev_state;
|
||||||
prev_state = state;
|
prev_state = state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user