[transliteration] Fixing edge case in transliteration where a naked character fails context matching but the set-wrapped version matches

This commit is contained in:
Al
2015-06-28 15:19:19 -04:00
parent a5dacf3d2b
commit f0bf7e750c

View File

@@ -62,6 +62,7 @@ typedef struct {
transliteration_state_type_t state;
ssize_t phrase_start;
size_t phrase_len;
size_t char_len;
uint8_t advance_index:1;
uint8_t advance_state:1;
uint8_t in_set:1;
@@ -71,7 +72,7 @@ typedef struct {
} transliteration_state_t;
#define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){NULL_PREFIX_RESULT, TRANS_STATE_BEGIN, 0, 0, 1, 1, 0, 0, 0, 0}
#define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){NULL_PREFIX_RESULT, TRANS_STATE_BEGIN, 0, 0, 0, 1, 1, 0, 0, 0, 0}
static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result, char *str, size_t start_index) {
@@ -111,13 +112,13 @@ typedef struct char_set_result {
#define NULL_CHAR_SET_RESULT (char_set_result_t){NULL_PREFIX_RESULT, NO_CHAR_RESULT};
static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len, trie_prefix_result_t last_result, bool in_set) {
static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len, trie_prefix_result_t last_result, bool in_set, bool check_set_only) {
trie_prefix_result_t result = trie_get_prefix_from_index(trie, str, len, last_result.node_id, last_result.tail_pos);
bool has_empty_transition = false;
if (result.node_id != NULL_NODE_ID) {
if (!check_set_only && result.node_id != NULL_NODE_ID) {
last_result = result;
result = trie_get_prefix_from_index(trie, REPEAT_CHAR, REPEAT_CHAR_LEN, last_result.node_id, last_result.tail_pos);
if (result.node_id == NULL_NODE_ID) {
@@ -159,7 +160,7 @@ static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len,
}
in_set = true;
last_result = result;
last_result = result;
}
if (in_set) {
@@ -188,26 +189,18 @@ static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len,
return (char_set_result_t){result, CHAR_SET_REPEAT};
}
}
return NULL_CHAR_SET_RESULT;
}
static transliteration_state_t state_transition(trie_t *trie, char *str, size_t index, size_t len, transliteration_state_t prev_state) {
static transliteration_state_t state_from_char_result(char_set_result_t char_result, size_t index, size_t len, transliteration_state_t prev_state) {
transliteration_state_t state = TRANSLITERATION_DEFAULT_STATE;
log_debug("str = %s, index = %zu, char_len=%zu\n", str, index, len);
log_debug("prev_state.result.node_id=%d, prev_state.in_set=%d\n", prev_state.result.node_id, prev_state.in_set);
char_set_result_t char_result = next_prefix_or_set(trie, str + index, len, prev_state.result, prev_state.in_set);
log_debug("char_result.type = %d\n", char_result.type);
trie_prefix_result_t result = char_result.result;
trie_prefix_result_t prev_result = prev_state.result;
state.result = result;
state.char_len = len;
state.in_set = (char_result.type == OPEN_CHAR_SET || (prev_state.in_set && char_result.type == SINGLE_CHAR_ONLY));
state.repeat = (char_result.type == SINGLE_CHAR_REPEAT || char_result.type == CHAR_SET_REPEAT);
state.empty_transition = (char_result.type == SINGLE_EMPTY_TRANSITION || char_result.type == CHAR_SET_EMPTY_TRANSITION);
@@ -221,6 +214,20 @@ static transliteration_state_t state_transition(trie_t *trie, char *str, size_t
}
return state;
}
static transliteration_state_t state_transition(trie_t *trie, char *str, size_t index, size_t len, transliteration_state_t prev_state) {
log_debug("str = %s, index = %zu, char_len=%zu\n", str, index, len);
log_debug("prev_state.result.node_id=%d, prev_state.in_set=%d\n", prev_state.result.node_id, prev_state.in_set);
char_set_result_t char_result = next_prefix_or_set(trie, str + index, len, prev_state.result, prev_state.in_set, false);
log_debug("char_result.type = %d\n", char_result.type);
return state_from_char_result(char_result, index, len, prev_state);
}
@@ -735,13 +742,14 @@ char *transliterate(char *trans_name, char *str, size_t len) {
start_state.result = step_result;
transliteration_state_t prev_state = start_state;
transliteration_state_t prev2_state = start_state;
transliteration_state_t repeat_state_end;
bool in_repeat = false;
int32_t ch = 0;
ssize_t char_len;
ssize_t char_len = 0;
uint8_t *ptr = (uint8_t *)str;
uint64_t idx = 0;
@@ -783,8 +791,12 @@ char *transliterate(char *trans_name, char *str, size_t len) {
bool context_no_match = false;
bool empty_context_match = false;
bool is_last_char = idx + char_len == len;
transliteration_state_t match_candidate_state = state.state == TRANS_STATE_PARTIAL_MATCH ? state : prev_state;
if (state.state == TRANS_STATE_PARTIAL_MATCH) {
log_debug("state.state == TRANS_STATE_PARTIAL_MATCH\n");
}
context_result = context_match(trie, str, match_candidate_state);
@@ -803,10 +815,28 @@ char *transliterate(char *trans_name, char *str, size_t len) {
} else {
log_debug("Checking for no-context match\n");
set_match_if_any(trie, match_candidate_state, &match_state);
if (match_state.state != TRANS_STATE_MATCH && !match_candidate_state.in_set) {
log_debug("Trying set for match candidate\n");
transliteration_state_t match_prev_state = !is_last_char ? prev2_state : prev_state;
char_set_result_t char_result = next_prefix_or_set(trie, str + idx - match_candidate_state.char_len, match_candidate_state.char_len, match_prev_state.result, false, true);
match_candidate_state = state_from_char_result(char_result, idx - match_candidate_state.char_len, match_candidate_state.char_len, match_prev_state);
if (match_candidate_state.state == TRANS_STATE_PARTIAL_MATCH) {
log_debug("Got partial match for set check\n");
set_match_if_any(trie, match_candidate_state, &match_state);
if (match_state.state != TRANS_STATE_MATCH) {
prev_state = match_candidate_state;
continue;
}
}
}
if (match_state.state == TRANS_STATE_MATCH) {
log_debug("Match no context\n");
replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
} else {
log_debug("Tried context for %s at char '%.*s', no match\n", str, (int)char_len, ptr);
context_no_match = true;
}
@@ -814,8 +844,6 @@ char *transliterate(char *trans_name, char *str, size_t len) {
}
if (replacement != NULL) {
char *replacement_string = cstring_array_get_string(trans_table->replacement_strings, replacement->string_index);
char *revisit_string = NULL;
@@ -917,6 +945,7 @@ char *transliterate(char *trans_name, char *str, size_t len) {
}
if (state.advance_state) {
prev2_state = prev_state;
prev_state = state;
}