diff --git a/src/libpostal.c b/src/libpostal.c index bfa55ab6..ba2dff5f 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -113,7 +113,7 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { size_t len = strlen(str); - log_debug("tokenized, num tokens=%d\n", tokens->n); + log_debug("tokenized, num tokens=%zu\n", tokens->n); phrase_language_array *phrases = NULL; phrase_array *lang_phrases = NULL; @@ -155,7 +155,7 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { string_tree_t *tree = string_tree_new_size(len); if (phrases != NULL) { - log_debug("phrases not NULL, n=%d\n", phrases->n); + log_debug("phrases not NULL, n=%zu\n", phrases->n); ks_introsort(phrase_language_array, phrases->n, phrases->a); phrase_language_t phrase_lang; @@ -181,7 +181,7 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { for (int j = start; j < end; j++) { token_t token = tokens->a[j]; if (token.type != WHITESPACE) { - log_debug("Adding previous token, %.*s\n", token.len, str + token.offset); + log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); string_tree_add_string_len(tree, str + token.offset, token.len); } else { @@ -250,7 +250,7 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { for (int j = phrase.start; j < phrase.start + phrase.len; j++) { token = tokens->a[j]; if (token.type != WHITESPACE) { - log_debug("Adding previous token, %.*s\n", token.len, str + token.offset); + log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); string_tree_add_string_len(tree, str + token.offset, token.len); } else { string_tree_add_string(tree, " "); @@ -271,7 +271,7 @@ string_tree_t *add_string_alternatives(char *str, normalize_options_t options) { for (int j = start; j < end; j++) { token_t token = tokens->a[j]; if (token.type != WHITESPACE) { - log_debug("Adding previous token, %.*s\n", token.len, str + token.offset); + log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); string_tree_add_string_len(tree, str + token.offset, token.len); } else { diff --git a/src/normalize.c b/src/normalize.c index 4b388dbf..6717aca1 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -7,7 +7,6 @@ char *normalize_string_utf8(char *str, uint64_t options) { int utf8proc_options = UTF8PROC_OPTIONS_BASE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC; uint8_t *utf8proc_normalized = NULL; - ssize_t normalized_len = 0; bool have_utf8proc_options = false; @@ -31,7 +30,7 @@ char *normalize_string_utf8(char *str, uint64_t options) { } if (have_utf8proc_options) { - ssize_t normalized_len = utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options); + utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options); return (char *)utf8proc_normalized; } @@ -113,7 +112,6 @@ string_tree_t *normalize_string(char *str, uint64_t options) { char *utf8_normalized = NULL; char *transliterated = NULL; - char *ascii = NULL; if (options & NORMALIZE_STRING_LOWERCASE && is_ascii) { utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE); diff --git a/src/numex.c b/src/numex.c index 6c6a7fa9..72f9dac4 100644 --- a/src/numex.c +++ b/src/numex.c @@ -406,7 +406,7 @@ bool numex_table_read(FILE *f) { goto exit_numex_table_load_error; } - log_debug("read num_languages = %d\n", num_languages); + log_debug("read num_languages = %zu\n", num_languages); int i = 0; @@ -745,7 +745,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { bool set_rule = false; state.state = NUMEX_SEARCH_STATE_MATCH; - log_debug("phrase.len=%lld, phrase.data=%d\n", phrase.len, phrase.data); + log_debug("phrase.len=%u, phrase.data=%d\n", phrase.len, phrase.data); rule = get_numex_rule((size_t)phrase.data); log_debug("rule.value=%lld\n", rule.value); @@ -771,7 +771,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { } result.len = idx + phrase.start + phrase.len - result.start; - log_debug("idx=%d, phrase.len=%d\n", idx, phrase.len); + log_debug("idx=%zu, phrase.len=%d\n", idx, phrase.len); log_debug("prev_rule.radix=%d\n", prev_rule.radix); diff --git a/src/transliterate.c b/src/transliterate.c index bf518e37..9c138607 100644 --- a/src/transliterate.c +++ b/src/transliterate.c @@ -744,7 +744,7 @@ char *transliterate(char *trans_name, char *str, size_t len) { int32_t ch = 0; ssize_t char_len = 0; uint8_t *ptr = (uint8_t *)str; - uint64_t idx = 0; + size_t idx = 0; char *original_str = str; char_array *revisit = NULL; @@ -770,7 +770,7 @@ char *transliterate(char *trans_name, char *str, size_t len) { if (ch == 0) break; - log_debug("Got char '%.*s' at idx=%llu\n", (int)char_len, str + idx, idx); + log_debug("Got char '%.*s' at idx=%zu\n", (int)char_len, str + idx, idx); state = state_transition(trie, str, idx, char_len, prev_state); set_match_if_any(trie, state, &match_state); @@ -783,7 +783,6 @@ char *transliterate(char *trans_name, char *str, size_t len) { log_debug("end of partial or last char, prev start=%zd, prev len=%zu\n", prev_state.phrase_start, prev_state.phrase_len); bool context_no_match = false; - bool empty_context_match = false; bool is_last_char = idx + char_len == len; @@ -1293,7 +1292,7 @@ bool transliterator_write(transliterator_t *trans, FILE *f) { return false; } - if (!file_write_uint32(f, trans->steps_length)) { + if (!file_write_uint32(f, (uint32_t)trans->steps_length)) { return false; } @@ -1913,7 +1912,7 @@ bool transliteration_module_setup(char *filename) { return transliteration_table_load(filename == NULL ? DEFAULT_TRANSLITERATION_PATH : filename); } - return false; + return true; } diff --git a/src/transliterate.h b/src/transliterate.h index 1ea2febf..885f9989 100644 --- a/src/transliterate.h +++ b/src/transliterate.h @@ -70,7 +70,7 @@ VECTOR_INIT_FREE_DATA(transliteration_replacement_array, transliteration_replace KHASH_MAP_INIT_STR(str_transliterator, transliterator_t *) -#define kh_script_lang_hash(key) ((uint64_t)(key).script ^ (((key).language == NULL) ? 0 : kh_str_hash_func((key).language))) +#define kh_script_lang_hash(key) ((khint_t)(key).script ^ (((key).language == NULL) ? 0 : kh_str_hash_func((key).language))) #define kh_script_lang_equal(a, b) (((a).script == (b).script) && strcmp((a).language, (b).language) == 0) typedef struct transliterator_index { @@ -153,14 +153,14 @@ char *transliterate(char *trans_name, char *str, size_t len); bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index); transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language); -#define foreach_transliterator(script, language, transliterator_var, code) do { \ - transliteration_table_t *__trans_table = get_transliteration_table(); \ - transliterator_index_t __index = get_transliterator_index_for_script_language(script, language); \ - for (int __i = __index.transliterator_index; __i < __index.transliterator_index + __index.num_transliterators; __i++) { \ - transliterator_var = cstring_array_get_string(__trans_table->transliterator_names, __i); \ - if (transliterator_var == NULL) break; \ - code; \ - } \ +#define foreach_transliterator(script, language, transliterator_var, code) do { \ + transliteration_table_t *__trans_table = get_transliteration_table(); \ + transliterator_index_t __index = get_transliterator_index_for_script_language(script, language); \ + for (size_t __i = __index.transliterator_index; __i < __index.transliterator_index + __index.num_transliterators; __i++) { \ + transliterator_var = cstring_array_get_string(__trans_table->transliterator_names, (uint32_t)__i); \ + if (transliterator_var == NULL) break; \ + code; \ + } \ } while (0); bool transliteration_table_write(FILE *file); diff --git a/src/trie.c b/src/trie.c index 5eb9345c..31157789 100644 --- a/src/trie.c +++ b/src/trie.c @@ -192,7 +192,7 @@ static bool trie_extend(trie_t *self, uint32_t to_index) { if (to_index < self->nodes->n) return true; - new_begin = self->nodes->n; + new_begin = (uint32_t)self->nodes->n; for (i = new_begin; i < to_index + 1; i++) { trie_node_array_push(self->nodes, (trie_node_t){-(i-1), -(i+1)}); @@ -346,7 +346,7 @@ static uint32_t trie_find_new_base(trie_t *self, unsigned char *transitions, int while (!trie_can_fit_transitions(self, index - first_char_index, transitions, num_transitions)) { trie_node_t node = trie_get_node(self, index); if (-node.check == FREE_LIST_ID) { - if (!trie_extend(self, self->nodes->n+self->alphabet_size)) { + if (!trie_extend(self, (uint32_t) self->nodes->n + self->alphabet_size)) { log_error("Trie index error extending to %d\n", index); return TRIE_INDEX_ERROR; } @@ -456,8 +456,8 @@ void trie_add_tail(trie_t *self, unsigned char *tail) { void trie_set_tail(trie_t *self, unsigned char *tail, int32_t tail_pos) { log_debug("Setting tail: %s at pos %d\n", tail, tail_pos); - int tail_len = strlen((char *)tail); - int num_appends = (tail_pos + tail_len) - self->tail->n; + size_t tail_len = strlen((char *)tail); + size_t num_appends = ((size_t)tail_pos + tail_len) - self->tail->n; int i = 0; // Pad with 0s if we're short @@ -536,9 +536,9 @@ int32_t trie_separate_tail(trie_t *self, uint32_t from_index, unsigned char *tai if (*tail != '\0') tail++; log_debug("Separating node at index %d into char %c with tail %s\n", from_index, c, tail); - trie_set_base(self, index, -1 * self->data->n); + trie_set_base(self, index, -1 * (int32_t)self->data->n); - trie_data_array_push(self->data, (trie_data_node_t){self->tail->n, data}); + trie_data_array_push(self->data, (trie_data_node_t){(uint32_t)self->tail->n, data}); trie_add_tail(self, tail); return index; @@ -557,9 +557,9 @@ void trie_tail_merge(trie_t *self, uint32_t old_node_id, unsigned char *suffix, unsigned char *old_tail = original_tail; log_debug("Merging existing tail %s with new tail %s, node_id=%d\n", original_tail, suffix, old_node_id); - int common_prefix = string_common_prefix((char *)old_tail, (char *)suffix); - int old_tail_len = strlen((char *)old_tail); - int suffix_len = strlen((char *)suffix); + size_t common_prefix = string_common_prefix((char *)old_tail, (char *)suffix); + size_t old_tail_len = strlen((char *)old_tail); + size_t suffix_len = strlen((char *)suffix); if (common_prefix == old_tail_len && old_tail_len == suffix_len) { log_debug("Key already exists, setting value to %d\n", data); self->data->a[old_data_index] = (trie_data_node_t) {old_tail_pos, data}; @@ -567,7 +567,7 @@ void trie_tail_merge(trie_t *self, uint32_t old_node_id, unsigned char *suffix, } uint32_t node_id = old_node_id; - log_debug("common_prefix=%d\n", common_prefix); + log_debug("common_prefix=%zu\n", common_prefix); for (int i=0; i < common_prefix; i++) { c = old_tail[i]; @@ -817,11 +817,11 @@ inline bool trie_set_data_at_index(trie_t *self, uint32_t index, uint32_t data) } inline bool trie_set_data(trie_t *self, char *key, uint32_t data) { - if (index == NULL_NODE_ID) { + uint32_t node_id = trie_get(self, key); + if (node_id == NULL_NODE_ID) { return trie_add(self, key, data); } - uint32_t node_id = trie_get(self, key); return trie_set_data_at_index(self, node_id, data); } diff --git a/src/trie_search.c b/src/trie_search.c index ae5a85fa..d54e7255 100644 --- a/src/trie_search.c +++ b/src/trie_search.c @@ -22,8 +22,9 @@ bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, ph uint32_t next_id; bool match = false; - uint64_t index = 0; - int phrase_len = 0, phrase_start = 0; + uint32_t index = 0; + uint32_t phrase_len = 0; + uint32_t phrase_start = 0; uint32_t data; trie_search_state_t state = SEARCH_STATE_BEGIN, last_state = SEARCH_STATE_BEGIN; @@ -91,7 +92,7 @@ bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, ph log_debug("node.check == node_id\n"); state = SEARCH_STATE_PARTIAL_MATCH; if (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN) { - log_debug("phrase_start=%llu\n", index); + log_debug("phrase_start=%u\n", index); phrase_start = index; fail_ptr = ptr + remaining; } @@ -113,7 +114,7 @@ bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, ph log_debug("Tail matches\n"); last_state = state; data = data_node.data; - log_debug("%llu, %d, %zu\n", index, phrase_len, tail_len); + log_debug("%u, %d, %zu\n", index, phrase_len, tail_len); ptr += tail_len; index += tail_len; advance_index = false; @@ -140,7 +141,7 @@ bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, ph log_debug("Transition to NUL byte matched\n"); state = SEARCH_STATE_MATCH; match = true; - phrase_len = index + len - phrase_start; + phrase_len = index + (uint32_t)len - phrase_start; if (terminal_node.base < 0) { int32_t data_index = -1*terminal_node.base; trie_data_node_t data_node = self->data->a[data_index]; @@ -168,7 +169,7 @@ bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, ph if (advance_index) index += len; advance_index = true; - log_debug("index now %llu\n", index); + log_debug("index now %u\n", index); } // while return true; @@ -186,7 +187,7 @@ inline phrase_array *trie_search(trie_t *self, char *text) { return phrases; } -int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, token_array *tokens, int tail_index, int token_index) { +int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, token_array *tokens, size_t tail_index, int token_index) { int32_t data_index = -1*node.base; trie_data_node_t old_data_node = self->data->a[data_index]; uint32_t current_tail_pos = old_data_node.tail; @@ -203,7 +204,7 @@ int trie_node_search_tail_tokens(trie_t *self, trie_node_t node, char *str, toke token_t token = tokens->a[i]; char *ptr = str + token.offset; - int token_length = token.len; + size_t token_length = token.len; if (!(*tail_ptr)) { log_debug("tail matches!\n"); @@ -242,7 +243,7 @@ bool trie_search_tokens_from_index(trie_t *self, char *str, token_array *tokens, trie_search_state_t state = SEARCH_STATE_BEGIN, last_state = SEARCH_STATE_BEGIN; token_t token; - size_t token_length, token_consumed; + size_t token_length; log_debug("num_tokens: %zu\n", tokens->n); for (int i = 0; i < tokens->n; i++, last_state = state) { @@ -460,7 +461,6 @@ phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, u const uint8_t *ptr = (const uint8_t *)word; const uint8_t *char_ptr; - bool done = false; bool in_tail = false; unsigned char *current_tail = (unsigned char *)""; size_t tail_remaining = 0; @@ -484,13 +484,13 @@ phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, u current_tail++; if (i == char_len - 1) { phrase_len += char_len; - phrase_start = index; + phrase_start = (uint32_t)index; } continue; } else if (in_tail && tail_remaining == 0 && i == char_len - 1) { log_debug("tail match!\n"); - phrase_start = index + char_len; - phrase_len = len - index - char_len; + phrase_start = (uint32_t)(index + char_len); + phrase_len = (uint32_t)(len - index - char_len); value = tail_value; index = 0; break; @@ -523,8 +523,8 @@ phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, u in_tail = true; if (tail_remaining == 0) { - phrase_start = index; - phrase_len = len - index; + phrase_start = (uint32_t)index; + phrase_len = (uint32_t)(len - index); value = tail_value; index = 0; break; @@ -581,7 +581,7 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u ssize_t char_len = 0; - size_t idx = 0; + uint32_t idx = 0; size_t separator_char_len = 0; @@ -593,8 +593,6 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u trie_node_t terminal_node; for (; idx < len; last_node = node, last_node_id = node_id) { - unsigned char ch = *ptr; - log_debug("Getting transition index for %d, (%d, %d)\n", last_node_id, last_node.base, last_node.check); node_id = trie_get_transition_index(self, last_node, *ptr); node = trie_get_node(self, node_id); @@ -648,7 +646,7 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u if (match_len >= current_tail_len) { if (first_char) phrase_start = idx; - phrase_len = (idx + match_len + 1) - phrase_start; + phrase_len = (uint32_t)(idx + match_len + 1) - phrase_start; log_debug("tail match! phrase_len=%u\n", phrase_len); value = data_node.data;