/*
 * File: libpostal/test/test_string_utils.c
 * Listing metadata: 343 lines, 8.7 KiB, C
 */

#include <stdio.h>
#include "greatest.h"
#include "../src/features.h"
#include "../src/scanner.h"
#include "../src/string_utils.h"
/* Forward declaration of the suite; its body (the RUN_TEST list) is at the bottom of this file. */
SUITE(libpostal_string_utils_tests);
/*
 * utf8_reversed_string: the expected output keeps the multi-byte characters
 * "ü" and "ß" intact, i.e. the string is reversed per code point rather than
 * per byte.
 */
TEST test_utf8_reverse(void) {
    char *input = "Bünderstraße";
    char *reversed = utf8_reversed_string(input);

    if (!reversed) {
        FAIL();
    }

    ASSERT_STR_EQ(reversed, "eßartsrednüB");

    /* The result is released by the test itself. */
    free(reversed);
    PASS();
}
/*
 * Walks a three-code-point UTF-8 string backwards with
 * utf8proc_iterate_reversed and checks the byte length and code point
 * reported at every step, including the step taken at offset 0.
 */
TEST test_utf8proc_iterate_reversed(void) {
    /* Expected results, last code point first. */
    static const struct {
        ssize_t len;
        int32_t codepoint;
    } steps[] = {
        {2, 837},  /* "\xcd\x85" = U+0345 */
        {2, 787},  /* "\xcc\x93" = U+0313 */
        {2, 937},  /* "\xce\xa9" = U+03A9 */
        {0, -1},   /* nothing left before offset 0 */
    };
    const size_t num_steps = sizeof(steps) / sizeof(steps[0]);

    char *s = "\xce\xa9\xcc\x93\xcd\x85";
    size_t offset = strlen(s);
    int32_t ch;

    for (size_t i = 0; i < num_steps; i++) {
        ssize_t consumed = utf8proc_iterate_reversed((uint8_t *)s, offset, &ch);

        ASSERT_EQ(consumed, steps[i].len);
        ASSERT_EQ(ch, steps[i].codepoint);

        /* Move the offset to the start of the code point just read. */
        offset -= consumed;
    }

    PASS();
}
/*
 * The two spellings differ only by a hyphen, so the separator-insensitive
 * common prefix is expected to be 14. That equals the UTF-8 byte length of
 * "Bünderstraße" (ü and ß occupy two bytes each).
 */
TEST test_utf8_compare_ignore_separators(void) {
    char *plain = "Bünderstraße";
    char *hyphenated = "Bünder-straße";

    size_t common = utf8_common_prefix_ignore_separators(plain, hyphenated);

    ASSERT_EQ(common, 14);
    PASS();
}
/*
 * Separator-insensitive equality: strings that differ only in hyphens,
 * spaces and leading/trailing whitespace must compare equal, while strings
 * with different trailing content must not.
 *
 * This test previously called utf8_common_prefix_ignore_separators and
 * stored its size_t result in a bool. Any non-empty shared prefix then read
 * as "equal", so the final negative case could not hold.
 *
 * NOTE(review): assumes utf8_equal_ignore_separators(const char *, const char *)
 * returning bool is declared in ../src/string_utils.h -- confirm.
 */
TEST test_utf8_equal_ignore_separators(void) {
    /* Trailing whitespace and an inner hyphen are ignored. */
    char *str1 = "Bünderstraße ";
    char *str2 = "Bünder-straße";

    bool equal = utf8_equal_ignore_separators(str1, str2);
    ASSERT(equal);

    /* Leading/trailing spaces, and hyphen vs. space, are ignored. */
    str1 = " Bünder-straße ";
    str2 = "Bünder straße";

    equal = utf8_equal_ignore_separators(str1, str2);
    ASSERT(equal);

    /* Same prefix but different final token ("a" vs "aa"): not equal. */
    str1 = "Bünder-straße-a";
    str2 = "Bünder straße aa";

    equal = utf8_equal_ignore_separators(str1, str2);
    ASSERT_FALSE(equal);

    PASS();
}
/*
 * feature_array_add: a multi-part feature is expected to be stored as its
 * parts joined with '|', a single-part feature verbatim. The same strings
 * must come back from cstring_array_to_strings.
 */
TEST test_feature_array_add(void) {
    static const char *const expected[] = {"a|foo|blee", "b"};
    const uint32_t num_expected = sizeof(expected) / sizeof(expected[0]);

    cstring_array *features = cstring_array_new();
    if (features == NULL) {
        FAIL();
    }

    feature_array_add(features, 3, "a", "foo", "blee");
    feature_array_add(features, 1, "b");

    ASSERT_EQ(cstring_array_num_strings(features), 2);

    /* Each stored token must match and report a length equal to strlen. */
    for (uint32_t i = 0; i < num_expected; i++) {
        char *feature = cstring_array_get_string(features, i);
        size_t len = cstring_array_token_length(features, i);

        if (feature == NULL) {
            cstring_array_destroy(features);
            FAIL();
        }

        ASSERT_STR_EQ(feature, expected[i]);
        ASSERT_EQ(len, strlen(feature));
    }

    /*
     * NOTE(review): features is not destroyed after this call, so
     * cstring_array_to_strings presumably consumes the array -- confirm.
     */
    char **strings = cstring_array_to_strings(features);
    if (strings == NULL) {
        FAIL();
    }

    /* Each returned string, and then the pointer array, is freed here. */
    for (uint32_t i = 0; i < num_expected; i++) {
        ASSERT_STR_EQ(strings[i], expected[i]);
        free(strings[i]);
    }
    free(strings);

    PASS();
}
/*
 * Exercises the char_array builder: plain and reversed concatenation,
 * printf-style appending, conversion to a C string, trimming, and joining
 * with a multi-character separator.
 */
TEST test_char_array(void) {
    char_array *str = char_array_new();
    if (str == NULL) {
        FAIL();
    }

    char_array_cat(str, "Bürgermeister");
    char_array_cat(str, "|");
    char_array_cat_reversed(str, "straße");
    ASSERT_STR_EQ(str->a, "Bürgermeister|eßarts");

    char_array_cat_printf(str, " %d %s %.2f \t ", 1234, "onetwothreefour", 12.34);
    char *expected_output = "Bürgermeister|eßarts 1234 onetwothreefour 12.34 \t ";
    ASSERT_STR_EQ(str->a, expected_output);

    /*
     * NOTE(review): str is not destroyed after this call and `a` is freed
     * directly below, so char_array_to_string presumably hands over the
     * buffer and releases the wrapper -- confirm.
     */
    char *a = char_array_to_string(str);
    ASSERT_STR_EQ(a, expected_output);

    /* string_trim's result is freed separately from its input. */
    char *b = string_trim(a);
    ASSERT_STR_EQ(b, "Bürgermeister|eßarts 1234 onetwothreefour 12.34");

    free(a);
    free(b);

    /* Second array: check the allocation here too, as for the first one. */
    str = char_array_new();
    if (str == NULL) {
        FAIL();
    }

#define SEPARATOR "|*|*|*|"

    /*
     * The first part already ends with the separator, yet the expected
     * result has only one separator between "dictionaries" and "foo".
     * NOTE(review): presumably the `true` argument enables stripping of
     * duplicate separators -- confirm against char_array_add_joined.
     */
    char_array_add_joined(str, SEPARATOR, true, 3, "dictionaries" SEPARATOR, "foo", "bar");

    a = char_array_get_string(str);
    ASSERT_STR_EQ(a, "dictionaries|*|*|*|foo|*|*|*|bar");

    char_array_destroy(str);

    PASS();
}
/*
 * Splits "The|Low|End|Theory" on '|' with cstring_array_split_no_copy and
 * checks the reported count and each resulting token in order.
 */
TEST test_cstring_array(void) {
    static const char *const expected[] = {"The", "Low", "End", "Theory"};
    const uint32_t num_expected = sizeof(expected) / sizeof(expected[0]);

    size_t count = 0;

    /*
     * NOTE(review): the strdup'd buffer is never freed by this test, so the
     * "_no_copy" variant presumably takes ownership of it -- confirm.
     */
    cstring_array *array = cstring_array_split_no_copy(strdup("The|Low|End|Theory"), '|', &count);
    if (array == NULL) {
        FAIL();
    }

    ASSERT_EQ(count, 4);

    for (uint32_t i = 0; i < num_expected; i++) {
        char *token = cstring_array_get_string(array, i);
        if (token == NULL) {
            FAIL();
        }
        ASSERT_STR_EQ(token, expected[i]);
    }

    cstring_array_destroy(array);

    PASS();
}
/*
 * Builds a string tree with four tokens (an empty one, then tokens with
 * 2, 3 and 2 alternatives) and checks that the iterator visits all
 * 1 * 2 * 3 * 2 = 12 paths with the last token varying fastest.
 */
TEST test_string_tree(void) {
    /* Expected iterator->path contents, in visiting order. */
    static const uint32_t expected_paths[][4] = {
        {0, 0, 0, 0},
        {0, 0, 0, 1},
        {0, 0, 1, 0},
        {0, 0, 1, 1},
        {0, 0, 2, 0},
        {0, 0, 2, 1},
        {0, 1, 0, 0},
        {0, 1, 0, 1},
        {0, 1, 1, 0},
        {0, 1, 1, 1},
        {0, 1, 2, 0},
        {0, 1, 2, 1},
    };
    const size_t num_paths = sizeof(expected_paths) / sizeof(expected_paths[0]);

    string_tree_t *tree = string_tree_new();
    if (tree == NULL) {
        FAIL();
    }

    /* Token 0: finalized without adding any string. */
    string_tree_finalize_token(tree);

    /* Token 1: two alternatives. */
    string_tree_add_string(tree, "Twenty-fifth");
    string_tree_add_string(tree, "Twentyfifth");
    string_tree_finalize_token(tree);

    /* Token 2: three alternatives. */
    string_tree_add_string(tree, "Bürgermeister");
    string_tree_add_string(tree, "Buergermeister");
    string_tree_add_string(tree, "Burgermeister");
    string_tree_finalize_token(tree);

    /* Token 3: two alternatives. */
    string_tree_add_string(tree, "Straße");
    string_tree_add_string(tree, "Strasse");
    string_tree_finalize_token(tree);

    ASSERT_EQ(tree->token_indices->n - 1, 4);

    ASSERT_EQ(string_tree_num_alternatives(tree, 0), 1);
    ASSERT_EQ(string_tree_num_alternatives(tree, 1), 2);
    ASSERT_EQ(string_tree_num_alternatives(tree, 2), 3);
    ASSERT_EQ(string_tree_num_alternatives(tree, 3), 2);

    string_tree_iterator_t *iter = string_tree_iterator_new(tree);
    if (iter == NULL) {
        string_tree_destroy(tree);
        FAIL();
    }

    size_t expected_num_tokens = 4;
    ASSERT_EQ(iter->num_tokens, expected_num_tokens);
    ASSERT_EQ(iter->remaining, 12);

    for (size_t step = 0; step < num_paths; step++) {
        /* The first path is inspected before any advance. */
        if (step > 0) {
            string_tree_iterator_next(iter);
        }

        ASSERT_FALSE(string_tree_iterator_done(iter));

        for (size_t tok = 0; tok < 4; tok++) {
            ASSERT_EQ(iter->path[tok], expected_paths[step][tok]);
        }
    }

    string_tree_iterator_destroy(iter);
    string_tree_destroy(tree);

    PASS();
}
/*
 * Suite body: runs the string-utility tests defined in this file, in order.
 *
 * NOTE(review): test_utf8_equal_ignore_separators is defined in this file
 * but is not registered here, so it never runs -- confirm whether that is
 * intentional.
 */
SUITE(libpostal_string_utils_tests) {
RUN_TEST(test_utf8_reverse);
RUN_TEST(test_utf8proc_iterate_reversed);
RUN_TEST(test_utf8_compare_ignore_separators);
RUN_TEST(test_feature_array_add);
RUN_TEST(test_char_array);
RUN_TEST(test_cstring_array);
RUN_TEST(test_string_tree);
}