From dd0bead63a698486e91640a8a0c109462b6342aa Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 10 Mar 2017 01:15:23 -0500
Subject: [PATCH] [test/utils] also a good thing to sanity check (in C especially): string handling code

---
 test/test.c              |   2 +
 test/test_string_utils.c | 355 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 357 insertions(+)
 create mode 100644 test/test_string_utils.c

diff --git a/test/test.c b/test/test.c
index a42f6536..e7593866 100644
--- a/test/test.c
+++ b/test/test.c
@@ -4,6 +4,7 @@ SUITE_EXTERN(libpostal_expansion_tests);
 SUITE_EXTERN(libpostal_parser_tests);
 SUITE_EXTERN(libpostal_transliteration_tests);
 SUITE_EXTERN(libpostal_numex_tests);
+SUITE_EXTERN(libpostal_string_utils_tests);
 SUITE_EXTERN(libpostal_trie_tests);
 SUITE_EXTERN(libpostal_crf_context_tests);
 
@@ -17,6 +18,7 @@ int main(int argc, char **argv) {
     RUN_SUITE(libpostal_parser_tests);
     RUN_SUITE(libpostal_transliteration_tests);
     RUN_SUITE(libpostal_numex_tests);
+    RUN_SUITE(libpostal_string_utils_tests);
     RUN_SUITE(libpostal_trie_tests);
     RUN_SUITE(libpostal_crf_context_tests);
     GREATEST_MAIN_END();
diff --git a/test/test_string_utils.c b/test/test_string_utils.c
new file mode 100644
index 00000000..7ded5a4e
--- /dev/null
+++ b/test/test_string_utils.c
@@ -0,0 +1,355 @@
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "greatest.h"
+
+#include "../src/features.h"
+#include "../src/scanner.h"
+#include "../src/string_utils.h"
+
+SUITE(libpostal_string_utils_tests);
+
+/* Reversing a UTF-8 string must reverse code points, not bytes. */
+TEST test_utf8_reverse(void) {
+    char *s = "Bünderstraße";
+    char *rev = utf8_reversed_string(s);
+    if (rev == NULL) {
+        FAIL();
+    }
+
+    ASSERT_STR_EQ(rev, "eßartsrednüB");
+    free(rev);
+
+    PASS();
+}
+
+/* Walks a string backwards one code point at a time until nothing is left. */
+TEST test_utf8proc_iterate_reversed(void) {
+    /* U+03A9, U+0313, U+0345: three 2-byte UTF-8 sequences. */
+    char *s = "\xce\xa9\xcc\x93\xcd\x85";
+
+    int32_t ch;
+    ssize_t char_len;
+    size_t idx = strlen(s);
+    char_len = utf8proc_iterate_reversed((uint8_t *)s, idx, &ch);
+    ASSERT_EQ(char_len, 2);
+    ASSERT_EQ(ch, 837);
+    idx -= char_len;
+
+    char_len = utf8proc_iterate_reversed((uint8_t *)s, idx, &ch);
+    ASSERT_EQ(char_len, 2);
+    ASSERT_EQ(ch, 787);
+    idx -= char_len;
+
+    char_len = utf8proc_iterate_reversed((uint8_t *)s, idx, &ch);
+    ASSERT_EQ(char_len, 2);
+    ASSERT_EQ(ch, 937);
+    idx -= char_len;
+
+    /* idx is now 0: the test expects a zero length and ch == -1. */
+    char_len = utf8proc_iterate_reversed((uint8_t *)s, idx, &ch);
+    ASSERT_EQ(char_len, 0);
+    ASSERT_EQ(ch, -1);
+
+    PASS();
+}
+
+/* A hyphen in one string must not end the common prefix. */
+TEST test_utf8_compare_ignore_separators(void) {
+    char *str1 = "Bünderstraße";
+    char *str2 = "Bünder-straße";
+
+    size_t prefix = utf8_common_prefix_ignore_separators(str1, str2);
+
+    /* 14 is the UTF-8 byte length of str1 (ü and ß take two bytes each). */
+    ASSERT_EQ(prefix, 14);
+
+    PASS();
+}
+
+/* feature_array_add is expected to join its arguments with '|' into one entry. */
+TEST test_feature_array_add(void) {
+    cstring_array *features = cstring_array_new();
+    if (features == NULL) {
+        FAIL();
+    }
+    feature_array_add(features, 3, "a", "foo", "blee");
+    feature_array_add(features, 1, "b");
+
+    ASSERT_EQ(cstring_array_num_strings(features), 2);
+
+    char *feature = cstring_array_get_string(features, 0);
+    size_t len = cstring_array_token_length(features, 0);
+
+    if (feature == NULL) {
+        cstring_array_destroy(features);
+        FAIL();
+    }
+
+    ASSERT_STR_EQ(feature, "a|foo|blee");
+    ASSERT_EQ(len, strlen(feature));
+
+    feature = cstring_array_get_string(features, 1);
+    len = cstring_array_token_length(features, 1);
+
+    if (feature == NULL) {
+        cstring_array_destroy(features);
+        FAIL();
+    }
+
+    ASSERT_STR_EQ(feature, "b");
+    ASSERT_EQ(len, strlen(feature));
+
+    /* NOTE(review): features is not destroyed after this call; presumably
+     * cstring_array_to_strings takes ownership of it -- confirm. */
+    char **strings = cstring_array_to_strings(features);
+    if (strings == NULL) {
+        FAIL();
+    }
+
+    ASSERT_STR_EQ(strings[0], "a|foo|blee");
+    free(strings[0]);
+    ASSERT_STR_EQ(strings[1], "b");
+    free(strings[1]);
+
+    free(strings);
+
+    PASS();
+}
+
+/* Exercises char_array concatenation, printf-style appends, trimming and joins. */
+TEST test_char_array(void) {
+    char_array *str = char_array_new();
+    if (str == NULL) {
+        FAIL();
+    }
+    char_array_cat(str, "Bürgermeister");
+    char_array_cat(str, "|");
+    char_array_cat_reversed(str, "straße");
+
+    ASSERT_STR_EQ(str->a, "Bürgermeister|eßarts");
+
+    char_array_cat_printf(str, " %d %s %.2f \t ", 1234, "onetwothreefour", 12.34);
+
+    char *expected_output = "Bürgermeister|eßarts 1234 onetwothreefour 12.34 \t ";
+    ASSERT_STR_EQ(str->a, expected_output);
+
+    /* NOTE(review): str is reassigned below without char_array_destroy;
+     * presumably char_array_to_string releases it -- confirm. */
+    char *a = char_array_to_string(str);
+    ASSERT_STR_EQ(a, expected_output);
+
+    char *b = string_trim(a);
+    ASSERT_STR_EQ(b, "Bürgermeister|eßarts 1234 onetwothreefour 12.34");
+
+    free(a);
+    free(b);
+
+    str = char_array_new();
+    if (str == NULL) {
+        FAIL();
+    }
+    #define SEPARATOR "|*|*|*|"
+
+    /* The first argument already ends in SEPARATOR; the expected string
+     * below shows it must not be doubled. */
+    char_array_add_joined(str, SEPARATOR, true, 3, "dictionaries" SEPARATOR, "foo", "bar");
+
+    a = char_array_get_string(str);
+
+    ASSERT_STR_EQ(a, "dictionaries|*|*|*|foo|*|*|*|bar");
+
+    #undef SEPARATOR
+    char_array_destroy(str);
+
+    PASS();
+}
+
+/* Splitting on '|' must yield the four fields in order. */
+TEST test_cstring_array(void) {
+    size_t count = 0;
+    cstring_array *array = cstring_array_split_no_copy(strdup("The|Low|End|Theory"), '|', &count);
+    if (array == NULL) {
+        FAIL();
+    }
+    ASSERT_EQ(count, 4);
+
+    char *str = NULL;
+
+    str = cstring_array_get_string(array, 0);
+    if (str == NULL) {
+        cstring_array_destroy(array);
+        FAIL();
+    }
+    ASSERT_STR_EQ(str, "The");
+
+    str = cstring_array_get_string(array, 1);
+    if (str == NULL) {
+        cstring_array_destroy(array);
+        FAIL();
+    }
+    ASSERT_STR_EQ(str, "Low");
+
+    str = cstring_array_get_string(array, 2);
+    if (str == NULL) {
+        cstring_array_destroy(array);
+        FAIL();
+    }
+    ASSERT_STR_EQ(str, "End");
+
+    str = cstring_array_get_string(array, 3);
+    if (str == NULL) {
+        cstring_array_destroy(array);
+        FAIL();
+    }
+    ASSERT_STR_EQ(str, "Theory");
+
+    cstring_array_destroy(array);
+
+    PASS();
+}
+
+/* Iterating a string tree must visit every combination of alternatives. */
+TEST test_string_tree(void) {
+    string_tree_t *tree = string_tree_new();
+    if (tree == NULL) {
+        FAIL();
+    }
+
+    /* Finalizing before adding any string; asserted below to count as a
+     * token with a single alternative. */
+    string_tree_finalize_token(tree);
+    string_tree_add_string(tree, "Twenty-fifth");
+    string_tree_add_string(tree, "Twentyfifth");
+    string_tree_finalize_token(tree);
+    string_tree_add_string(tree, "Bürgermeister");
+    string_tree_add_string(tree, "Buergermeister");
+    string_tree_add_string(tree, "Burgermeister");
+    string_tree_finalize_token(tree);
+    string_tree_add_string(tree, "Straße");
+    string_tree_add_string(tree, "Strasse");
+    string_tree_finalize_token(tree);
+
+    ASSERT_EQ(tree->token_indices->n - 1, 4);
+
+    ASSERT_EQ(string_tree_num_alternatives(tree, 0), 1);
+    ASSERT_EQ(string_tree_num_alternatives(tree, 1), 2);
+    ASSERT_EQ(string_tree_num_alternatives(tree, 2), 3);
+    ASSERT_EQ(string_tree_num_alternatives(tree, 3), 2);
+
+    string_tree_iterator_t *iter = string_tree_iterator_new(tree);
+
+    if (iter == NULL) {
+        string_tree_destroy(tree);
+        FAIL();
+    }
+    size_t expected_num_tokens = 4;
+    ASSERT_EQ(iter->num_tokens, expected_num_tokens);
+    /* 1 * 2 * 3 * 2 alternatives = 12 combinations. */
+    ASSERT_EQ(iter->remaining, 12);
+
+    /* The path is expected to advance like an odometer, last token fastest. */
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 0);
+    ASSERT_EQ(iter->path[2], 0);
+    ASSERT_EQ(iter->path[3], 0);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 0);
+    ASSERT_EQ(iter->path[2], 0);
+    ASSERT_EQ(iter->path[3], 1);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 0);
+    ASSERT_EQ(iter->path[2], 1);
+    ASSERT_EQ(iter->path[3], 0);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 0);
+    ASSERT_EQ(iter->path[2], 1);
+    ASSERT_EQ(iter->path[3], 1);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 0);
+    ASSERT_EQ(iter->path[2], 2);
+    ASSERT_EQ(iter->path[3], 0);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 0);
+    ASSERT_EQ(iter->path[2], 2);
+    ASSERT_EQ(iter->path[3], 1);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 1);
+    ASSERT_EQ(iter->path[2], 0);
+    ASSERT_EQ(iter->path[3], 0);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 1);
+    ASSERT_EQ(iter->path[2], 0);
+    ASSERT_EQ(iter->path[3], 1);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 1);
+    ASSERT_EQ(iter->path[2], 1);
+    ASSERT_EQ(iter->path[3], 0);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 1);
+    ASSERT_EQ(iter->path[2], 1);
+    ASSERT_EQ(iter->path[3], 1);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 1);
+    ASSERT_EQ(iter->path[2], 2);
+    ASSERT_EQ(iter->path[3], 0);
+
+    string_tree_iterator_next(iter);
+    ASSERT_FALSE(string_tree_iterator_done(iter));
+    ASSERT_EQ(iter->path[0], 0);
+    ASSERT_EQ(iter->path[1], 1);
+    ASSERT_EQ(iter->path[2], 2);
+    ASSERT_EQ(iter->path[3], 1);
+
+    string_tree_iterator_destroy(iter);
+    string_tree_destroy(tree);
+
+    PASS();
+}
+
+/* Registers every test above with the suite run from test/test.c. */
+SUITE(libpostal_string_utils_tests) {
+    RUN_TEST(test_utf8_reverse);
+    RUN_TEST(test_utf8proc_iterate_reversed);
+    RUN_TEST(test_utf8_compare_ignore_separators);
+    RUN_TEST(test_feature_array_add);
+    RUN_TEST(test_char_array);
+    RUN_TEST(test_cstring_array);
+    RUN_TEST(test_string_tree);
+}
+
+