diff --git a/src/jaccard.c b/src/jaccard.c
index 1f96c61f..87e27b8b 100644
--- a/src/jaccard.c
+++ b/src/jaccard.c
@@ -1,8 +1,7 @@
 #include "jaccard.h"
 
-
 double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) {
-    if (s1 == NULL || s2 == NULL) return -1.0;
+    if (s1 == NULL || s2 == NULL) return 0.0;
 
     size_t set_intersection = 0;
     size_t set_union = 0;
@@ -24,4 +23,59 @@ double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) {
     set_union += kh_size(s2);
 
     return (double)set_intersection / set_union;
-}
\ No newline at end of file
+}
+
+
+/*
+ * Builds a str_set holding the first num_strings entries of strings.
+ *
+ * This function does not copy the strings. Returns NULL if the set cannot be
+ * allocated, an insertion fails, or an entry is NULL; on success the caller
+ * must release the set with kh_destroy().
+ */
+static khash_t(str_set) *str_set_from_strings(size_t num_strings, char **strings) {
+    khash_t(str_set) *set = kh_init(str_set);
+    if (set == NULL) return NULL;
+
+    kh_resize(str_set, set, num_strings);
+
+    for (size_t i = 0; i < num_strings; i++) {
+        int ret = 0;
+
+        /* A NULL entry cannot serve as a string key; reject it as invalid input. */
+        if (strings[i] == NULL) goto fail;
+
+        kh_put(str_set, set, strings[i], &ret);
+        if (ret < 0) goto fail;
+    }
+
+    return set;
+
+fail:
+    kh_destroy(str_set, set);
+    return NULL;
+}
+
+/*
+ * Jaccard similarity of two string arrays, each loaded into a str_set and
+ * compared with jaccard_similarity().
+ *
+ * Returns 0.0 when either array is NULL or empty, when any entry is NULL, or
+ * when a set cannot be built; otherwise returns jaccard_similarity()'s result.
+ */
+double jaccard_similarity_string_arrays(size_t num_strings1, char **strings1, size_t num_strings2, char **strings2) {
+    if (strings1 == NULL || strings2 == NULL || num_strings1 == 0 || num_strings2 == 0) return 0.0;
+
+    khash_t(str_set) *string_set1 = str_set_from_strings(num_strings1, strings1);
+    if (string_set1 == NULL) return 0.0;
+
+    double sim = 0.0;
+    khash_t(str_set) *string_set2 = str_set_from_strings(num_strings2, strings2);
+    if (string_set2 != NULL) {
+        sim = jaccard_similarity(string_set1, string_set2);
+        kh_destroy(str_set, string_set2);
+    }
+
+    kh_destroy(str_set, string_set1);
+    return sim;
+}
diff --git a/src/jaccard.h b/src/jaccard.h
index a6468078..9f93266d 100644
--- a/src/jaccard.h
+++ b/src/jaccard.h
@@ -7,5 +7,6 @@
 #include "collections.h"
 
 double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2);
+double jaccard_similarity_string_arrays(size_t num_strings1, char **strings1, size_t num_strings2, char **strings2);
 
 #endif
\ No newline at end of file