From 5c0ecf89637ed67d1c6a94de7c5bf0ddf69faf2c Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 21 Oct 2017 10:34:12 -0400 Subject: [PATCH] [dedupe] Jaccard similarity --- src/jaccard.c | 27 +++++++++++++++++++++++++++ src/jaccard.h | 11 +++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/jaccard.c create mode 100644 src/jaccard.h diff --git a/src/jaccard.c b/src/jaccard.c new file mode 100644 index 00000000..1f96c61f --- /dev/null +++ b/src/jaccard.c @@ -0,0 +1,27 @@ +#include "jaccard.h" + + +double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) { + if (s1 == NULL || s2 == NULL) return -1.0; + + size_t set_intersection = 0; + size_t set_union = 0; + + khiter_t k; + const char *key; + + kh_foreach_key(s1, key, { + k = kh_get(str_set, s2, key); + if (k != kh_end(s2)) { + set_intersection++; + } else { + set_union++; + } + }); + + // set_union contains all the keys that were in s1 but not s2 + // so just add all the keys in s2 to complete the union + set_union += kh_size(s2); + + return (double)set_intersection / set_union; +} \ No newline at end of file diff --git a/src/jaccard.h b/src/jaccard.h new file mode 100644 index 00000000..a6468078 --- /dev/null +++ b/src/jaccard.h @@ -0,0 +1,11 @@ +#ifndef JACCARD_H +#define JACCARD_H + +#include +#include + +#include "collections.h" + +double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2); + +#endif \ No newline at end of file