[dedupe] Jaccard similarity

This commit is contained in:
Al
2017-10-21 10:34:12 -04:00
parent 4ccc2a9e9f
commit 5c0ecf8963
2 changed files with 38 additions and 0 deletions

27
src/jaccard.c Normal file
View File

@@ -0,0 +1,27 @@
#include "jaccard.h"
/*
 * Jaccard similarity of two string sets:
 *
 *     |intersection(s1, s2)| / |union(s1, s2)|
 *
 * Returns -1.0 if either set pointer is NULL (invalid input sentinel).
 * Returns 1.0 if both sets are empty: the ratio would otherwise be 0/0
 * (NaN), and two empty sets are identical, so they are treated like any
 * other pair of equal sets.
 * Otherwise returns a value in [0.0, 1.0].
 *
 * Neither set is modified.
 */
double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) {
    if (s1 == NULL || s2 == NULL) return -1.0;

    size_t set_intersection = 0;
    khiter_t k;
    const char *key;

    // Count the keys of s1 that are also present in s2
    kh_foreach_key(s1, key, {
        k = kh_get(str_set, s2, key);
        if (k != kh_end(s2)) {
            set_intersection++;
        }
    });

    // Inclusion-exclusion: |A u B| = |A| + |B| - |A n B|.
    // Widen to size_t before adding so the sum cannot wrap in khint_t.
    size_t set_union = (size_t)kh_size(s1) + (size_t)kh_size(s2) - set_intersection;

    // Both sets empty: avoid dividing by zero
    if (set_union == 0) return 1.0;

    return (double)set_intersection / (double)set_union;
}

11
src/jaccard.h Normal file
View File

@@ -0,0 +1,11 @@
#ifndef JACCARD_H
#define JACCARD_H
#include <stdio.h>
#include <stdlib.h>
#include "collections.h"
/**
 * Jaccard similarity of two string sets: the number of keys present in both
 * sets divided by the number of distinct keys across the two sets.
 *
 * @param s1 first set of strings
 * @param s2 second set of strings
 * @return -1.0 if either argument is NULL; otherwise the ratio
 *         |intersection| / |union|, which lies in [0.0, 1.0] whenever the
 *         union is non-empty.
 */
double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2);
#endif