[geodisambig] Bloom filter implementation for quick probabilistic set membership tests before hitting disk. 100% recall and bounded precision, saves disk seeks for keys that definitely do not exist (useful for Geonames disambiguation-related lookups and in-process deduping).

This commit is contained in:
Al
2015-03-11 17:47:15 -04:00
parent eb391bf4d5
commit cf613ee475
2 changed files with 129 additions and 0 deletions

98
src/bloom.c Normal file
View File

@@ -0,0 +1,98 @@
#include <math.h>
#include "bloom.h"
#include "murmur/murmur.h"
/* ln(2)^2: divisor used when deriving bits per entry as -ln(error) / ln(2)^2. */
#define LOG2_SQUARED 0.4804530139182014
/* ln(2): multiplier used when deriving the hash count as ln(2) * bits_per_entry. */
#define LOG2 0.6931471805599453
/**
 * Shared implementation behind bloom_filter_check() and bloom_filter_add().
 *
 * Hashes the key once with MurmurHash3_x64_128 and combines the two 64-bit
 * halves of the digest as h1 + i * h2 to obtain num_hashes bit positions,
 * so the hash function itself runs only once per key.
 *
 * @param self  filter to probe (and to update when `add` is true)
 * @param key   bytes of the key
 * @param len   number of bytes in `key`
 * @param add   when true, every probed bit that is still clear gets set
 * @return 1 if every probed bit was already set before this call,
 *         0 otherwise (including when the filter is NULL or has no bits).
 */
static int bloom_filter_check_add(bloom_filter_t *self, const char *key, size_t len, bool add) {
    /* A filter with zero bits would make the modulo below divide by zero. */
    if (self == NULL || self->filter == NULL || self->num_bits == 0) {
        return 0;
    }

    uint64_t checksum[2];
    MurmurHash3_x64_128(key, len, SALT_CONSTANT, checksum);

    const uint64_t h1 = checksum[0];
    const uint64_t h2 = checksum[1];
    const uint64_t num_bits = self->num_bits;
    uint32_t hits = 0;

    for (uint32_t i = 0; i < self->num_hashes; i++) {
        /* Unsigned arithmetic wraps modulo 2^64 before the reduction. */
        const uint64_t bit = (h1 + (uint64_t)i * h2) % num_bits;
        const uint64_t byte = bit >> 3;
        const unsigned char mask = (unsigned char)(1u << (bit & 7));

        if (self->filter[byte] & mask) {
            hits++;
        } else if (add) {
            self->filter[byte] |= mask;
        }
    }

    return hits == self->num_hashes ? 1 : 0;
}
/**
 * Probe the filter for a key without modifying it.
 *
 * @return 1 if every bit position derived from the key is set,
 *         0 if at least one is clear.
 */
int bloom_filter_check(bloom_filter_t *self, const char *key, size_t len) {
    const bool insert = false;
    int all_bits_set = bloom_filter_check_add(self, key, len, insert);
    return all_bits_set;
}
/**
 * Insert a key by setting every bit position derived from it.
 *
 * @return 1 if all of those bits were already set before the call,
 *         0 if at least one bit had to be set.
 */
int bloom_filter_add(bloom_filter_t *self, const char *key, size_t len) {
    const bool insert = true;
    int already_set = bloom_filter_check_add(self, key, len, insert);
    return already_set;
}
/**
 * Allocate a zeroed Bloom filter sized from an entry count and an error rate.
 *
 * Sizing:
 *   bits_per_entry = -ln(error) / ln(2)^2
 *   num_bits       = capacity * bits_per_entry (truncated, at least 1)
 *   num_bytes      = num_bits rounded up to whole bytes
 *   num_hashes     = ceil(ln(2) * bits_per_entry)
 *
 * @param capacity  number of entries to size the filter for; must be >= 1
 * @param error     error rate used in the sizing formula; must lie strictly
 *                  between 0 and 1 (NaN is rejected)
 * @return a new filter owned by the caller (release it with
 *         bloom_filter_destroy()), or NULL when an argument is out of range,
 *         the requested size cannot be represented, or allocation fails.
 */
bloom_filter_t *bloom_filter_new(uint64_t capacity, double error) {
    /* Negated conjunction so that NaN fails validation as well. Values
       outside (0, 1) would make log() yield NaN or a non-positive size. */
    if (capacity < 1 || !(error > 0.0 && error < 1.0)) {
        return NULL;
    }

    bloom_filter_t *bloom = malloc(sizeof *bloom);
    if (bloom == NULL) {
        return NULL;
    }

    bloom->ready = false;
    bloom->filter = NULL;
    bloom->capacity = capacity;
    bloom->error = error;
    bloom->bits_per_entry = -(log(error) / LOG2_SQUARED);

    /* Converting an out-of-range double to uint64_t is undefined, so
       reject it first; (double)UINT64_MAX rounds up to exactly 2^64. */
    double bits = (double)capacity * bloom->bits_per_entry;
    if (!(bits < (double)UINT64_MAX)) {
        goto exit_free_bloom;
    }
    bloom->num_bits = (uint64_t)bits;
    if (bloom->num_bits == 0) {
        /* Lookups reduce hashes modulo num_bits; it must never be zero. */
        bloom->num_bits = 1;
    }

    /* Integer round-up to whole bytes, written so it cannot overflow. */
    bloom->num_bytes = bloom->num_bits / 8 + (bloom->num_bits % 8 != 0 ? 1 : 0);
    if (bloom->num_bytes > SIZE_MAX) {
        goto exit_free_bloom;
    }

    bloom->num_hashes = (uint32_t)ceil(LOG2 * bloom->bits_per_entry);

    /* calloc so that every bit starts out clear. */
    bloom->filter = calloc((size_t)bloom->num_bytes, sizeof *bloom->filter);
    if (bloom->filter == NULL) {
        goto exit_free_bloom;
    }

    bloom->ready = true;
    return bloom;

exit_free_bloom:
    free(bloom);
    return NULL;
}
/**
 * Release a filter created by bloom_filter_new(): frees the bit array and
 * then the struct itself. Passing NULL is a no-op, mirroring free(NULL).
 *
 * @param self  filter to release, or NULL
 */
void bloom_filter_destroy(bloom_filter_t *self) {
    if (self == NULL) {
        return;
    }
    free(self->filter);
    free(self);
}

31
src/bloom.h Normal file
View File

@@ -0,0 +1,31 @@
#ifndef BLOOM_H
#define BLOOM_H
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
/* Passed as the third argument to MurmurHash3_x64_128 when hashing keys
   (presumably the hash seed -- confirm against murmur.h). */
#define SALT_CONSTANT 0x66e8c41d
/* State of one Bloom filter; the fields are filled in by bloom_filter_new(). */
typedef struct bloom_filter {
/* Entry count the filter was sized for (the `capacity` argument). */
uint64_t capacity;
/* Error rate the filter was sized for (the `error` argument). */
double error;
/* Number of addressable bits; hashes are reduced modulo this value. */
uint64_t num_bits;
/* Size of `filter` in bytes: num_bits rounded up to whole bytes. */
uint64_t num_bytes;
/* Number of bit positions probed per key: ceil(ln(2) * bits_per_entry). */
uint32_t num_hashes;
/* Bits allotted per entry: -ln(error) / ln(2)^2. */
double bits_per_entry;
/* Bit array of num_bytes bytes, zero-initialised at creation. */
unsigned char *filter;
/* Set to true once construction has fully succeeded. */
bool ready;
} bloom_filter_t;
/* Create a filter sized for `capacity` entries at error rate `error`.
   Returns NULL on invalid arguments or allocation failure; release the
   result with bloom_filter_destroy(). */
bloom_filter_t *bloom_filter_new(uint64_t capacity, double error);
/* Test the `len` bytes at `key` against the filter without modifying it.
   Returns 1 if every bit derived from the key is set, 0 otherwise. */
int bloom_filter_check(bloom_filter_t *self, const char *key, size_t len);
/* Set every bit derived from the `len` bytes at `key`. Returns 1 if all of
   them were already set before the call, 0 otherwise. */
int bloom_filter_add(bloom_filter_t *self, const char *key, size_t len);
/* NOTE(review): declared here, but no definition is visible in bloom.c in
   this commit -- confirm it is implemented before calling it. */
void bloom_filter_print(bloom_filter_t *self);
/* Free the filter's bit array and the filter struct itself. */
void bloom_filter_destroy(bloom_filter_t *self);
#endif