From cf613ee475458b91fb699211eb872ffdbc14624b Mon Sep 17 00:00:00 2001
From: Al
Date: Wed, 11 Mar 2015 17:47:15 -0400
Subject: [PATCH] [geodisambig] Bloom filter implementation for quick
 probabilistic set membership tests before hitting disk. 100% recall and
 bounded precision, saves disk seeks for keys that definitely do not exist
 (useful for Geonames disambiguation-related lookups and in-process deduping).

---
NOTE(review): the file contents below were revised during review; the blob
ids on the "index" lines are those of the originally submitted revision.

 src/bloom.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/bloom.h |  34 ++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 src/bloom.c
 create mode 100644 src/bloom.h

diff --git a/src/bloom.c b/src/bloom.c
new file mode 100644
index 00000000..d4a76da2
--- /dev/null
+++ b/src/bloom.c
@@ -0,0 +1,126 @@
+#include <math.h>
+#include <stdlib.h>
+
+#include "bloom.h"
+#include "murmur/murmur.h"
+
+#define LOG2_SQUARED 0.4804530139182014
+#define LOG2 0.6931471805599453
+
+/*
+ * Shared implementation of lookup and insertion.
+ *
+ * Hashes the key once with MurmurHash3_x64_128 and derives num_hashes bit
+ * positions as (h1 + i * h2) % num_bits, so a single hash call effectively
+ * provides K hash functions. When add is true, every probed bit that is not
+ * yet set gets set.
+ *
+ * Returns 1 if all probed bits were already set when they were read, 0
+ * otherwise. A NULL or unallocated filter also yields 0.
+ */
+static int bloom_filter_check_add(bloom_filter_t *self, const char *key, size_t len, bool add) {
+    if (self == NULL || self->filter == NULL || self->num_bits == 0) {
+        return 0;
+    }
+
+    uint64_t checksum[2];
+    MurmurHash3_x64_128(key, len, SALT_CONSTANT, checksum);
+
+    const uint64_t h1 = checksum[0];
+    const uint64_t h2 = checksum[1];
+    const uint64_t num_bits = self->num_bits;
+    uint32_t hits = 0;
+
+    for (uint32_t i = 0; i < self->num_hashes; i++) {
+        uint64_t h = (h1 + (uint64_t)i * h2) % num_bits;
+        uint64_t byte = h >> 3;
+        unsigned char mask = (unsigned char)(1u << (h & 7));
+
+        if (self->filter[byte] & mask) {
+            hits++;
+        } else if (add) {
+            self->filter[byte] |= mask;
+        }
+    }
+
+    return hits == self->num_hashes;
+}
+
+/* Returns 1 if key may be in the set, 0 if it is definitely absent. */
+int bloom_filter_check(bloom_filter_t *self, const char *key, size_t len) {
+    return bloom_filter_check_add(self, key, len, false);
+}
+
+/*
+ * Adds key to the filter. Returns 1 if every bit for key was already set
+ * (key was probably present already), 0 if at least one bit was newly set.
+ */
+int bloom_filter_add(bloom_filter_t *self, const char *key, size_t len) {
+    return bloom_filter_check_add(self, key, len, true);
+}
+
+/*
+ * Allocates a zeroed filter sized for capacity entries at false-positive
+ * rate error: bits_per_entry = -ln(error) / ln(2)^2 and
+ * num_hashes = ceil(ln(2) * bits_per_entry).
+ *
+ * Returns NULL if capacity is 0, if error is not strictly between 0 and 1
+ * (NaN included), if the bit array size cannot be represented, or if
+ * allocation fails. Release the result with bloom_filter_destroy().
+ */
+bloom_filter_t *bloom_filter_new(uint64_t capacity, double error) {
+    /* Written as a negation so that NaN is rejected as well. */
+    if (capacity < 1 || !(error > 0.0 && error < 1.0)) {
+        return NULL;
+    }
+
+    bloom_filter_t *bloom = malloc(sizeof(*bloom));
+    if (bloom == NULL) {
+        return NULL;
+    }
+
+    bloom->ready = false;
+    bloom->filter = NULL;
+    bloom->capacity = capacity;
+    bloom->error = error;
+    bloom->bits_per_entry = -(log(error) / LOG2_SQUARED);
+
+    double bits = (double)capacity * bloom->bits_per_entry;
+    /* 2^64: converting a double this large to uint64_t is undefined. */
+    if (bits >= 18446744073709551616.0) {
+        goto exit_free_bloom;
+    }
+
+    /* Keep at least one bit so the modulo in check/add never divides by 0. */
+    bloom->num_bits = bits < 1.0 ? 1 : (uint64_t)bits;
+    bloom->num_bytes = bloom->num_bits / 8 + (bloom->num_bits % 8 != 0);
+    if (bloom->num_bytes > SIZE_MAX) {
+        goto exit_free_bloom;
+    }
+
+    bloom->num_hashes = (uint32_t)ceil(LOG2 * bloom->bits_per_entry);
+
+    /* calloc so that every bit starts out cleared. */
+    bloom->filter = calloc((size_t)bloom->num_bytes, sizeof(*bloom->filter));
+    if (bloom->filter == NULL) {
+        goto exit_free_bloom;
+    }
+
+    bloom->ready = true;
+
+    return bloom;
+
+exit_free_bloom:
+    free(bloom);
+    return NULL;
+}
+
+/* Frees the bit array and the filter itself. Passing NULL is a no-op. */
+void bloom_filter_destroy(bloom_filter_t *self) {
+    if (self == NULL) {
+        return;
+    }
+
+    free(self->filter);
+    free(self);
+}
diff --git a/src/bloom.h b/src/bloom.h
new file mode 100644
index 00000000..8bc45450
--- /dev/null
+++ b/src/bloom.h
@@ -0,0 +1,34 @@
+#ifndef BLOOM_H
+#define BLOOM_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#define SALT_CONSTANT 0x66e8c41d
+
+typedef struct bloom_filter {
+    uint64_t capacity;       /* expected number of entries */
+    double error;            /* requested false-positive rate */
+    uint64_t num_bits;       /* size of the bit array in bits */
+    uint64_t num_bytes;      /* size of the bit array in bytes */
+    uint32_t num_hashes;     /* bit positions probed per key */
+
+    double bits_per_entry;
+    unsigned char *filter;   /* bit array, num_bytes long */
+    bool ready;              /* set once filter is allocated */
+} bloom_filter_t;
+
+/* Returns NULL on invalid arguments or allocation failure. */
+bloom_filter_t *bloom_filter_new(uint64_t capacity, double error);
+
+/* Both return 1 if all bits for key were already set, 0 otherwise. */
+int bloom_filter_check(bloom_filter_t *self, const char *key, size_t len);
+int bloom_filter_add(bloom_filter_t *self, const char *key, size_t len);
+
+/* NOTE(review): declared but not defined in src/bloom.c in this patch. */
+void bloom_filter_print(bloom_filter_t *self);
+
+void bloom_filter_destroy(bloom_filter_t *self);
+
+#endif