Files
libpostal/src/sparkey/hashreader.c
2015-07-09 15:26:11 -04:00

256 lines
7.6 KiB
C

/*
* Copyright (c) 2012-2013 Spotify AB
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include "hashheader.h"
#include "hashiter.h"
#include "util.h"
#include "endiantools.h"
#include "sparkey.h"
#include "sparkey-internal.h"
/*
 * Marker written to a reader's open_status field once sparkey_hash_open() has
 * fully succeeded. sparkey_hash_close() and assert_reader_open() compare
 * open_status against it; close resets the field to 0.
 */
#define MAGIC_VALUE_HASHREADER (0x75103df9)
/*
 * Opens a hash file together with its log file and hands back a reader.
 *
 * Steps: load the hash header, open the log, cross-check the two headers
 * (file identifier, data_end, max key length, max value length), open the
 * hash file, verify it is at least as large as the header says the table is,
 * and map it read-only.
 *
 * reader_ref    receives the new reader on success; left untouched on failure.
 * hash_filename path of the hash file.
 * log_filename  path of the log file.
 *
 * Returns SPARKEY_SUCCESS, or:
 *   - whatever correct_endian_platform(), sparkey_load_hashheader() or
 *     sparkey_logreader_open_noalloc() report,
 *   - SPARKEY_INTERNAL_ERROR if the reader cannot be allocated,
 *   - SPARKEY_FILE_IDENTIFIER_MISMATCH if hash and log identifiers differ,
 *   - SPARKEY_HASH_HEADER_CORRUPT if the hash header claims more than the log
 *     header allows,
 *   - sparkey_open_returncode(errno) if open() or fstat() fails,
 *   - SPARKEY_HASH_TOO_SMALL if the file is shorter than the computed table,
 *   - SPARKEY_MMAP_FAILED if the mapping fails.
 * Everything acquired before a failure is released again.
 */
sparkey_returncode sparkey_hash_open(sparkey_hashreader **reader_ref, const char *hash_filename, const char *log_filename) {
  RETHROW(correct_endian_platform());

  sparkey_returncode returncode;

  sparkey_hashreader *reader = malloc(sizeof *reader);
  if (reader == NULL) {
    return SPARKEY_INTERNAL_ERROR;
  }

  /*
   * malloc() hands back indeterminate memory. Put the fields the cleanup path
   * looks at into a known "nothing acquired yet" state before anything can
   * fail.
   */
  reader->open_status = 0;
  reader->fd = -1;
  reader->data = NULL;
  reader->data_len = 0;

  TRY(sparkey_load_hashheader(&reader->header, hash_filename), free_reader);
  TRY(sparkey_logreader_open_noalloc(&reader->log, log_filename), free_reader);

  /* From here on the log is open, so failures must go through close_reader. */
  if (reader->header.file_identifier != reader->log.header.file_identifier) {
    returncode = SPARKEY_FILE_IDENTIFIER_MISMATCH;
    goto close_reader;
  }
  if (reader->header.data_end > reader->log.header.data_end) {
    returncode = SPARKEY_HASH_HEADER_CORRUPT;
    goto close_reader;
  }
  if (reader->header.max_key_len > reader->log.header.max_key_len) {
    returncode = SPARKEY_HASH_HEADER_CORRUPT;
    goto close_reader;
  }
  if (reader->header.max_value_len > reader->log.header.max_value_len) {
    returncode = SPARKEY_HASH_HEADER_CORRUPT;
    goto close_reader;
  }

  reader->fd = open(hash_filename, O_RDONLY);
  if (reader->fd < 0) {
    int e = errno;
    returncode = sparkey_open_returncode(e);
    goto close_reader;
  }

  reader->data_len = reader->header.header_size + reader->header.hash_capacity * (reader->header.hash_size + reader->header.address_size);

  /*
   * Query the descriptor we are about to map (not the path again) and check
   * the result, so st_size is never read uninitialised.
   */
  struct stat st;
  if (fstat(reader->fd, &st) != 0) {
    int e = errno;
    returncode = sparkey_open_returncode(e);
    goto close_reader;
  }
  if (reader->data_len > (uint64_t) st.st_size) {
    returncode = SPARKEY_HASH_TOO_SMALL;
    goto close_reader;
  }

  reader->data = mmap(NULL, reader->data_len, PROT_READ, MAP_SHARED, reader->fd, 0);
  if (reader->data == MAP_FAILED) {
    /* MAP_FAILED is not NULL; clear it so cleanup does not try to unmap it. */
    reader->data = NULL;
    returncode = SPARKEY_MMAP_FAILED;
    goto close_reader;
  }

  *reader_ref = reader;
  reader->open_status = MAGIC_VALUE_HASHREADER;
  return SPARKEY_SUCCESS;

close_reader:
  /*
   * sparkey_hash_close() ignores any reader that does not carry the magic
   * marker, which would leak the reader, the log and the descriptor. Every
   * path reaching this label has an open log and well-defined fd/data values,
   * so mark the reader as open and let the regular close routine release it.
   */
  reader->open_status = MAGIC_VALUE_HASHREADER;
  sparkey_hash_close(&reader);
  return returncode;

free_reader:
  free(reader);
  return returncode;
}
/*
 * Releases a hash reader and sets *reader_ref to NULL.
 *
 * The call is a no-op when reader_ref is NULL, when *reader_ref is NULL, or
 * when the reader's open_status does not hold MAGIC_VALUE_HASHREADER.
 * Otherwise the log is closed, the mapping (if any) is removed, the file
 * descriptor is closed and the reader itself is freed.
 */
void sparkey_hash_close(sparkey_hashreader **reader_ref) {
  if (reader_ref == NULL || *reader_ref == NULL) {
    return;
  }

  sparkey_hashreader *self = *reader_ref;
  if (self->open_status != MAGIC_VALUE_HASHREADER) {
    return;
  }

  sparkey_logreader_close_nodealloc(&self->log);
  /* Drop the marker so the reader no longer passes the open check. */
  self->open_status = 0;

  if (self->data != NULL) {
    munmap(self->data, self->data_len);
    self->data = NULL;
  }

  close(self->fd);
  self->fd = -1;

  free(self);
  *reader_ref = NULL;
}
/*
 * Returns SPARKEY_SUCCESS when the reader carries the open marker and
 * SPARKEY_HASH_CLOSED otherwise. The reader pointer itself is not checked.
 */
static sparkey_returncode assert_reader_open(sparkey_hashreader *reader) {
  return (reader->open_status == MAGIC_VALUE_HASHREADER)
      ? SPARKEY_SUCCESS
      : SPARKEY_HASH_CLOSED;
}
/*
 * Looks up key in the hash table of reader.
 *
 * reader  an open hash reader.
 * key     bytes of the key to find.
 * keylen  number of bytes in key.
 * iter    log iterator that is repositioned during the search.
 *
 * Returns SPARKEY_SUCCESS both on a hit and on a miss. On a hit, iter has
 * been moved to the matching entry via seek/skip/next; on a miss iter->state
 * is set to SPARKEY_ITER_INVALID. Returns SPARKEY_HASH_CLOSED (via
 * assert_reader_open) if the reader is not open, SPARKEY_INTERNAL_ERROR if a
 * slot with a matching hash points at an entry that is not a PUT, and
 * forwards any error from the log iterator calls.
 */
sparkey_returncode sparkey_hash_get(sparkey_hashreader *reader, const uint8_t *key, uint64_t keylen, sparkey_logiter *iter) {
  RETHROW(assert_reader_open(reader));

  uint64_t key_hash = reader->header.hash_algorithm.hash(key, keylen, reader->header.hash_seed);
  uint64_t home_slot = key_hash % reader->header.hash_capacity;
  int slot_size = reader->header.address_size + reader->header.hash_size;

  /* Byte offset of the current slot, relative to the start of the table. */
  uint64_t offset = home_slot * slot_size;
  /* Number of slots probed so far beyond the home slot. */
  uint64_t probe_distance = 0;
  uint64_t slot = home_slot;
  uint8_t *table = reader->data + reader->header.header_size;

  for (;;) {
    uint64_t stored_hash = reader->header.hash_algorithm.read_hash(table, offset);
    uint64_t stored_addr = read_addr(table, offset + reader->header.hash_size, reader->header.address_size);

    /* A zero address ends the probe: report a miss. */
    if (stored_addr == 0) {
      iter->state = SPARKEY_ITER_INVALID;
      return SPARKEY_SUCCESS;
    }

    /* The low bits select the entry inside a block, the rest is the block position. */
    int entry_index = (int) (stored_addr) & reader->header.entry_block_bitmask;
    uint64_t block_position = stored_addr >> reader->header.entry_block_bits;

    if (stored_hash == key_hash) {
      /* Hashes agree; load the entry from the log and compare the actual key. */
      RETHROW(sparkey_logiter_seek(iter, &reader->log, block_position));
      RETHROW(sparkey_logiter_skip(iter, &reader->log, entry_index));
      RETHROW(sparkey_logiter_next(iter, &reader->log));

      uint64_t entry_keylen = iter->keylen;
      if (iter->type != SPARKEY_ENTRY_PUT) {
        iter->state = SPARKEY_ITER_INVALID;
        return SPARKEY_INTERNAL_ERROR;
      }

      if (entry_keylen == keylen) {
        /* The stored key is handed out in chunks; compare piece by piece. */
        int differs = 0;
        uint64_t compared = 0;
        while (compared < keylen) {
          uint8_t *chunk;
          uint64_t chunk_len;
          RETHROW(sparkey_logiter_keychunk(iter, &reader->log, keylen, &chunk, &chunk_len));
          if (memcmp(&key[compared], chunk, chunk_len) != 0) {
            differs = 1;
            break;
          }
          compared += chunk_len;
        }
        if (!differs) {
          return SPARKEY_SUCCESS;
        }
      }
    }

    /*
     * Stop once we have probed further than the displacement reported for
     * the entry stored here (presumably its distance from its own home slot;
     * see get_displacement): report a miss.
     */
    uint64_t stored_displacement = get_displacement(reader->header.hash_capacity, slot, stored_hash);
    if (probe_distance > stored_displacement) {
      iter->state = SPARKEY_ITER_INVALID;
      return SPARKEY_SUCCESS;
    }

    /* Advance to the next slot, wrapping around at the end of the table. */
    probe_distance++;
    slot++;
    offset += slot_size;
    if (slot >= reader->header.hash_capacity) {
      slot = 0;
      offset = 0;
    }
  }

  /* Not reached: the loop above only leaves through a return. */
  iter->state = SPARKEY_ITER_INVALID;
  return SPARKEY_INTERNAL_ERROR;
}
/*
 * Advances iter through the log until it reaches a PUT entry whose position
 * is stored in the hash table, then resets the iterator on that entry.
 *
 * iter    log iterator to advance.
 * reader  an open hash reader.
 *
 * Returns SPARKEY_SUCCESS when such an entry is found, and also when the
 * iterator stops being SPARKEY_ITER_ACTIVE after a step. Returns
 * SPARKEY_HASH_CLOSED (via assert_reader_open) if the reader is not open and
 * forwards any error from sparkey_logiter_next / sparkey_logiter_reset.
 */
sparkey_returncode sparkey_logiter_hashnext(sparkey_logiter *iter, sparkey_hashreader *reader) {
  RETHROW(assert_reader_open(reader));

  uint8_t *table = reader->data + reader->header.header_size;
  int slot_size = reader->header.address_size + reader->header.hash_size;

  for (;;) {
    RETHROW(sparkey_logiter_next(iter, &reader->log));
    if (iter->state != SPARKEY_ITER_ACTIVE) {
      return SPARKEY_SUCCESS;
    }
    /* Only PUT entries are looked up in the table. */
    if (iter->type != SPARKEY_ENTRY_PUT) {
      continue;
    }

    /* Address of this entry in the same encoding the table stores. */
    uint64_t entry_addr = (iter->entry_block_position << reader->header.entry_block_bits) | iter->entry_count;
    uint64_t entry_hash = sparkey_iter_hash(&reader->header, iter, &reader->log);
    uint64_t home_slot = entry_hash % reader->header.hash_capacity;

    uint64_t offset = home_slot * slot_size;
    uint64_t probe_distance = 0;
    uint64_t slot = home_slot;

    for (;;) {
      uint64_t stored_hash = reader->header.hash_algorithm.read_hash(table, offset);
      uint64_t stored_addr = read_addr(table, offset + reader->header.hash_size, reader->header.address_size);

      /* A zero address ends the probe; move on to the next log entry. */
      if (stored_addr == 0) {
        break;
      }

      if (stored_addr == entry_addr) {
        /* The table points at exactly this entry: reset the iterator and stop. */
        RETHROW(sparkey_logiter_reset(iter, &reader->log));
        return SPARKEY_SUCCESS;
      }

      /* Give up on this entry once we have probed past the stored entry's displacement. */
      uint64_t stored_displacement = get_displacement(reader->header.hash_capacity, slot, stored_hash);
      if (probe_distance > stored_displacement) {
        break;
      }

      /* Next slot, wrapping around at the end of the table. */
      probe_distance++;
      slot++;
      offset += slot_size;
      if (slot >= reader->header.hash_capacity) {
        slot = 0;
        offset = 0;
      }
    }
  }
}
/*
 * Returns a pointer to the log reader embedded in reader. The pointer refers
 * to storage inside the hash reader itself; nothing is copied.
 */
sparkey_logreader * sparkey_hash_getreader(sparkey_hashreader *reader) {
  sparkey_logreader *embedded_log = &reader->log;
  return embedded_log;
}
/* Returns the num_entries value recorded in the reader's hash header. */
uint64_t sparkey_hash_numentries(sparkey_hashreader *reader) {
  uint64_t entries = reader->header.num_entries;
  return entries;
}
/* Returns the hash_collisions value recorded in the reader's hash header. */
uint64_t sparkey_hash_numcollisions(sparkey_hashreader *reader) {
  uint64_t collisions = reader->header.hash_collisions;
  return collisions;
}