/*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*
* $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.c#1 $
*/
/**
* Hash table implementation of a map from integers to pointers, implemented
* using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see
* http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does
* not contain any of the locking/concurrency features of the algorithm, just
* the collision resolution scheme.
*
* Hopscotch Hashing is based on hashing with open addressing and linear
* probing. All the entries are stored in a fixed array of buckets, with no
* dynamic allocation for collisions. Unlike linear probing, all the entries
* that hash to a given bucket are stored within a fixed neighborhood starting
* at that bucket. Chaining is effectively represented as a bit vector
* relative to each bucket instead of as pointers or explicit offsets.
*
* When an empty bucket cannot be found within a given neighborhood,
* subsequent neighborhoods are searched, and one or more entries will "hop"
* into those neighborhoods. When this process works, an empty bucket will
* move into the desired neighborhood, allowing the entry to be added. When
* that process fails (typically when the buckets are around 90% full), the
* table must be resized and the all entries rehashed and added to the
* expanded table.
*
* Unlike linear probing, the number of buckets that must be searched in the
* worst case has a fixed upper bound (the size of the neighborhood). Those
* entries occupy a small number of memory cache lines, leading to improved
* use of the cache (fewer misses on both successful and unsuccessful
* searches). Hopscotch hashing outperforms linear probing at much higher load
* factors, so even with the increased memory burden for maintaining the hop
* vectors, less memory is needed to achieve that performance. Hopscotch is
* also immune to "contamination" from deleting entries since entries are
* genuinely removed instead of being replaced by a placeholder.
*
* The published description of the algorithm used a bit vector, but the paper
* alludes to an offset scheme which is used by this implementation. Since the
* entries in the neighborhood are within N entries of the hash bucket at the
* start of the neighborhood, a pair of small offset fields each log2(N) bits
* wide is all that's needed to maintain the hops as a linked list. In order
* to encode "no next hop" (i.e. NULL) as the natural initial value of zero,
* the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 =>
* offset=1, etc.) We can represent neighborhoods of up to 255 entries with
* just 8+8=16 bits per entry. The hop list is sorted by hop offset so the
* first entry in the list is always the bucket closest to the start of the
* neighborhood.
*
* While individual accesses tend to be very fast, the table resize operations
* are very very expensive. If an upper bound on the latency of adding an
* entry to the table is needed, we either need to ensure the table is
* pre-sized to be large enough so no resize is ever needed, or we'll need to
* develop an approach to incrementally resize the table.
**/
#include "pointerMap.h"
#include "errors.h"
#include "logger.h"
#include "memoryAlloc.h"
#include "numeric.h"
#include "permassert.h"
enum {
DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table
NEIGHBORHOOD = 255, // the number of buckets in each neighborhood
MAX_PROBES = 1024, // limit on the number of probes for a free bucket
NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list
DEFAULT_LOAD = 75 // a compromise between memory use and performance
};
/**
* Buckets are packed together to reduce memory usage and improve cache
* efficiency. It would be tempting to encode the hop offsets separately and
* maintain alignment of key/value pairs, but it's crucial to keep the hop
* fields near the buckets that they use them so they'll tend to share cache
* lines.
**/
typedef struct __attribute__((packed)) bucket {
uint8_t firstHop; // the biased offset of the first entry in the hop list
// of the neighborhood that hashes to this bucket
uint8_t nextHop; // the biased offset of the next bucket in the hop list
const void *key; // the key stored in this bucket
void *value; // the value stored in this bucket (NULL if empty)
} Bucket;
/**
* The concrete definition of the opaque PointerMap type. To avoid having to
* wrap the neighborhoods of the last entries back around to the start of the
* bucket array, we allocate a few more buckets at the end of the array
* instead, which is why capacity and bucketCount are different.
**/
struct pointerMap {
/** the number of entries stored in the map */
size_t size;
/** the number of neighborhoods in the map */
size_t capacity;
/** the number of buckets in the bucket array */
size_t bucketCount;
/** the array of hash buckets */
Bucket *buckets;
/** the function for comparing keys for equality */
PointerKeyComparator *comparator;
/** the function for getting a hash code from a key */
PointerKeyHasher *hasher;
};
/**
* Initialize a PointerMap.
*
* @param map the map to initialize
* @param capacity the initial capacity of the map
*
* @return UDS_SUCCESS or an error code
**/
static int allocateBuckets(PointerMap *map, size_t capacity)
{
map->size = 0;
map->capacity = capacity;
// Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a
// full neighborhood without have to wrap back around to element zero.
map->bucketCount = capacity + (NEIGHBORHOOD - 1);
return ALLOCATE(map->bucketCount, Bucket, "PointerMap buckets",
&map->buckets);
}
/**********************************************************************/
int makePointerMap(size_t initialCapacity,
unsigned int initialLoad,
PointerKeyComparator comparator,
PointerKeyHasher hasher,
PointerMap **mapPtr)
{
// Use the default initial load if the caller did not specify one.
if (initialLoad == 0) {
initialLoad = DEFAULT_LOAD;
}
if (initialLoad > 100) {
return UDS_INVALID_ARGUMENT;
}
PointerMap *map;
int result = ALLOCATE(1, PointerMap, "PointerMap", &map);
if (result != UDS_SUCCESS) {
return result;
}
map->hasher = hasher;
map->comparator = comparator;
// Use the default capacity if the caller did not specify one.
size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY;
// Scale up the capacity by the specified initial load factor.
// (i.e to hold 1000 entries at 80% load we need a capacity of 1250)
capacity = capacity * 100 / initialLoad;
result = allocateBuckets(map, capacity);
if (result != UDS_SUCCESS) {
freePointerMap(&map);
return result;
}
*mapPtr = map;
return UDS_SUCCESS;
}
/**
* Free the bucket array for the map.
*
* @param map the map whose bucket array is to be freed
**/
static void freeBuckets(PointerMap *map)
{
FREE(map->buckets);
map->buckets = NULL;
}
/**********************************************************************/
void freePointerMap(PointerMap **mapPtr)
{
if (*mapPtr != NULL) {
freeBuckets(*mapPtr);
FREE(*mapPtr);
*mapPtr = NULL;
}
}
/**********************************************************************/
size_t pointerMapSize(const PointerMap *map)
{
return map->size;
}
/**
* Convert a biased hop offset within a neighborhood to a pointer to the
* bucket it references.
*
* @param neighborhood the first bucket in the neighborhood
* @param hopOffset the biased hop offset to the desired bucket
*
* @return NULL
if hopOffset is zero, otherwise a pointer to
* the bucket in the neighborhood at hopOffset - 1
**/
static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset)
{
if (hopOffset == NULL_HOP_OFFSET) {
return NULL;
}
STATIC_ASSERT(NULL_HOP_OFFSET == 0);
return &neighborhood[hopOffset - 1];
}
/**
* Add a bucket into the hop list for the neighborhood, inserting it into the
* list so the hop list remains sorted by hop offset.
*
* @param neighborhood the first bucket in the neighborhood
* @param newBucket the bucket to add to the hop list
**/
static void insertInHopList(Bucket *neighborhood, Bucket *newBucket)
{
// Zero indicates a NULL hop offset, so bias the hop offset by one.
int hopOffset = 1 + (newBucket - neighborhood);
// Handle the special case of adding a bucket at the start of the list.
int nextHop = neighborhood->firstHop;
if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) {
newBucket->nextHop = nextHop;
neighborhood->firstHop = hopOffset;
return;
}
// Search the hop list for the insertion point that maintains the sort
// order.
for (;;) {
Bucket *bucket = dereferenceHop(neighborhood, nextHop);
nextHop = bucket->nextHop;
if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) {
newBucket->nextHop = nextHop;
bucket->nextHop = hopOffset;
return;
}
}
}
/**
* Select and return the hash bucket for a given search key.
*
* @param map the map to search
* @param key the mapping key
**/
static Bucket *selectBucket(const PointerMap *map, const void *key)
{
/*
* Scale the 32-bit hash to a bucket index by treating it as a binary
* fraction and multiplying that by the capacity. If the hash is uniformly
* distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be
* uniformly distributed over [0 .. capacity-1]. The multiply and shift is
* much faster than a divide (modulus) on X86 CPUs.
*/
uint64_t hash = map->hasher(key);
return &map->buckets[(hash * map->capacity) >> 32];
}
/**
* Search the hop list associated with given hash bucket for a given search
* key. If the key is found, returns a pointer to the entry (bucket or
* collision), otherwise returns NULL
.
*
* @param [in] map the map being searched
* @param [in] bucket the map bucket to search for the key
* @param [in] key the mapping key
* @param [out] previousPtr if not NULL
, a pointer in which to
* store the bucket in the list preceding the one
* that had the matching key
*
* @return an entry that matches the key, or NULL
if not found
**/
static Bucket *searchHopList(PointerMap *map,
Bucket *bucket,
const void *key,
Bucket **previousPtr)
{
Bucket *previous = NULL;
unsigned int nextHop = bucket->firstHop;
while (nextHop != NULL_HOP_OFFSET) {
// Check the neighboring bucket indexed by the offset for the desired key.
Bucket *entry = dereferenceHop(bucket, nextHop);
if ((entry->value != NULL) && map->comparator(key, entry->key)) {
if (previousPtr != NULL) {
*previousPtr = previous;
}
return entry;
}
nextHop = entry->nextHop;
previous = entry;
}
return NULL;
}
/**********************************************************************/
void *pointerMapGet(PointerMap *map, const void *key)
{
Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL);
return ((match != NULL) ? match->value : NULL);
}
/**
* Increase the number of hash buckets and rehash all the existing entries,
* storing them in the new buckets.
*
* @param map the map to resize
**/
static int resizeBuckets(PointerMap *map)
{
// Copy the top-level map data to the stack.
PointerMap oldMap = *map;
// Re-initialize the map to be empty and 50% larger.
size_t newCapacity = map->capacity / 2 * 3;
logInfo("%s: attempting resize from %zu to %zu, current size=%zu",
__func__, map->capacity, newCapacity, map->size);
int result = allocateBuckets(map, newCapacity);
if (result != UDS_SUCCESS) {
*map = oldMap;
return result;
}
// Populate the new hash table from the entries in the old bucket array.
for (size_t i = 0; i < oldMap.bucketCount; i++) {
Bucket *entry = &oldMap.buckets[i];
if (entry->value == NULL) {
continue;
}
result = pointerMapPut(map, entry->key, entry->value, true, NULL);
if (result != UDS_SUCCESS) {
// Destroy the new partial map and restore the map from the stack.
freeBuckets(map);
*map = oldMap;
return result;
}
}
// Destroy the old bucket array.
freeBuckets(&oldMap);
return UDS_SUCCESS;
}
/**
* Probe the bucket array starting at the given bucket for the next empty
* bucket, returning a pointer to it. NULL
will be returned if
* the search reaches the end of the bucket array or if the number of linear
* probes exceeds a specified limit.
*
* @param map the map containing the buckets to search
* @param bucket the bucket at which to start probing
* @param maxProbes the maximum number of buckets to search
*
* @return the next empty bucket, or NULL
if the search failed
**/
static Bucket *findEmptyBucket(PointerMap *map,
Bucket *bucket,
unsigned int maxProbes)
{
// Limit the search to either the nearer of the end of the bucket array or a
// fixed distance beyond the initial bucket.
size_t remaining = &map->buckets[map->bucketCount] - bucket;
Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)];
for (Bucket *entry = bucket; entry < sentinel; entry++) {
if (entry->value == NULL) {
return entry;
}
}
return NULL;
}
/**
* Move an empty bucket closer to the start of the bucket array. This searches
* the neighborhoods that contain the empty bucket for a non-empty bucket
* closer to the start of the array. If such a bucket is found, this swaps the
* two buckets by moving the entry to the empty bucket.
*
* @param map the map containing the bucket
* @param hole the empty bucket to fill with an entry that precedes it in one
* of its enclosing neighborhoods
*
* @return the bucket that was vacated by moving its entry to the provided
* hole, or NULL
if no entry could be moved
**/
static Bucket *moveEmptyBucket(PointerMap *map __attribute__((unused)),
Bucket *hole)
{
/*
* Examine every neighborhood that the empty bucket is part of, starting
* with the one in which it is the last bucket. No boundary check is needed
* for the negative array arithmetic since this function is only called when
* hole is at least NEIGHBORHOOD cells deeper into the array than a valid
* bucket.
*/
for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) {
// Find the entry that is nearest to the bucket, which means it will be
// nearest to the hash bucket whose neighborhood is full.
Bucket *newHole = dereferenceHop(bucket, bucket->firstHop);
if (newHole == NULL) {
// There are no buckets in this neighborhood that are in use by this one
// (they must all be owned by overlapping neighborhoods).
continue;
}
// Skip this bucket if its first entry is actually further away than the
// hole that we're already trying to fill.
if (hole < newHole) {
continue;
}
/*
* We've found an entry in this neighborhood that we can "hop" further
* away, moving the hole closer to the hash bucket, if not all the way
* into its neighborhood.
*/
// The entry that will be the new hole is the first bucket in the list,
// so setting firstHop is all that's needed remove it from the list.
bucket->firstHop = newHole->nextHop;
newHole->nextHop = NULL_HOP_OFFSET;
// Move the entry into the original hole.
hole->key = newHole->key;
hole->value = newHole->value;
newHole->value = NULL;
// Insert the filled hole into the hop list for the neighborhood.
insertInHopList(bucket, hole);
return newHole;
}
// We couldn't find an entry to relocate to the hole.
return NULL;
}
/**
* Find and update any existing mapping for a given key, returning the value
* associated with the key in the provided pointer.
*
* @param [in] map the PointerMap to attempt to modify
* @param [in] neighborhood the first bucket in the neighborhood that
* would contain the search key
* @param [in] key the key with which to associate the new value
* @param [in] newValue the value to be associated with the key
* @param [in] update whether to overwrite an existing value
* @param [out] oldValuePtr a pointer in which to store the old value
* (unmodified if no mapping was found)
*
* @return true
if the map contains a mapping for the key
* false
if it does not
**/
static bool updateMapping(PointerMap *map,
Bucket *neighborhood,
const void *key,
void *newValue,
bool update,
void **oldValuePtr)
{
Bucket *bucket = searchHopList(map, neighborhood, key, NULL);
if (bucket == NULL) {
// There is no bucket containing the key in the neighborhood.
return false;
}
// Return the value of the current mapping (if desired) and update the
// mapping with the new value (if desired).
if (oldValuePtr != NULL) {
*oldValuePtr = bucket->value;
}
if (update) {
// We're dropping the old key pointer on the floor here, assuming it's a
// property of the value or that it's otherwise safe to just forget.
bucket->key = key;
bucket->value = newValue;
}
return true;
}
/**
* Find an empty bucket in a specified neighborhood for a new mapping or
* attempt to re-arrange mappings so there is such a bucket. This operation
* may fail (returning NULL) if an empty bucket is not available or could not
* be relocated to the neighborhood.
*
* @param map the PointerMap to search or modify
* @param neighborhood the first bucket in the neighborhood in which
* an empty bucket is needed for a new mapping
*
* @return a pointer to an empty bucket in the desired neighborhood, or
* NULL
if a vacancy could not be found or arranged
**/
static Bucket *findOrMakeVacancy(PointerMap *map, Bucket *neighborhood)
{
// Probe within and beyond the neighborhood for the first empty bucket.
Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES);
// Keep trying until the empty bucket is in the bucket's neighborhood or we
// are unable to move it any closer by swapping it with a filled bucket.
while (hole != NULL) {
int distance = hole - neighborhood;
if (distance < NEIGHBORHOOD) {
// We've found or relocated an empty bucket close enough to the initial
// hash bucket to be referenced by its hop vector.
return hole;
}
// The nearest empty bucket isn't within the neighborhood that must
// contain the new entry, so try to swap it with bucket that is closer.
hole = moveEmptyBucket(map, hole);
}
return NULL;
}
/**********************************************************************/
int pointerMapPut(PointerMap *map,
const void *key,
void *newValue,
bool update,
void **oldValuePtr)
{
if (newValue == NULL) {
return UDS_INVALID_ARGUMENT;
}
// Select the bucket at the start of the neighborhood that must contain any
// entry for the provided key.
Bucket *neighborhood = selectBucket(map, key);
// Check whether the neighborhood already contains an entry for the key, in
// which case we optionally update it, returning the old value.
if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) {
return UDS_SUCCESS;
}
/*
* Find an empty bucket in the desired neighborhood for the new entry or
* re-arrange entries in the map so there is such a bucket. This operation
* will usually succeed; the loop body will only be executed on the rare
* occasions that we have to resize the map.
*/
Bucket *bucket;
while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) {
/*
* There is no empty bucket in which to put the new entry in the current
* map, so we're forced to allocate a new bucket array with a larger
* capacity, re-hash all the entries into those buckets, and try again (a
* very expensive operation for large maps).
*/
int result = resizeBuckets(map);
if (result != UDS_SUCCESS) {
return result;
}
// Resizing the map invalidates all pointers to buckets, so recalculate
// the neighborhood pointer.
neighborhood = selectBucket(map, key);
}
// Put the new entry in the empty bucket, adding it to the neighborhood.
bucket->key = key;
bucket->value = newValue;
insertInHopList(neighborhood, bucket);
map->size += 1;
// There was no existing entry, so there was no old value to be returned.
if (oldValuePtr != NULL) {
*oldValuePtr = NULL;
}
return UDS_SUCCESS;
}
/**********************************************************************/
void *pointerMapRemove(PointerMap *map, const void *key)
{
// Select the bucket to search and search it for an existing entry.
Bucket *bucket = selectBucket(map, key);
Bucket *previous;
Bucket *victim = searchHopList(map, bucket, key, &previous);
if (victim == NULL) {
// There is no matching entry to remove.
return NULL;
}
// We found an entry to remove. Save the mapped value to return later and
// empty the bucket.
map->size -= 1;
void *value = victim->value;
victim->value = NULL;
victim->key = 0;
// The victim bucket is now empty, but it still needs to be spliced out of
// the hop list.
if (previous == NULL) {
// The victim is the head of the list, so swing firstHop.
bucket->firstHop = victim->nextHop;
} else {
previous->nextHop = victim->nextHop;
}
victim->nextHop = NULL_HOP_OFFSET;
return value;
}