Blob Blame History Raw
/*
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA. 
 *
 * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.c#7 $
 */

#include "slabSummary.h"

#include "memoryAlloc.h"

#include "adminState.h"
#include "constants.h"
#include "extent.h"
#include "readOnlyNotifier.h"
#include "slabSummaryInternals.h"
#include "threadConfig.h"
#include "types.h"

// SIZING

/**********************************************************************/
static BlockCount getSlabSummaryZoneSize(BlockSize blockSize)
{
  SlabCount entriesPerBlock = blockSize / sizeof(SlabSummaryEntry);
  BlockCount blocksNeeded   = MAX_SLABS / entriesPerBlock;
  return blocksNeeded;
}

/**********************************************************************/
BlockCount getSlabSummarySize(BlockSize blockSize)
{
  return getSlabSummaryZoneSize(blockSize) * MAX_PHYSICAL_ZONES;
}

// FULLNESS HINT COMPUTATION

/**
 * Translate a slab's free block count into a 'fullness hint' that can be
 * stored in a SlabSummaryEntry's 7 bits that are dedicated to its free count.
 *
 * Note: the number of free blocks must be strictly less than 2^23 blocks,
 * even though theoretically slabs could contain precisely 2^23 blocks; there
 * is an assumption that at least one block is used by metadata. This
 * assumption is necessary; otherwise, the fullness hint might overflow.
 * The fullness hint formula is roughly (fullness >> 16) & 0x7f, but
 * ((1 << 23) >> 16) & 0x7f is the same as (0 >> 16) & 0x7f, namely 0, which
 * is clearly a bad hint if it could indicate both 2^23 free blocks or 0 free
 * blocks.
 *
 * @param summary     The summary which is being updated
 * @param freeBlocks  The number of free blocks
 *
 * @return A fullness hint, which can be stored in 7 bits.
 **/
__attribute__((warn_unused_result))
static uint8_t computeFullnessHint(SlabSummary *summary, BlockCount freeBlocks)
{
  ASSERT_LOG_ONLY((freeBlocks < (1 << 23)),
                  "free blocks must be less than 2^23");

  if (freeBlocks == 0) {
    return 0;
  }

  BlockCount hint = freeBlocks >> summary->hintShift;
  return ((hint == 0) ? 1 : hint);
}

/**
 * Translate a slab's free block hint into an approximate count, such that
 * computeFullnessHint() is the inverse function of getApproximateFreeBlocks()
 * (i.e. computeFullnessHint(getApproximateFreeBlocks(x)) == x).
 *
 * @param  summary        The summary from which the hint was obtained
 * @param  freeBlockHint  The hint read from the summary
 *
 * @return An approximation to the free block count
 **/
__attribute__((warn_unused_result))
static BlockCount getApproximateFreeBlocks(SlabSummary *summary,
                                           uint8_t      freeBlockHint)
{
  return ((BlockCount) freeBlockHint) << summary->hintShift;
}

// MAKE/FREE FUNCTIONS

/**********************************************************************/
static void launchWrite(SlabSummaryBlock *summaryBlock);

/**
 * Initialize a SlabSummaryBlock.
 *
 * @param layer             The backing layer
 * @param summaryZone       The parent SlabSummaryZone
 * @param threadID          The ID of the thread of physical zone of this block
 * @param entries           The entries this block manages
 * @param index             The index of this block in its zone's summary
 * @param slabSummaryBlock  The block to intialize
 *
 * @return VDO_SUCCESS or an error
 **/
static int initializeSlabSummaryBlock(PhysicalLayer    *layer,
                                      SlabSummaryZone  *summaryZone,
                                      ThreadID          threadID,
                                      SlabSummaryEntry *entries,
                                      BlockCount        index,
                                      SlabSummaryBlock *slabSummaryBlock)
{
  int result = ALLOCATE(VDO_BLOCK_SIZE, char, __func__,
                        &slabSummaryBlock->outgoingEntries);
  if (result != VDO_SUCCESS) {
    return result;
  }

  result = createVIO(layer, VIO_TYPE_SLAB_SUMMARY, VIO_PRIORITY_METADATA,
                     slabSummaryBlock, slabSummaryBlock->outgoingEntries,
                     &slabSummaryBlock->vio);
  if (result != VDO_SUCCESS) {
    return result;
  }

  slabSummaryBlock->vio->completion.callbackThreadID = threadID;
  slabSummaryBlock->zone                             = summaryZone;
  slabSummaryBlock->entries                          = entries;
  slabSummaryBlock->index                            = index;
  return VDO_SUCCESS;
}

/**
 * Create a new, empty SlabSummaryZone object.
 *
 * @param summary     The summary to which the new zone will belong
 * @param layer       The layer
 * @param zoneNumber  The zone this is
 * @param threadID    The ID of the thread for this zone
 * @param entries     The buffer to hold the entries in this zone
 *
 * @return VDO_SUCCESS or an error
 **/
static int makeSlabSummaryZone(SlabSummary      *summary,
                               PhysicalLayer    *layer,
                               ZoneCount         zoneNumber,
                               ThreadID          threadID,
                               SlabSummaryEntry *entries)
{
  int result = ALLOCATE_EXTENDED(SlabSummaryZone, summary->blocksPerZone,
                                 SlabSummaryBlock, __func__,
                                 &summary->zones[zoneNumber]);
  if (result != VDO_SUCCESS) {
    return result;
  }

  SlabSummaryZone *summaryZone = summary->zones[zoneNumber];
  summaryZone->summary         = summary;
  summaryZone->zoneNumber      = zoneNumber;
  summaryZone->entries         = entries;

  if (layer->createMetadataVIO == NULL) {
    // Blocks are only used for writing, and without a createVIO() call,
    // we'll never be writing anything.
    return VDO_SUCCESS;
  }

  // Initialize each block.
  for (BlockCount i = 0; i < summary->blocksPerZone; i++) {
    result = initializeSlabSummaryBlock(layer, summaryZone, threadID, entries,
                                        i, &summaryZone->summaryBlocks[i]);
    if (result != VDO_SUCCESS) {
      return result;
    }
    entries += summary->entriesPerBlock;
  }

  return VDO_SUCCESS;
}

/**********************************************************************/
int makeSlabSummary(PhysicalLayer       *layer,
                    Partition           *partition,
                    const ThreadConfig  *threadConfig,
                    unsigned int         slabSizeShift,
                    BlockCount           maximumFreeBlocksPerSlab,
                    ReadOnlyNotifier    *readOnlyNotifier,
                    SlabSummary        **slabSummaryPtr)
{
  BlockCount blocksPerZone   = getSlabSummaryZoneSize(VDO_BLOCK_SIZE);
  SlabCount  entriesPerBlock = MAX_SLABS / blocksPerZone;
  int result = ASSERT((entriesPerBlock * blocksPerZone) == MAX_SLABS,
                      "block size must be a multiple of entry size");
  if (result != VDO_SUCCESS) {
    return result;
  }

  if (partition == NULL) {
    // Don't make a slab summary for the formatter since it doesn't need it.
    return VDO_SUCCESS;
  }

  SlabSummary *summary;
  result = ALLOCATE_EXTENDED(SlabSummary, threadConfig->physicalZoneCount,
                             SlabSummaryZone *, __func__, &summary);
  if (result != VDO_SUCCESS) {
    return result;
  }

  summary->zoneCount       = threadConfig->physicalZoneCount;
  summary->readOnlyNotifier = readOnlyNotifier;
  summary->hintShift       = (slabSizeShift > 6) ? (slabSizeShift - 6) : 0;
  summary->blocksPerZone   = blocksPerZone;
  summary->entriesPerBlock = entriesPerBlock;

  size_t totalEntries = MAX_SLABS * MAX_PHYSICAL_ZONES;
  size_t entryBytes = totalEntries * sizeof(SlabSummaryEntry);
  result = layer->allocateIOBuffer(layer, entryBytes, "summary entries",
                                   (char **) &summary->entries);
  if (result != VDO_SUCCESS) {
    freeSlabSummary(&summary);
    return result;
  }

  // Initialize all the entries.
  uint8_t hint = computeFullnessHint(summary, maximumFreeBlocksPerSlab);
  for (size_t i = 0; i < totalEntries; i++) {
    // This default tail block offset must be reflected in
    // slabJournal.c::readSlabJournalTail().
    summary->entries[i] = (SlabSummaryEntry) {
      .tailBlockOffset = 0,
      .fullnessHint    = hint,
      .loadRefCounts   = false,
      .isDirty         = false,
    };
  }

  setSlabSummaryOrigin(summary, partition);
  for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) {
    result = makeSlabSummaryZone(summary, layer, zone,
                                 getPhysicalZoneThread(threadConfig, zone),
                                 summary->entries + (MAX_SLABS * zone));
    if (result != VDO_SUCCESS) {
      freeSlabSummary(&summary);
      return result;
    }
  }

  *slabSummaryPtr = summary;
  return VDO_SUCCESS;
}

/**********************************************************************/
void freeSlabSummary(SlabSummary **slabSummaryPtr)
{
  if (*slabSummaryPtr == NULL) {
    return;
  }

  SlabSummary *summary = *slabSummaryPtr;
  for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) {
    SlabSummaryZone *summaryZone = summary->zones[zone];
    if (summaryZone != NULL) {
      for (BlockCount i = 0; i < summary->blocksPerZone; i++) {
        freeVIO(&summaryZone->summaryBlocks[i].vio);
        FREE(summaryZone->summaryBlocks[i].outgoingEntries);
      }
      FREE(summaryZone);
    }
  }
  FREE(summary->entries);
  FREE(summary);
  *slabSummaryPtr = NULL;
}

/**********************************************************************/
SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone)
{
  return summary->zones[zone];
}

// WRITING FUNCTIONALITY

/**
 * Check whether a summary zone has finished draining.
 *
 * @param summaryZone  The zone to check
 **/
static void checkForDrainComplete(SlabSummaryZone *summaryZone)
{
  if (!isDraining(&summaryZone->state) || (summaryZone->writeCount > 0)) {
    return;
  }

  finishOperationWithResult(&summaryZone->state,
                            (isReadOnly(summaryZone->summary->readOnlyNotifier)
                             ? VDO_READ_ONLY : VDO_SUCCESS));
}

/**
 * Wake all the waiters in a given queue. If the VDO is in read-only mode they
 * will be given a VDO_READ_ONLY error code as their context, otherwise they
 * will be given VDO_SUCCESS.
 *
 * @param summaryZone  The slab summary which owns the queue
 * @param queue        The queue to notify
 **/
static void notifyWaiters(SlabSummaryZone *summaryZone, WaitQueue *queue)
{
  int result = (isReadOnly(summaryZone->summary->readOnlyNotifier)
                ? VDO_READ_ONLY : VDO_SUCCESS);
  notifyAllWaiters(queue, NULL, &result);
}

/**
 * Finish processing a block which attempted to write, whether or not the
 * attempt succeeded.
 *
 * @param block  The block
 **/
static void finishUpdatingSlabSummaryBlock(SlabSummaryBlock *block)
{
  notifyWaiters(block->zone, &block->currentUpdateWaiters);
  block->writing = false;
  block->zone->writeCount--;
  if (hasWaiters(&block->nextUpdateWaiters)) {
    launchWrite(block);
  } else {
    checkForDrainComplete(block->zone);
  }
}

/**
 * This is the callback for a successful block write.
 *
 * @param completion  The write VIO
 **/
static void finishUpdate(VDOCompletion *completion)
{
  SlabSummaryBlock *block = completion->parent;
  atomicAdd64(&block->zone->summary->statistics.blocksWritten, 1);
  finishUpdatingSlabSummaryBlock(block);
}

/**
 * Handle an error writing a slab summary block.
 *
 * @param completion  The write VIO
 **/
static void handleWriteError(VDOCompletion *completion)
{
  SlabSummaryBlock *block = completion->parent;
  enterReadOnlyMode(block->zone->summary->readOnlyNotifier,
                    completion->result);
  finishUpdatingSlabSummaryBlock(block);
}

/**
 * Write a slab summary block unless it is currently out for writing.
 *
 * @param [in] block  The block that needs to be committed
 **/
static void launchWrite(SlabSummaryBlock *block)
{
  if (block->writing) {
    return;
  }

  SlabSummaryZone *zone = block->zone;
  zone->writeCount++;
  transferAllWaiters(&block->nextUpdateWaiters, &block->currentUpdateWaiters);
  block->writing = true;

  SlabSummary *summary = zone->summary;
  if (isReadOnly(summary->readOnlyNotifier)) {
    finishUpdatingSlabSummaryBlock(block);
    return;
  }

  memcpy(block->outgoingEntries, block->entries,
         sizeof(SlabSummaryEntry) * summary->entriesPerBlock);

  // Flush before writing to ensure that the slab journal tail blocks and
  // reference updates covered by this summary update are stable (VDO-2332).
  PhysicalBlockNumber pbn = (summary->origin
                             + (summary->blocksPerZone * zone->zoneNumber)
                             + block->index);
  launchWriteMetadataVIOWithFlush(block->vio, pbn, finishUpdate,
                                  handleWriteError, true, false);
}

/**
 * Initiate a drain.
 *
 * Implements AdminInitiator.
 **/
static void initiateDrain(AdminState *state)
{
  checkForDrainComplete(container_of(state, SlabSummaryZone, state));
}

/**********************************************************************/
void drainSlabSummaryZone(SlabSummaryZone *summaryZone,
                          AdminStateCode   operation,
                          VDOCompletion   *parent)
{
  startDraining(&summaryZone->state, operation, parent, initiateDrain);
}

/**********************************************************************/
void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, VDOCompletion *parent)
{
  finishCompletion(parent, resumeIfQuiescent(&summaryZone->state));
}

// READ/UPDATE FUNCTIONS

/**
 * Get the summary block, and offset into it, for storing the summary for a
 * slab.
 *
 * @param summaryZone    The SlabSummaryZone being queried
 * @param slabNumber     The slab whose summary location is sought
 *
 * @return A pointer to the SlabSummaryEntryBlock containing this
 *         SlabSummaryEntry
 **/
static SlabSummaryBlock *getSummaryBlockForSlab(SlabSummaryZone *summaryZone,
                                                SlabCount        slabNumber)
{
  SlabCount entriesPerBlock = summaryZone->summary->entriesPerBlock;
  return &summaryZone->summaryBlocks[slabNumber / entriesPerBlock];
}

/**********************************************************************/
void updateSlabSummaryEntry(SlabSummaryZone *summaryZone,
                            Waiter          *waiter,
                            SlabCount        slabNumber,
                            TailBlockOffset  tailBlockOffset,
                            bool             loadRefCounts,
                            bool             isClean,
                            BlockCount       freeBlocks)
{
  SlabSummaryBlock *block = getSummaryBlockForSlab(summaryZone, slabNumber);
  int               result;
  if (isReadOnly(summaryZone->summary->readOnlyNotifier)) {
    result = VDO_READ_ONLY;
  } else if (isDraining(&summaryZone->state)
             || isQuiescent(&summaryZone->state)) {
    result = VDO_INVALID_ADMIN_STATE;
  } else {
    uint8_t hint = computeFullnessHint(summaryZone->summary, freeBlocks);
    SlabSummaryEntry *entry = &summaryZone->entries[slabNumber];
    *entry = (SlabSummaryEntry) {
      .tailBlockOffset = tailBlockOffset,
      .loadRefCounts   = (entry->loadRefCounts || loadRefCounts),
      .isDirty         = !isClean,
      .fullnessHint    = hint,
    };
    result = enqueueWaiter(&block->nextUpdateWaiters, waiter);
  }

  if (result != VDO_SUCCESS) {
    waiter->callback(waiter, &result);
    return;
  }

  launchWrite(block);
}

/**********************************************************************/
TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone,
                                             SlabCount        slabNumber)
{
  return summaryZone->entries[slabNumber].tailBlockOffset;
}

/**********************************************************************/
bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber)
{
  return summaryZone->entries[slabNumber].loadRefCounts;
}

/**********************************************************************/
bool getSummarizedCleanliness(SlabSummaryZone *summaryZone,
                              SlabCount        slabNumber)
{
  return !summaryZone->entries[slabNumber].isDirty;
}

/**********************************************************************/
BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone,
                                       SlabCount        slabNumber)
{
  SlabSummaryEntry *entry = &summaryZone->entries[slabNumber];
  return getApproximateFreeBlocks(summaryZone->summary, entry->fullnessHint);
}

/**********************************************************************/
void getSummarizedRefCountsState(SlabSummaryZone *summaryZone,
                                 SlabCount        slabNumber,
                                 size_t          *freeBlockHint,
                                 bool            *isClean)
{
  SlabSummaryEntry *entry = &summaryZone->entries[slabNumber];
  *freeBlockHint          = entry->fullnessHint;
  *isClean                = !entry->isDirty;
}

/**********************************************************************/
void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone,
                               SlabCount        slabCount,
                               SlabStatus      *statuses)
{
  for (SlabCount i = 0; i < slabCount; i++) {
    statuses[i] = (SlabStatus) {
      .slabNumber = i,
      .isClean    = !summaryZone->entries[i].isDirty,
      .emptiness  = summaryZone->entries[i].fullnessHint
    };
  }
}

// RESIZE FUNCTIONS

/**********************************************************************/
void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition)
{
  summary->origin = getFixedLayoutPartitionOffset(partition);
}

// COMBINING FUNCTIONS (LOAD)

/**
 * Clean up after saving out the combined slab summary. This callback is
 * registered in finishLoadingSummary() and loadSlabSummary().
 *
 * @param completion  The extent which was used to write the summary data
 **/
static void finishCombiningZones(VDOCompletion *completion)
{
  SlabSummary *summary = completion->parent;
  int          result  = completion->result;
  VDOExtent   *extent  = asVDOExtent(completion);
  freeExtent(&extent);
  finishLoadingWithResult(&summary->zones[0]->state, result);
}

/**********************************************************************/
void combineZones(SlabSummary *summary)
{
  // Combine all the old summary data into the portion of the buffer
  // corresponding to the first zone.
  ZoneCount zone = 0;
  if (summary->zonesToCombine > 1) {
    for (SlabCount entryNumber = 0; entryNumber < MAX_SLABS; entryNumber++) {
      if (zone != 0) {
        memcpy(summary->entries + entryNumber,
               summary->entries + (zone * MAX_SLABS) + entryNumber,
               sizeof(SlabSummaryEntry));
      }
      zone++;
      if (zone == summary->zonesToCombine) {
        zone = 0;
      }
    }
  }

  // Copy the combined data to each zones's region of the buffer.
  for (zone = 1; zone < MAX_PHYSICAL_ZONES; zone++) {
    memcpy(summary->entries + (zone * MAX_SLABS), summary->entries,
           MAX_SLABS * sizeof(SlabSummaryEntry));
  }
}

/**
 * Combine the slab summary data from all the previously written zones
 * and copy the combined summary to each partition's data region. Then write
 * the combined summary back out to disk. This callback is registered in
 * loadSlabSummary().
 *
 * @param completion  The extent which was used to read the summary data
 **/
static void finishLoadingSummary(VDOCompletion *completion)
{
  SlabSummary *summary = completion->parent;
  VDOExtent   *extent  = asVDOExtent(completion);

  // Combine the zones so each zone is correct for all slabs.
  combineZones(summary);

  // Write the combined summary back out.
  extent->completion.callback = finishCombiningZones;
  writeMetadataExtent(extent, summary->origin);
}

/**********************************************************************/
void loadSlabSummary(SlabSummary    *summary,
                     AdminStateCode  operation,
                     ZoneCount       zonesToCombine,
                     VDOCompletion  *parent)
{
  SlabSummaryZone *zone = summary->zones[0];
  if (!startLoading(&zone->state, operation, parent, NULL)) {
    return;
  }

  VDOExtent *extent;
  BlockCount blocks = summary->blocksPerZone * MAX_PHYSICAL_ZONES;
  int        result = createExtent(parent->layer, VIO_TYPE_SLAB_SUMMARY,
                                   VIO_PRIORITY_METADATA, blocks,
                                   (char *) summary->entries, &extent);
  if (result != VDO_SUCCESS) {
    finishLoadingWithResult(&zone->state, result);
    return;
  }

  if ((operation == ADMIN_STATE_FORMATTING)
      || (operation == ADMIN_STATE_LOADING_FOR_REBUILD)) {
    prepareCompletion(&extent->completion, finishCombiningZones,
                      finishCombiningZones, 0, summary);
    writeMetadataExtent(extent, summary->origin);
    return;
  }

  summary->zonesToCombine = zonesToCombine;
  prepareCompletion(&extent->completion, finishLoadingSummary,
                    finishCombiningZones, 0, summary);
  readMetadataExtent(extent, summary->origin);
}

/**********************************************************************/
SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary)
{
  const AtomicSlabSummaryStatistics *atoms = &summary->statistics;
  return (SlabSummaryStatistics) {
    .blocksWritten = atomicLoad64(&atoms->blocksWritten),
  };
}