/*
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA. 
 *
 * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.c#9 $
 */

#include "slab.h"

#include "logger.h"
#include "memoryAlloc.h"

#include "adminState.h"
#include "blockAllocatorInternals.h"
#include "completion.h"
#include "constants.h"
#include "numUtils.h"
#include "pbnLock.h"
#include "recoveryJournal.h"
#include "refCounts.h"
#include "slabDepot.h"
#include "slabJournal.h"
#include "slabJournalInternals.h"
#include "slabSummary.h"

/**********************************************************************/
/**
 * Derive a complete slab configuration from a slab size and a slab journal
 * size.
 *
 * @param slabSize           The total number of blocks in a slab
 * @param slabJournalBlocks  The number of blocks devoted to the slab journal
 * @param slabConfig         The configuration to fill in; it is written only
 *                           when VDO_SUCCESS is returned
 *
 * @return VDO_SUCCESS, or VDO_BAD_CONFIGURATION if the slab journal and
 *         reference count blocks would leave no room for data blocks
 **/
int configureSlab(BlockCount  slabSize,
                  BlockCount  slabJournalBlocks,
                  SlabConfig *slabConfig)
{
  // The slab journal must be strictly smaller than the slab itself.
  if (slabSize <= slabJournalBlocks) {
    return VDO_BAD_CONFIGURATION;
  }

  /*
   * Size the reference counts as though every non-journal block held data.
   * Technically this should be a recurrence, but the total number of
   * metadata blocks is currently less than a single block of refCounts, so
   * iterating further would gain at most one data block per slab.
   */
  BlockCount nonJournalBlocks = slabSize - slabJournalBlocks;
  BlockCount refCountBlocks   = getSavedReferenceCountSize(nonJournalBlocks);
  BlockCount metadataBlocks   = refCountBlocks + slabJournalBlocks;

  // Guard against test code configuring slabs which are too small.
  if (slabSize <= metadataBlocks) {
    return VDO_BAD_CONFIGURATION;
  }

  /*
   * A very small slab is assumed to come from a unit test, so the number of
   * data blocks is rounded down to a power of two (wasting blocks in the
   * slab). Many tests need their dataBlocks fields to be the exact capacity
   * of the configured volume; that used to fall out naturally when the slab
   * size was a power of two and every block in a slab was a data block.
   *
   * XXX Try to figure out some way of structuring testParameters and unit
   * tests so this hack isn't needed without having to edit several unit tests
   * every time the metadata size changes by one block.
   */
  BlockCount dataBlocks = slabSize - metadataBlocks;
  if ((slabSize < 1024) && !isPowerOfTwo(dataBlocks)) {
    dataBlocks = ((BlockCount) 1 << logBaseTwo(dataBlocks));
  }

  /*
   * Slab journal thresholds. The flush threshold is 168 of 224 blocks in
   * production, or 3/4ths, so that ratio (rounded up) is used for all sizes.
   */
  BlockCount flushLimit = ((slabJournalBlocks * 3) + 3) / 4;

  /*
   * The blocking threshold should be far enough from the flushing threshold
   * to not produce delays, but far enough from the end of the journal to
   * allow multiple successive recovery failures.
   */
  BlockCount pastFlushLimit = slabJournalBlocks - flushLimit;
  BlockCount blockLimit     = flushLimit + ((pastFlushLimit * 5) / 7);

  /*
   * The scrubbing threshold should be at least 2048 entries before the end
   * of the journal. When the journal is too small to reserve that much, the
   * (unclamped) blocking threshold is used instead.
   */
  BlockCount reservedTail
    = 1 + (MAXIMUM_USER_VIOS / SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK);
  BlockCount scrubLimit = ((slabJournalBlocks > reservedTail)
                           ? (slabJournalBlocks - reservedTail)
                           : blockLimit);

  // The blocking threshold never exceeds the scrubbing threshold.
  if (blockLimit > scrubLimit) {
    blockLimit = scrubLimit;
  }

  *slabConfig = (SlabConfig) {
    .slabBlocks                    = slabSize,
    .dataBlocks                    = dataBlocks,
    .referenceCountBlocks          = refCountBlocks,
    .slabJournalBlocks             = slabJournalBlocks,
    .slabJournalFlushingThreshold  = flushLimit,
    .slabJournalBlockingThreshold  = blockLimit,
    .slabJournalScrubbingThreshold = scrubLimit
  };
  return VDO_SUCCESS;
}

/**********************************************************************/
/**
 * Compute the block number at which a slab's journal begins: the slab origin
 * advanced past the configured data blocks and reference count blocks.
 *
 * @param slabConfig  The slab configuration supplying the block counts
 * @param origin      The first block of the slab
 *
 * @return The block number of the start of the slab journal
 **/
PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig    *slabConfig,
                                             PhysicalBlockNumber  origin)
{
  PhysicalBlockNumber journalStart = origin;
  journalStart += slabConfig->dataBlocks;
  journalStart += slabConfig->referenceCountBlocks;
  return journalStart;
}

/**********************************************************************/
/**
 * Allocate and set up a slab: record its bounds and metadata origins, create
 * its slab journal, and, for a new slab, mark it new and allocate its
 * reference counts. If any step after the allocation fails, the partially
 * constructed slab is freed.
 *
 * @param slabOrigin       The first block of the slab
 * @param allocator        The block allocator to which the slab belongs
 * @param translation      The offset added to the reference count and slab
 *                         journal origins
 * @param recoveryJournal  The recovery journal handed to the slab journal
 * @param slabNumber       The number of the slab
 * @param isNew            Whether the slab is new (its reference counts are
 *                         then allocated immediately)
 * @param slabPtr          Where to store the slab; written only on success
 *
 * @return VDO_SUCCESS or an error code
 **/
int makeSlab(PhysicalBlockNumber   slabOrigin,
             BlockAllocator       *allocator,
             PhysicalBlockNumber   translation,
             RecoveryJournal      *recoveryJournal,
             SlabCount             slabNumber,
             bool                  isNew,
             Slab                **slabPtr)
{
  Slab *slab;
  int result = ALLOCATE(1, Slab, __func__, &slab);
  if (result != VDO_SUCCESS) {
    return result;
  }

  const SlabConfig *config = getSlabConfig(allocator->depot);

  initializeRing(&slab->ringNode);
  slab->slabNumber = slabNumber;
  slab->allocator  = allocator;
  slab->start      = slabOrigin;
  slab->end        = slab->start + config->slabBlocks;

  // The reference counts follow the data blocks; the journal follows them.
  slab->refCountsOrigin = slabOrigin + config->dataBlocks + translation;
  slab->journalOrigin
    = getSlabJournalStartBlock(config, slabOrigin) + translation;

  result = makeSlabJournal(allocator, slab, recoveryJournal, &slab->journal);
  if ((result == VDO_SUCCESS) && isNew) {
    slab->state.state = ADMIN_STATE_NEW;
    result = allocateRefCountsForSlab(slab);
  }

  // Both construction steps share a single cleanup path.
  if (result != VDO_SUCCESS) {
    freeSlab(&slab);
    return result;
  }

  *slabPtr = slab;
  return VDO_SUCCESS;
}

/**********************************************************************/
/**
 * Allocate the reference counts for a slab, sized by the number of data
 * blocks in the depot's slab configuration. Asserts that the slab does not
 * already have reference counts.
 *
 * @param slab  The slab whose reference counts are to be allocated
 *
 * @return VDO_SUCCESS or an error code
 **/
int allocateRefCountsForSlab(Slab *slab)
{
  BlockAllocator   *owner  = slab->allocator;
  const SlabConfig *config = getSlabConfig(owner->depot);

  int result = ASSERT(slab->referenceCounts == NULL,
                      "Slab %u doesn't allocate refcounts twice",
                      slab->slabNumber);
  if (result == VDO_SUCCESS) {
    result = makeRefCounts(config->dataBlocks, slab, slab->refCountsOrigin,
                           owner->readOnlyNotifier, &slab->referenceCounts);
  }

  return result;
}

/**********************************************************************/
/**
 * Destroy a slab and null out the reference to it. Does nothing if the
 * reference is already NULL.
 *
 * @param slabPtr  The reference to the slab to free
 **/
void freeSlab(Slab **slabPtr)
{
  Slab *doomed = *slabPtr;
  if (doomed != NULL) {
    // Detach from any ring before releasing the slab's components.
    unspliceRingNode(&doomed->ringNode);
    freeSlabJournal(&doomed->journal);
    freeRefCounts(&doomed->referenceCounts);
    FREE(doomed);
    *slabPtr = NULL;
  }
}

/**********************************************************************/
/**
 * Get the zone number recorded in the block allocator which owns a slab.
 *
 * @param slab  The slab to query
 *
 * @return The zone number of the slab's allocator
 **/
ZoneCount getSlabZoneNumber(Slab *slab)
{
  const BlockAllocator *owner = slab->allocator;
  return owner->zoneNumber;
}

/**********************************************************************/
/**
 * Move a slab from SLAB_REBUILT to SLAB_REPLAYING. A slab in any other status
 * is left untouched.
 *
 * @param slab  The slab to mark
 **/
void markSlabReplaying(Slab *slab)
{
  if (slab->status != SLAB_REBUILT) {
    return;
  }

  slab->status = SLAB_REPLAYING;
}

/**********************************************************************/
/**
 * Unconditionally set a slab's status to SLAB_REQUIRES_SCRUBBING.
 *
 * @param slab  The slab to mark
 **/
void markSlabUnrecovered(Slab *slab)
{
  const SlabRebuildStatus unrecovered = SLAB_REQUIRES_SCRUBBING;
  slab->status = unrecovered;
}

/**********************************************************************/
/**
 * Get the number of unreferenced blocks reported by a slab's reference
 * counts.
 *
 * @param slab  The slab to query
 *
 * @return The unreferenced block count of the slab's reference counts
 **/
BlockCount getSlabFreeBlockCount(const Slab *slab)
{
  BlockCount unreferenced = getUnreferencedBlockCount(slab->referenceCounts);
  return unreferenced;
}

/**********************************************************************/
/**
 * Apply a reference operation to a slab's reference counts, adjusting the
 * slab's free block count when the free status of the block changes.
 *
 * @param slab          The slab to modify; a NULL slab is a successful no-op
 * @param journalPoint  The journal point associated with the operation
 * @param operation     The reference operation to apply
 *
 * @return VDO_SUCCESS or an error code from adjusting the reference count
 **/
int modifySlabReferenceCount(Slab               *slab,
                             const JournalPoint *journalPoint,
                             ReferenceOperation  operation)
{
  if (slab == NULL) {
    return VDO_SUCCESS;
  }

  if (isUnrecoveredSlab(slab)) {
    /*
     * Leave the refCount state of an unrecovered slab alone and let
     * scrubbing correct it; the slab journal has already captured all
     * refCount updates. Only the journal block reference is adjusted.
     */
    SequenceNumber lockedBlock = journalPoint->sequenceNumber;
    adjustSlabJournalBlockReference(slab->journal, lockedBlock, -1);
    return VDO_SUCCESS;
  }

  bool freeStatusChanged;
  int result = adjustReferenceCount(slab->referenceCounts, operation,
                                    journalPoint, &freeStatusChanged);

  // freeStatusChanged is only consulted once the adjustment has succeeded.
  if ((result == VDO_SUCCESS) && freeStatusChanged) {
    adjustFreeBlockCount(slab, !isIncrementOperation(operation.type));
  }

  return result;
}

/**********************************************************************/
/**
 * Provisionally reference a block on behalf of a PBN lock, unless the lock
 * already holds a provisional reference. If the lock holds one afterwards,
 * the slab's free block count is adjusted.
 *
 * @param slab  The slab containing the block
 * @param pbn   The physical block number to reference
 * @param lock  The lock which will hold the provisional reference
 *
 * @return VDO_SUCCESS or an error code from provisionally referencing the
 *         block
 **/
int acquireProvisionalReference(Slab                *slab,
                                PhysicalBlockNumber  pbn,
                                PBNLock             *lock)
{
  // Nothing to do when the lock already holds a provisional reference.
  if (hasProvisionalReference(lock)) {
    return VDO_SUCCESS;
  }

  int result = provisionallyReferenceBlock(slab->referenceCounts, pbn, lock);
  if ((result == VDO_SUCCESS) && hasProvisionalReference(lock)) {
    adjustFreeBlockCount(slab, false);
  }

  return result;
}

/**********************************************************************/
/**
 * Convert a physical block number into an offset within a slab's data
 * blocks.
 *
 * @param slab                 The slab expected to contain the block
 * @param physicalBlockNumber  The physical block number to convert
 * @param slabBlockNumberPtr   Where to store the offset; written only on
 *                             success
 *
 * @return VDO_SUCCESS, or VDO_OUT_OF_RANGE if the block lies before the start
 *         of the slab or at or beyond the end of its data blocks
 **/
int slabBlockNumberFromPBN(Slab                *slab,
                           PhysicalBlockNumber  physicalBlockNumber,
                           SlabBlockNumber     *slabBlockNumberPtr)
{
  if (physicalBlockNumber >= slab->start) {
    uint64_t offset = physicalBlockNumber - slab->start;
    const SlabConfig *config = getSlabConfig(slab->allocator->depot);
    if (offset < config->dataBlocks) {
      *slabBlockNumberPtr = offset;
      return VDO_SUCCESS;
    }
  }

  return VDO_OUT_OF_RANGE;
}

/**********************************************************************/
/**
 * Decide whether the reference counts of a fully built slab should be
 * written out.
 *
 * @param slab  The slab to check
 *
 * @return true if the slab summary says the refCounts must be loaded, if
 *         the slab's free block count differs from its data block count, or
 *         if the slab journal is not blank
 **/
bool shouldSaveFullyBuiltSlab(const Slab *slab)
{
  BlockCount dataBlocks = getSlabConfig(slab->allocator->depot)->dataBlocks;

  // The refCounts have been written before.
  if (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) {
    return true;
  }

  // There are non-zero reference counts.
  if (getSlabFreeBlockCount(slab) != dataBlocks) {
    return true;
  }

  // Otherwise, save only if there are any slab journal blocks.
  return !isSlabJournalBlank(slab->journal);
}

/**
 * Start the slab operation selected by an admin state: a drain, a load, or a
 * resume. Any other state finishes the operation with
 * VDO_INVALID_ADMIN_STATE.
 *
 * Implements AdminInitiator.
 *
 * @param state  The admin state embedded in the slab to act upon
 **/
static void initiateSlabAction(AdminState *state)
{
  Slab *slab = container_of(state, Slab, state);

  if (isDraining(state)) {
    if (state->state == ADMIN_STATE_SCRUBBING) {
      slab->status = SLAB_REBUILDING;
    }

    // Drain the journal first, then the reference counts if there are any.
    drainSlabJournal(slab->journal);
    if (slab->referenceCounts != NULL) {
      drainRefCounts(slab->referenceCounts);
    }

    checkIfSlabDrained(slab);
  } else if (isLoading(state)) {
    decodeSlabJournal(slab->journal);
  } else if (isResuming(state)) {
    queueSlab(slab);
    finishResuming(state);
  } else {
    finishOperationWithResult(state, VDO_INVALID_ADMIN_STATE);
  }
}

/**********************************************************************/
/**
 * Start an administrative operation on a slab, using initiateSlabAction() as
 * the initiator.
 *
 * @param slab       The slab to operate on
 * @param operation  The admin state code of the operation to start
 * @param parent     The completion passed along as the operation's waiter
 **/
void startSlabAction(Slab           *slab,
                     AdminStateCode  operation,
                     VDOCompletion  *parent)
{
  AdminState *slabState = &slab->state;
  startOperationWithWaiter(slabState, operation, parent, initiateSlabAction);
}

/**********************************************************************/
/**
 * Finish loading a slab once its journal has been loaded. After a successful
 * clean load, the slab's reference counts are allocated first and the result
 * of that allocation becomes the result of the load.
 *
 * @param slab    The slab whose journal has been loaded
 * @param result  The result of loading the journal
 **/
void notifySlabJournalIsLoaded(Slab *slab, int result)
{
  int outcome = result;
  if ((result == VDO_SUCCESS) && isCleanLoad(&slab->state)) {
    // Since this is a normal or new load, we don't need the memory to read
    // and process the recovery journal, so the reference counts can be
    // allocated now.
    outcome = allocateRefCountsForSlab(slab);
  }

  finishLoadingWithResult(&slab->state, outcome);
}

/**********************************************************************/
/**
 * Check whether a slab is open, i.e. its admin state is neither quiescing
 * nor quiescent.
 *
 * @param slab  The slab to check
 *
 * @return true if the slab is open
 **/
bool isSlabOpen(Slab *slab)
{
  AdminState *slabState = &slab->state;
  return !(isQuiescing(slabState) || isQuiescent(slabState));
}

/**********************************************************************/
/**
 * Check whether a slab's admin state is draining.
 *
 * @param slab  The slab to check
 *
 * @return true if the slab is draining
 **/
bool isSlabDraining(Slab *slab)
{
  AdminState *slabState = &slab->state;
  return isDraining(slabState);
}

/**********************************************************************/
/**
 * Finish a slab's drain if it is draining and neither its slab journal nor
 * its reference counts (when it has any) are still active. The drain result
 * is VDO_READ_ONLY if the allocator's read-only notifier reports read-only
 * mode, and VDO_SUCCESS otherwise.
 *
 * @param slab  The slab to check
 **/
void checkIfSlabDrained(Slab *slab)
{
  if (!isDraining(&slab->state)) {
    return;
  }

  if (isSlabJournalActive(slab->journal)) {
    return;
  }

  // A slab without reference counts has nothing further to wait for.
  if ((slab->referenceCounts != NULL)
      && areRefCountsActive(slab->referenceCounts)) {
    return;
  }

  int result = (isReadOnly(slab->allocator->readOnlyNotifier)
                ? VDO_READ_ONLY : VDO_SUCCESS);
  finishDrainingWithResult(&slab->state, result);
}

/**********************************************************************/
/**
 * Continue draining a slab once its journal has drained: record the result
 * and drain the reference counts, or, if the slab has no reference counts,
 * report them as drained immediately.
 *
 * @param slab    The slab whose journal has drained
 * @param result  The result of draining the journal
 **/
void notifySlabJournalIsDrained(Slab *slab, int result)
{
  if (slab->referenceCounts != NULL) {
    setOperationResult(&slab->state, result);
    drainRefCounts(slab->referenceCounts);
    return;
  }

  // There are no reference counts; this can happen when shutting down a VDO
  // that was in read-only mode when loaded.
  notifyRefCountsAreDrained(slab, result);
}

/**********************************************************************/
/**
 * Finish draining a slab with the given result once its reference counts
 * have drained.
 *
 * @param slab    The slab whose reference counts have drained
 * @param result  The result with which to finish the drain
 **/
void notifyRefCountsAreDrained(Slab *slab, int result)
{
  AdminState *slabState = &slab->state;
  finishDrainingWithResult(slabState, result);
}

/**********************************************************************/
/**
 * Check whether a slab's admin state is resuming.
 *
 * @param slab  The slab to check
 *
 * @return true if the slab is resuming
 **/
bool isSlabResuming(Slab *slab)
{
  AdminState *slabState = &slab->state;
  return isResuming(slabState);
}

/**********************************************************************/
/**
 * Complete the scrubbing of a slab: mark it SLAB_REBUILT, queue it, and
 * reopen its slab journal, in that order.
 *
 * @param slab  The slab which has been scrubbed
 **/
void finishScrubbingSlab(Slab *slab)
{
  const SlabRebuildStatus rebuilt = SLAB_REBUILT;
  slab->status = rebuilt;

  // The status is updated before the slab is queued and its journal reopened.
  queueSlab(slab);
  reopenSlabJournal(slab->journal);
}

/**********************************************************************/
/**
 * Get a printable name for a slab rebuild status.
 *
 * @param status  The status to name
 *
 * @return A constant string naming the status, or "UNKNOWN" if the status is
 *         not one of the recognized values
 **/
static const char *statusToString(SlabRebuildStatus status)
{
  const char *name = "UNKNOWN";

  switch (status) {
  case SLAB_REBUILT:
    name = "REBUILT";
    break;

  case SLAB_REQUIRES_SCRUBBING:
    name = "SCRUBBING";
    break;

  case SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING:
    name = "PRIORITY_SCRUBBING";
    break;

  case SLAB_REBUILDING:
    name = "REBUILDING";
    break;

  case SLAB_REPLAYING:
    name = "REPLAYING";
    break;

  default:
    break;
  }

  return name;
}

/**********************************************************************/
/**
 * Log a one-line summary of a slab, then dump its slab journal and, if it
 * has any, its reference counts.
 *
 * A slab with reference counts is summarized by its priority and free block
 * count; a slab without them is summarized by its rebuild status.
 *
 * @param slab  The slab to dump
 **/
void dumpSlab(const Slab *slab)
{
  if (slab->referenceCounts != NULL) {
    // Terse because there are a lot of slabs to dump and syslog is lossy.
    // The cast makes the argument match %llu whatever integer type underlies
    // BlockCount; passing a differently-typed argument is undefined.
    logInfo("slab %u: P%u, %llu free",
            slab->slabNumber, slab->priority,
            (unsigned long long) getSlabFreeBlockCount(slab));
  } else {
    logInfo("slab %u: status %s", slab->slabNumber,
            statusToString(slab->status));
  }

  dumpSlabJournal(slab->journal);

  if (slab->referenceCounts != NULL) {
    dumpRefCounts(slab->referenceCounts);
  } else {
    logInfo("refCounts is null");
  }
}