Blame source/vdo/base/vdoRecovery.c

Packit Service 310c69
/*
Packit Service 310c69
 * Copyright (c) 2020 Red Hat, Inc.
Packit Service 310c69
 *
Packit Service 310c69
 * This program is free software; you can redistribute it and/or
Packit Service 310c69
 * modify it under the terms of the GNU General Public License
Packit Service 310c69
 * as published by the Free Software Foundation; either version 2
Packit Service 310c69
 * of the License, or (at your option) any later version.
Packit Service 310c69
 * 
Packit Service 310c69
 * This program is distributed in the hope that it will be useful,
Packit Service 310c69
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit Service 310c69
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit Service 310c69
 * GNU General Public License for more details.
Packit Service 310c69
 * 
Packit Service 310c69
 * You should have received a copy of the GNU General Public License
Packit Service 310c69
 * along with this program; if not, write to the Free Software
Packit Service 310c69
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
Packit Service 310c69
 * 02110-1301, USA. 
Packit Service 310c69
 *
Packit Service 310c69
 * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.c#16 $
Packit Service 310c69
 */
Packit Service 310c69
Packit Service 310c69
#include "vdoRecoveryInternals.h"
Packit Service 310c69
Packit Service 310c69
#include "logger.h"
Packit Service 310c69
#include "memoryAlloc.h"
Packit Service 310c69
Packit Service 310c69
#include "blockAllocator.h"
Packit Service 310c69
#include "blockAllocatorInternals.h"
Packit Service 310c69
#include "blockMapInternals.h"
Packit Service 310c69
#include "blockMapPage.h"
Packit Service 310c69
#include "blockMapRecovery.h"
Packit Service 310c69
#include "completion.h"
Packit Service 310c69
#include "numUtils.h"
Packit Service 310c69
#include "packedRecoveryJournalBlock.h"
Packit Service 310c69
#include "recoveryJournal.h"
Packit Service 310c69
#include "recoveryUtils.h"
Packit Service 310c69
#include "slab.h"
Packit Service 310c69
#include "slabDepot.h"
Packit Service 310c69
#include "slabJournal.h"
Packit Service 310c69
#include "slabJournalInternals.h"
Packit Service 310c69
#include "vdoInternal.h"
Packit Service 310c69
#include "waitQueue.h"
Packit Service 310c69
Packit Service 310c69
enum {
Packit Service 310c69
  // The int map needs capacity of twice the number of VIOs in the system.
Packit Service 310c69
  INT_MAP_CAPACITY            = MAXIMUM_USER_VIOS * 2,
Packit Service 310c69
  // There can be as many missing decrefs as there are VIOs in the system.
Packit Service 310c69
  MAXIMUM_SYNTHESIZED_DECREFS = MAXIMUM_USER_VIOS,
Packit Service 310c69
};
Packit Service 310c69
Packit Service 310c69
typedef struct missingDecref {
Packit Service 310c69
  /** A waiter for queueing this object */
Packit Service 310c69
  Waiter              waiter;
Packit Service 310c69
  /** The parent of this object */
Packit Service 310c69
  RecoveryCompletion *recovery;
Packit Service 310c69
  /** Whether this decref is complete */
Packit Service 310c69
  bool                complete;
Packit Service 310c69
  /** The slot for which the last decref was lost */
Packit Service 310c69
  BlockMapSlot        slot;
Packit Service 310c69
  /** The penultimate block map entry for this LBN */
Packit Service 310c69
  DataLocation        penultimateMapping;
Packit Service 310c69
  /** The page completion used to fetch the block map page for this LBN */
Packit Service 310c69
  VDOPageCompletion   pageCompletion;
Packit Service 310c69
  /** The journal point which will be used for this entry */
Packit Service 310c69
  JournalPoint        journalPoint;
Packit Service 310c69
  /** The slab journal to which this entry will be applied */
Packit Service 310c69
  SlabJournal        *slabJournal;
Packit Service 310c69
} MissingDecref;
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Convert a Waiter to the missing decref of which it is a part.
Packit Service 310c69
 *
Packit Service 310c69
 * @param waiter  The Waiter to convert
Packit Service 310c69
 *
Packit Service 310c69
 * @return The MissingDecref wrapping the Waiter
Packit Service 310c69
 **/
Packit Service 310c69
__attribute__((warn_unused_result))
Packit Service 310c69
static inline MissingDecref *asMissingDecref(Waiter *waiter)
Packit Service 310c69
{
Packit Service 310c69
  STATIC_ASSERT(offsetof(MissingDecref, waiter) == 0);
Packit Service 310c69
  return (MissingDecref *) waiter;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Enqueue a MissingDecref. If the enqueue fails, enter read-only mode.
Packit Service 310c69
 *
Packit Service 310c69
 * @param queue  The queue on which to enqueue the decref
Packit Service 310c69
 * @param decref  The MissingDecref to enqueue
Packit Service 310c69
 *
Packit Service 310c69
 * @return VDO_SUCCESS or an error
Packit Service 310c69
 **/
Packit Service 310c69
static int enqueueMissingDecref(WaitQueue *queue, MissingDecref *decref)
Packit Service 310c69
{
Packit Service 310c69
  int result = enqueueWaiter(queue, &decref->waiter);
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    enterReadOnlyMode(decref->recovery->vdo->readOnlyNotifier, result);
Packit Service 310c69
    setCompletionResult(&decref->recovery->completion, result);
Packit Service 310c69
    FREE(decref);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  return result;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Convert a BlockMapSlot into a unique uint64_t.
Packit Service 310c69
 *
Packit Service 310c69
 * @param slot  The block map slot to convert.
Packit Service 310c69
 *
Packit Service 310c69
 * @return a one-to-one mappable uint64_t.
Packit Service 310c69
 **/
Packit Service 310c69
static uint64_t slotAsNumber(BlockMapSlot slot)
Packit Service 310c69
{
Packit Service 310c69
  return (((uint64_t) slot.pbn << 10) + slot.slot);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Create a MissingDecref and enqueue it to wait for a determination of its
Packit Service 310c69
 * penultimate mapping.
Packit Service 310c69
 *
Packit Service 310c69
 * @param [in]  recovery   The parent recovery completion
Packit Service 310c69
 * @param [in]  entry      The recovery journal entry for the increment which is
Packit Service 310c69
 *                         missing a decref
Packit Service 310c69
 * @param [out] decrefPtr  A pointer to hold the new MissingDecref
Packit Service 310c69
 *
Packit Service 310c69
 * @return VDO_SUCCESS or an error code
Packit Service 310c69
 **/
Packit Service 310c69
__attribute__((warn_unused_result))
Packit Service 310c69
static int makeMissingDecref(RecoveryCompletion    *recovery,
Packit Service 310c69
                             RecoveryJournalEntry   entry,
Packit Service 310c69
                             MissingDecref        **decrefPtr)
Packit Service 310c69
{
Packit Service 310c69
  MissingDecref *decref;
Packit Service 310c69
  int result = ALLOCATE(1, MissingDecref, __func__, &decref);
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    return result;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  decref->recovery = recovery;
Packit Service 310c69
  result = enqueueMissingDecref(&recovery->missingDecrefs[0], decref);
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    return result;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  /*
Packit Service 310c69
   * Each synthsized decref needs a unique journal point. Otherwise, in the
Packit Service 310c69
   * event of a crash, we would be unable to tell which synthesized decrefs had
Packit Service 310c69
   * already been committed in the slab journals. Instead of using real
Packit Service 310c69
   * recovery journal space for this, we can use fake journal points between
Packit Service 310c69
   * the last currently valid entry in the tail block and the first journal
Packit Service 310c69
   * entry in the next block. We can't overflow the entry count since the
Packit Service 310c69
   * number of synthesized decrefs is bounded by the DataVIO limit.
Packit Service 310c69
   *
Packit Service 310c69
   * It is vital that any given missing decref always have the same fake
Packit Service 310c69
   * journal point since a failed recovery may be retried with a different
Packit Service 310c69
   * number of zones after having written out some slab journal blocks. Since
Packit Service 310c69
   * the missing decrefs are always read out of the journal in the same order,
Packit Service 310c69
   * we can assign them a journal point when they are read. Their subsequent
Packit Service 310c69
   * use will ensure that, for any given slab journal, they are applied in
Packit Service 310c69
   * the order dictated by these assigned journal points.
Packit Service 310c69
   */
Packit Service 310c69
  decref->slot         = entry.slot;
Packit Service 310c69
  decref->journalPoint = recovery->nextSynthesizedJournalPoint;
Packit Service 310c69
  recovery->nextSynthesizedJournalPoint.entryCount++;
Packit Service 310c69
  recovery->missingDecrefCount++;
Packit Service 310c69
  recovery->incompleteDecrefCount++;
Packit Service 310c69
Packit Service 310c69
  *decrefPtr = decref;
Packit Service 310c69
  return VDO_SUCCESS;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Move the given recovery point forward by one entry.
Packit Service 310c69
 *
Packit Service 310c69
 * @param point  The recovery point to alter
Packit Service 310c69
 **/
Packit Service 310c69
static void incrementRecoveryPoint(RecoveryPoint *point)
Packit Service 310c69
{
Packit Service 310c69
  point->entryCount++;
Packit Service 310c69
  if ((point->sectorCount == (SECTORS_PER_BLOCK - 1))
Packit Service 310c69
      && (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR)) {
Packit Service 310c69
    point->sequenceNumber++;
Packit Service 310c69
    point->sectorCount = 1;
Packit Service 310c69
    point->entryCount = 0;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  if (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) {
Packit Service 310c69
    point->sectorCount++;
Packit Service 310c69
    point->entryCount = 0;
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Move the given recovery point backwards by one entry.
Packit Service 310c69
 *
Packit Service 310c69
 * @param point  The recovery point to alter
Packit Service 310c69
 **/
Packit Service 310c69
static void decrementRecoveryPoint(RecoveryPoint *point)
Packit Service 310c69
{
Packit Service 310c69
  STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR > 0);
Packit Service 310c69
Packit Service 310c69
  if ((point->sectorCount <= 1) && (point->entryCount == 0)) {
Packit Service 310c69
    point->sequenceNumber--;
Packit Service 310c69
    point->sectorCount = SECTORS_PER_BLOCK - 1;
Packit Service 310c69
    point->entryCount  = RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR - 1;
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  if (point->entryCount == 0) {
Packit Service 310c69
    point->sectorCount--;
Packit Service 310c69
    point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR - 1;
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  point->entryCount--;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Check whether the first point precedes the second point.
Packit Service 310c69
 *
Packit Service 310c69
 * @param first   The first recovery point
Packit Service 310c69
 * @param second  The second recovery point
Packit Service 310c69
 *
Packit Service 310c69
 * @return true if the first point precedes the second point
Packit Service 310c69
 **/
Packit Service 310c69
__attribute__((warn_unused_result))
Packit Service 310c69
static bool beforeRecoveryPoint(const RecoveryPoint *first,
Packit Service 310c69
                                const RecoveryPoint *second)
Packit Service 310c69
{
Packit Service 310c69
  if (first->sequenceNumber < second->sequenceNumber) {
Packit Service 310c69
    return true;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  if (first->sequenceNumber > second->sequenceNumber) {
Packit Service 310c69
    return false;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  if (first->sectorCount < second->sectorCount) {
Packit Service 310c69
    return true;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  return ((first->sectorCount == second->sectorCount)
Packit Service 310c69
          && (first->entryCount < second->entryCount));
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Prepare the sub-task completion.
Packit Service 310c69
 *
Packit Service 310c69
 * @param recovery      The RecoveryCompletion whose sub-task completion is to
Packit Service 310c69
 *                      be prepared
Packit Service 310c69
 * @param callback      The callback to register for the next sub-task
Packit Service 310c69
 * @param errorHandler  The error handler for the next sub-task
Packit Service 310c69
 * @param zoneType      The type of zone on which the callback or errorHandler
Packit Service 310c69
 *                      should run
Packit Service 310c69
 **/
Packit Service 310c69
static void prepareSubTask(RecoveryCompletion *recovery,
Packit Service 310c69
                           VDOAction           callback,
Packit Service 310c69
                           VDOAction           errorHandler,
Packit Service 310c69
                           ZoneType            zoneType)
Packit Service 310c69
{
Packit Service 310c69
  const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo);
Packit Service 310c69
  ThreadID threadID;
Packit Service 310c69
  switch (zoneType) {
Packit Service 310c69
  case ZONE_TYPE_LOGICAL:
Packit Service 310c69
    // All blockmap access is done on single thread, so use logical zone 0.
Packit Service 310c69
    threadID = getLogicalZoneThread(threadConfig, 0);
Packit Service 310c69
    break;
Packit Service 310c69
Packit Service 310c69
  case ZONE_TYPE_PHYSICAL:
Packit Service 310c69
    threadID = recovery->allocator->threadID;
Packit Service 310c69
    break;
Packit Service 310c69
Packit Service 310c69
  case ZONE_TYPE_ADMIN:
Packit Service 310c69
  default:
Packit Service 310c69
    threadID = getAdminThread(threadConfig);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  prepareCompletion(&recovery->subTaskCompletion, callback, errorHandler,
Packit Service 310c69
                    threadID, recovery);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**********************************************************************/
Packit Service 310c69
int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr)
Packit Service 310c69
{
Packit Service 310c69
  const ThreadConfig *threadConfig = getThreadConfig(vdo);
Packit Service 310c69
  RecoveryCompletion *recovery;
Packit Service 310c69
  int result = ALLOCATE_EXTENDED(RecoveryCompletion,
Packit Service 310c69
                                 threadConfig->physicalZoneCount, RingNode,
Packit Service 310c69
                                 __func__, &recovery);
Packit Service 310c69
 if (result != VDO_SUCCESS) {
Packit Service 310c69
    return result;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  recovery->vdo = vdo;
Packit Service 310c69
  for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) {
Packit Service 310c69
    initializeWaitQueue(&recovery->missingDecrefs[z]);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  result = initializeEnqueueableCompletion(&recovery->completion,
Packit Service 310c69
                                           RECOVERY_COMPLETION, vdo->layer);
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    freeRecoveryCompletion(&recovery);
Packit Service 310c69
    return result;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  result = initializeEnqueueableCompletion(&recovery->subTaskCompletion,
Packit Service 310c69
                                           SUB_TASK_COMPLETION, vdo->layer);
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    freeRecoveryCompletion(&recovery);
Packit Service 310c69
    return result;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  result = makeIntMap(INT_MAP_CAPACITY, 0, &recovery->slotEntryMap);
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    freeRecoveryCompletion(&recovery);
Packit Service 310c69
    return result;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  *recoveryPtr  = recovery;
Packit Service 310c69
  return VDO_SUCCESS;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * A waiter callback to free MissingDecrefs.
Packit Service 310c69
 *
Packit Service 310c69
 * Implements WaiterCallback.
Packit Service 310c69
 **/
Packit Service 310c69
static void freeMissingDecref(Waiter *waiter,
Packit Service 310c69
                              void   *context __attribute__((unused)))
Packit Service 310c69
{
Packit Service 310c69
  FREE(asMissingDecref(waiter));
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**********************************************************************/
Packit Service 310c69
void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = *recoveryPtr;
Packit Service 310c69
  if (recovery == NULL) {
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  freeIntMap(&recovery->slotEntryMap);
Packit Service 310c69
  const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo);
Packit Service 310c69
  for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) {
Packit Service 310c69
    notifyAllWaiters(&recovery->missingDecrefs[z], freeMissingDecref, NULL);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  FREE(recovery->journalData);
Packit Service 310c69
  FREE(recovery->entries);
Packit Service 310c69
  destroyEnqueueable(&recovery->subTaskCompletion);
Packit Service 310c69
  destroyEnqueueable(&recovery->completion);
Packit Service 310c69
  FREE(recovery);
Packit Service 310c69
  *recoveryPtr = NULL;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Finish recovering, free the recovery completion and notify the parent.
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The recovery completion
Packit Service 310c69
 **/
Packit Service 310c69
static void finishRecovery(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  VDOCompletion      *parent        = completion->parent;
Packit Service 310c69
  RecoveryCompletion *recovery      = asRecoveryCompletion(completion);
Packit Service 310c69
  VDO                *vdo           = recovery->vdo;
Packit Service 310c69
  uint64_t            recoveryCount = ++vdo->completeRecoveries;
Packit Service 310c69
  initializeRecoveryJournalPostRecovery(vdo->recoveryJournal,
Packit Service 310c69
                                        recoveryCount, recovery->highestTail);
Packit Service 310c69
  freeRecoveryCompletion(&recovery);
Packit Service 310c69
  logInfo("Rebuild complete.");
Packit Service 310c69
Packit Service 310c69
  // Now that we've freed the recovery completion and its vast array of
Packit Service 310c69
  // journal entries, we can allocate refcounts.
Packit Service 310c69
  int result = allocateSlabRefCounts(vdo->depot);
Packit Service 310c69
  finishCompletion(parent, result);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Handle a recovery error.
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion   The recovery completion
Packit Service 310c69
 **/
Packit Service 310c69
static void abortRecovery(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  VDOCompletion      *parent   = completion->parent;
Packit Service 310c69
  int                 result   = completion->result;
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion);
Packit Service 310c69
  freeRecoveryCompletion(&recovery);
Packit Service 310c69
  logWarning("Recovery aborted");
Packit Service 310c69
  finishCompletion(parent, result);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Abort a recovery if there is an error.
Packit Service 310c69
 *
Packit Service 310c69
 * @param result    The result to check
Packit Service 310c69
 * @param recovery  The recovery completion
Packit Service 310c69
 *
Packit Service 310c69
 * @return true if the result was an error
Packit Service 310c69
 **/
Packit Service 310c69
__attribute__((warn_unused_result))
Packit Service 310c69
static bool abortRecoveryOnError(int result, RecoveryCompletion *recovery)
Packit Service 310c69
{
Packit Service 310c69
  if (result == VDO_SUCCESS) {
Packit Service 310c69
    return false;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  finishCompletion(&recovery->completion, result);
Packit Service 310c69
  return true;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Unpack the recovery journal entry associated with the given recovery point.
Packit Service 310c69
 *
Packit Service 310c69
 * @param recovery  The recovery completion
Packit Service 310c69
 * @param point     The recovery point
Packit Service 310c69
 *
Packit Service 310c69
 * @return The unpacked contents of the matching recovery journal entry
Packit Service 310c69
 **/
Packit Service 310c69
static RecoveryJournalEntry getEntry(const RecoveryCompletion *recovery,
Packit Service 310c69
                                     const RecoveryPoint      *point)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryJournal *journal = recovery->vdo->recoveryJournal;
Packit Service 310c69
  PhysicalBlockNumber blockNumber
Packit Service 310c69
    = getRecoveryJournalBlockNumber(journal, point->sequenceNumber);
Packit Service 310c69
  off_t sectorOffset
Packit Service 310c69
    = (blockNumber * VDO_BLOCK_SIZE) + (point->sectorCount * VDO_SECTOR_SIZE);
Packit Service 310c69
  PackedJournalSector *sector
Packit Service 310c69
    = (PackedJournalSector *) &recovery->journalData[sectorOffset];
Packit Service 310c69
  return unpackRecoveryJournalEntry(&sector->entries[point->entryCount]);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Create an array of all valid journal entries, in order, and store it in the
Packit Service 310c69
 * recovery completion.
Packit Service 310c69
 *
Packit Service 310c69
 * @param recovery  The recovery completion
Packit Service 310c69
 *
Packit Service 310c69
 * @return VDO_SUCCESS or an error code
Packit Service 310c69
 **/
Packit Service 310c69
static int extractJournalEntries(RecoveryCompletion *recovery)
Packit Service 310c69
{
Packit Service 310c69
  // Allocate a NumberedBlockMapping array just large enough to transcribe
Packit Service 310c69
  // every increment PackedRecoveryJournalEntry from every valid journal block.
Packit Service 310c69
  int result = ALLOCATE(recovery->increfCount, NumberedBlockMapping, __func__,
Packit Service 310c69
                        &recovery->entries);
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    return result;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  RecoveryPoint recoveryPoint = {
Packit Service 310c69
    .sequenceNumber = recovery->blockMapHead,
Packit Service 310c69
    .sectorCount    = 1,
Packit Service 310c69
    .entryCount     = 0,
Packit Service 310c69
  };
Packit Service 310c69
  while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) {
Packit Service 310c69
    RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint);
Packit Service 310c69
    result = validateRecoveryJournalEntry(recovery->vdo, &entry);
Packit Service 310c69
    if (result != VDO_SUCCESS) {
Packit Service 310c69
      enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
Packit Service 310c69
      return result;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    if (isIncrementOperation(entry.operation)) {
Packit Service 310c69
      recovery->entries[recovery->entryCount] = (NumberedBlockMapping) {
Packit Service 310c69
        .blockMapSlot  = entry.slot,
Packit Service 310c69
        .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state),
Packit Service 310c69
        .number        = recovery->entryCount,
Packit Service 310c69
      };
Packit Service 310c69
      recovery->entryCount++;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    incrementRecoveryPoint(&recoveryPoint);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  result = ASSERT((recovery->entryCount <= recovery->increfCount),
Packit Service 310c69
                  "approximate incref count is an upper bound");
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  return result;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Extract journal entries and recover the block map. This callback is
Packit Service 310c69
 * registered in startSuperBlockSave().
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The sub-task completion
Packit Service 310c69
 **/
Packit Service 310c69
static void launchBlockMapRecovery(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
  VDO                *vdo      = recovery->vdo;
Packit Service 310c69
  assertOnLogicalZoneThread(vdo, 0, __func__);
Packit Service 310c69
Packit Service 310c69
  // Extract the journal entries for the block map recovery.
Packit Service 310c69
  int result = extractJournalEntries(recovery);
Packit Service 310c69
  if (abortRecoveryOnError(result, recovery)) {
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  prepareToFinishParent(completion, &recovery->completion);
Packit Service 310c69
  recoverBlockMap(vdo, recovery->entryCount, recovery->entries, completion);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Finish flushing all slab journals and start a write of the super block.
Packit Service 310c69
 * This callback is registered in addSynthesizedEntries().
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The sub-task completion
Packit Service 310c69
 **/
Packit Service 310c69
static void startSuperBlockSave(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
  VDO                *vdo      = recovery->vdo;
Packit Service 310c69
  assertOnAdminThread(vdo, __func__);
Packit Service 310c69
Packit Service 310c69
  logInfo("Saving recovery progress");
Packit Service 310c69
  vdo->state = VDO_REPLAYING;
Packit Service 310c69
Packit Service 310c69
  // The block map access which follows the super block save must be done
Packit Service 310c69
  // on a logical thread.
Packit Service 310c69
  prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback,
Packit Service 310c69
                 ZONE_TYPE_LOGICAL);
Packit Service 310c69
  saveVDOComponentsAsync(vdo, completion);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * The callback from loading the slab depot. It will update the logical blocks
Packit Service 310c69
 * and block map data blocks counts in the recovery journal and then drain the
Packit Service 310c69
 * slab depot in order to commit the recovered slab journals. It is registered
Packit Service 310c69
 * in applyToDepot().
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The sub-task completion
Packit Service 310c69
 **/
Packit Service 310c69
static void finishRecoveringDepot(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
  VDO                *vdo      = recovery->vdo;
Packit Service 310c69
  assertOnAdminThread(vdo, __func__);
Packit Service 310c69
Packit Service 310c69
  logInfo("Replayed %zu journal entries into slab journals",
Packit Service 310c69
          recovery->entriesAddedToSlabJournals);
Packit Service 310c69
  logInfo("Synthesized %zu missing journal entries",
Packit Service 310c69
          recovery->missingDecrefCount);
Packit Service 310c69
  vdo->recoveryJournal->logicalBlocksUsed  = recovery->logicalBlocksUsed;
Packit Service 310c69
  vdo->recoveryJournal->blockMapDataBlocks = recovery->blockMapDataBlocks;
Packit Service 310c69
Packit Service 310c69
  prepareSubTask(recovery, startSuperBlockSave, finishParentCallback,
Packit Service 310c69
                 ZONE_TYPE_ADMIN);
Packit Service 310c69
  drainSlabDepot(vdo->depot, ADMIN_STATE_RECOVERING, completion);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * The error handler for recovering slab journals. It will skip any remaining
Packit Service 310c69
 * recovery on the current zone and propagate the error. It is registered in
Packit Service 310c69
 * addSlabJournalEntries() and addSynthesizedEntries().
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The completion of the block allocator being recovered
Packit Service 310c69
 **/
Packit Service 310c69
static void handleAddSlabJournalEntryError(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
  notifySlabJournalsAreRecovered(recovery->allocator, completion->result);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Add synthesized entries into slab journals, waiting when necessary.
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The allocator completion
Packit Service 310c69
 **/
Packit Service 310c69
static void addSynthesizedEntries(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
Packit Service 310c69
  // Get ready in case we need to enqueue again
Packit Service 310c69
  prepareCompletion(completion, addSynthesizedEntries,
Packit Service 310c69
                    handleAddSlabJournalEntryError,
Packit Service 310c69
                    completion->callbackThreadID, recovery);
Packit Service 310c69
  WaitQueue *missingDecrefs
Packit Service 310c69
    = &recovery->missingDecrefs[recovery->allocator->zoneNumber];
Packit Service 310c69
  while (hasWaiters(missingDecrefs)) {
Packit Service 310c69
    MissingDecref *decref = asMissingDecref(getFirstWaiter(missingDecrefs));
Packit Service 310c69
    if (!attemptReplayIntoSlabJournal(decref->slabJournal,
Packit Service 310c69
                                      decref->penultimateMapping.pbn,
Packit Service 310c69
                                      DATA_DECREMENT, &decref->journalPoint,
Packit Service 310c69
                                      completion)) {
Packit Service 310c69
      return;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    dequeueNextWaiter(missingDecrefs);
Packit Service 310c69
    FREE(decref);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  notifySlabJournalsAreRecovered(recovery->allocator, VDO_SUCCESS);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Determine the LBNs used count as of the end of the journal (but
Packit Service 310c69
 * not including any changes to that count from entries that will be
Packit Service 310c69
 * synthesized later).
Packit Service 310c69
 *
Packit Service 310c69
 * @param recovery  The recovery completion
Packit Service 310c69
 *
Packit Service 310c69
 * @return VDO_SUCCESS or an error
Packit Service 310c69
 **/
Packit Service 310c69
static int computeUsages(RecoveryCompletion *recovery)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryJournal *journal = recovery->vdo->recoveryJournal;
Packit Service 310c69
  PackedJournalHeader *tailHeader
Packit Service 310c69
    = getJournalBlockHeader(journal, recovery->journalData, recovery->tail);
Packit Service 310c69
Packit Service 310c69
  RecoveryBlockHeader unpacked;
Packit Service 310c69
  unpackRecoveryBlockHeader(tailHeader, &unpacked);
Packit Service 310c69
  recovery->logicalBlocksUsed  = unpacked.logicalBlocksUsed;
Packit Service 310c69
  recovery->blockMapDataBlocks = unpacked.blockMapDataBlocks;
Packit Service 310c69
Packit Service 310c69
  RecoveryPoint recoveryPoint = {
Packit Service 310c69
    .sequenceNumber = recovery->tail,
Packit Service 310c69
    .sectorCount    = 1,
Packit Service 310c69
    .entryCount     = 0,
Packit Service 310c69
  };
Packit Service 310c69
  while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) {
Packit Service 310c69
    RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint);
Packit Service 310c69
    if (isMappedLocation(&entry.mapping)) {
Packit Service 310c69
      switch (entry.operation) {
Packit Service 310c69
      case DATA_INCREMENT:
Packit Service 310c69
        recovery->logicalBlocksUsed++;
Packit Service 310c69
        break;
Packit Service 310c69
Packit Service 310c69
      case DATA_DECREMENT:
Packit Service 310c69
        recovery->logicalBlocksUsed--;
Packit Service 310c69
        break;
Packit Service 310c69
Packit Service 310c69
      case BLOCK_MAP_INCREMENT:
Packit Service 310c69
        recovery->blockMapDataBlocks++;
Packit Service 310c69
        break;
Packit Service 310c69
Packit Service 310c69
      default:
Packit Service 310c69
        return logErrorWithStringError(VDO_CORRUPT_JOURNAL,
Packit Service 310c69
                                       "Recovery journal entry at "
Packit Service 310c69
                                       "sequence number %" PRIu64
Packit Service 310c69
                                       ", sector %u, entry %u had invalid "
Packit Service 310c69
                                       "operation %u",
Packit Service 310c69
                                       recoveryPoint.sequenceNumber,
Packit Service 310c69
                                       recoveryPoint.sectorCount,
Packit Service 310c69
                                       recoveryPoint.entryCount,
Packit Service 310c69
                                       entry.operation);
Packit Service 310c69
      }
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    incrementRecoveryPoint(&recoveryPoint);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  return VDO_SUCCESS;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Advance the current recovery and journal points.
Packit Service 310c69
 *
Packit Service 310c69
 * @param recovery         The RecoveryCompletion whose points are to be
Packit Service 310c69
 *                         advanced
Packit Service 310c69
 * @param entriesPerBlock  The number of entries in a recovery journal block
Packit Service 310c69
 **/
Packit Service 310c69
static void advancePoints(RecoveryCompletion *recovery,
Packit Service 310c69
                          JournalEntryCount   entriesPerBlock)
Packit Service 310c69
{
Packit Service 310c69
  incrementRecoveryPoint(&recovery->nextRecoveryPoint);
Packit Service 310c69
  advanceJournalPoint(&recovery->nextJournalPoint, entriesPerBlock);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Replay recovery journal entries into the slab journals of the allocator
Packit Service 310c69
 * currently being recovered, waiting for slab journal tailblock space when
Packit Service 310c69
 * necessary. This method is its own callback.
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The allocator completion
Packit Service 310c69
 **/
Packit Service 310c69
static void addSlabJournalEntries(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
  VDO                *vdo      = recovery->vdo;
Packit Service 310c69
  RecoveryJournal    *journal  = vdo->recoveryJournal;
Packit Service 310c69
Packit Service 310c69
  // Get ready in case we need to enqueue again.
Packit Service 310c69
  prepareCompletion(completion, addSlabJournalEntries,
Packit Service 310c69
                    handleAddSlabJournalEntryError,
Packit Service 310c69
                    completion->callbackThreadID, recovery);
Packit Service 310c69
  for (RecoveryPoint *recoveryPoint = &recovery->nextRecoveryPoint;
Packit Service 310c69
       beforeRecoveryPoint(recoveryPoint, &recovery->tailRecoveryPoint);
Packit Service 310c69
       advancePoints(recovery, journal->entriesPerBlock)) {
Packit Service 310c69
    RecoveryJournalEntry entry = getEntry(recovery, recoveryPoint);
Packit Service 310c69
    int result = validateRecoveryJournalEntry(vdo, &entry);
Packit Service 310c69
    if (result != VDO_SUCCESS) {
Packit Service 310c69
      enterReadOnlyMode(journal->readOnlyNotifier, result);
Packit Service 310c69
      finishCompletion(completion, result);
Packit Service 310c69
      return;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    if (entry.mapping.pbn == ZERO_BLOCK) {
Packit Service 310c69
      continue;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    Slab *slab = getSlab(vdo->depot, entry.mapping.pbn);
Packit Service 310c69
    if (slab->allocator != recovery->allocator) {
Packit Service 310c69
      continue;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    if (!attemptReplayIntoSlabJournal(slab->journal, entry.mapping.pbn,
Packit Service 310c69
                                      entry.operation,
Packit Service 310c69
                                      &recovery->nextJournalPoint,
Packit Service 310c69
                                      completion)) {
Packit Service 310c69
      return;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    recovery->entriesAddedToSlabJournals++;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  logInfo("Recreating missing journal entries for zone %u",
Packit Service 310c69
          recovery->allocator->zoneNumber);
Packit Service 310c69
  addSynthesizedEntries(completion);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**********************************************************************/
Packit Service 310c69
void replayIntoSlabJournals(BlockAllocator *allocator,
Packit Service 310c69
                            VDOCompletion  *completion,
Packit Service 310c69
                            void           *context)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = context;
Packit Service 310c69
  assertOnPhysicalZoneThread(recovery->vdo, allocator->zoneNumber, __func__);
Packit Service 310c69
  if ((recovery->journalData == NULL) || isReplaying(recovery->vdo)) {
Packit Service 310c69
    // there's nothing to replay
Packit Service 310c69
    notifySlabJournalsAreRecovered(allocator, VDO_SUCCESS);
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  recovery->allocator = allocator;
Packit Service 310c69
  recovery->nextRecoveryPoint = (RecoveryPoint) {
Packit Service 310c69
    .sequenceNumber = recovery->slabJournalHead,
Packit Service 310c69
    .sectorCount    = 1,
Packit Service 310c69
    .entryCount     = 0,
Packit Service 310c69
  };
Packit Service 310c69
Packit Service 310c69
  recovery->nextJournalPoint = (JournalPoint) {
Packit Service 310c69
    .sequenceNumber = recovery->slabJournalHead,
Packit Service 310c69
    .entryCount     = 0,
Packit Service 310c69
  };
Packit Service 310c69
Packit Service 310c69
  logInfo("Replaying entries into slab journals for zone %u",
Packit Service 310c69
          allocator->zoneNumber);
Packit Service 310c69
  completion->parent = recovery;
Packit Service 310c69
  addSlabJournalEntries(completion);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * A waiter callback to enqueue a MissingDecref on the queue for the physical
Packit Service 310c69
 * zone in which it will be applied.
Packit Service 310c69
 *
Packit Service 310c69
 * Implements WaiterCallback.
Packit Service 310c69
 **/
Packit Service 310c69
static void queueOnPhysicalZone(Waiter *waiter, void *context)
Packit Service 310c69
{
Packit Service 310c69
  MissingDecref *decref  = asMissingDecref(waiter);
Packit Service 310c69
  DataLocation   mapping = decref->penultimateMapping;
Packit Service 310c69
  if (isMappedLocation(&mapping)) {
Packit Service 310c69
    decref->recovery->logicalBlocksUsed--;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  if (mapping.pbn == ZERO_BLOCK) {
Packit Service 310c69
    // Decrefs of zero are not applied to slab journals.
Packit Service 310c69
    FREE(decref);
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  decref->slabJournal = getSlabJournal((SlabDepot *) context, mapping.pbn);
Packit Service 310c69
  ZoneCount zoneNumber = decref->slabJournal->slab->allocator->zoneNumber;
Packit Service 310c69
  enqueueMissingDecref(&decref->recovery->missingDecrefs[zoneNumber], decref);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Queue each missing decref on the slab journal to which it is to be applied
Packit Service 310c69
 * then load the slab depot. This callback is registered in
Packit Service 310c69
 * findSlabJournalEntries().
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The sub-task completion
Packit Service 310c69
 **/
Packit Service 310c69
static void applyToDepot(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
  assertOnAdminThread(recovery->vdo, __func__);
Packit Service 310c69
  prepareSubTask(recovery, finishRecoveringDepot, finishParentCallback,
Packit Service 310c69
                 ZONE_TYPE_ADMIN);
Packit Service 310c69
Packit Service 310c69
  SlabDepot *depot = getSlabDepot(recovery->vdo);
Packit Service 310c69
  notifyAllWaiters(&recovery->missingDecrefs[0], queueOnPhysicalZone, depot);
Packit Service 310c69
  if (abortRecoveryOnError(recovery->completion.result, recovery)) {
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  loadSlabDepot(depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Validate the location of the penultimate mapping for a MissingDecref. If it
Packit Service 310c69
 * is valid, enqueue it for the appropriate physical zone or account for it.
Packit Service 310c69
 * Otherwise, dispose of it and signal an error.
Packit Service 310c69
 *
Packit Service 310c69
 * @param decref     The decref whose penultimate mapping has just been found
Packit Service 310c69
 * @param location   The penultimate mapping
Packit Service 310c69
 * @param errorCode  The error code to use if the location is invalid
Packit Service 310c69
 **/
Packit Service 310c69
static int recordMissingDecref(MissingDecref *decref,
Packit Service 310c69
                               DataLocation   location,
Packit Service 310c69
                               int            errorCode)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = decref->recovery;
Packit Service 310c69
  recovery->incompleteDecrefCount--;
Packit Service 310c69
  if (isValidLocation(&location)
Packit Service 310c69
      && isPhysicalDataBlock(recovery->vdo->depot, location.pbn)) {
Packit Service 310c69
    decref->penultimateMapping = location;
Packit Service 310c69
    decref->complete           = true;
Packit Service 310c69
    return VDO_SUCCESS;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  // The location was invalid
Packit Service 310c69
  enterReadOnlyMode(recovery->vdo->readOnlyNotifier, errorCode);
Packit Service 310c69
  setCompletionResult(&recovery->completion, errorCode);
Packit Service 310c69
  logErrorWithStringError(errorCode,
Packit Service 310c69
                          "Invalid mapping for pbn %llu with state %u",
Packit Service 310c69
                          location.pbn, location.state);
Packit Service 310c69
  return errorCode;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Find the block map slots with missing decrefs.
Packit Service 310c69
 *
Packit Service 310c69
 * To find the slots missing decrefs, we iterate through the journal in reverse
Packit Service 310c69
 * so we see decrefs before increfs; if we see an incref before its paired
Packit Service 310c69
 * decref, we instantly know this incref is missing its decref.
Packit Service 310c69
 *
Packit Service 310c69
 * Simultaneously, we attempt to determine the missing decref. If there is a
Packit Service 310c69
 * missing decref, and at least two increfs for that slot, we know we should
Packit Service 310c69
 * decref the PBN from the penultimate incref. Otherwise, there is only one
Packit Service 310c69
 * incref for that slot: we must synthesize the decref out of the block map
Packit Service 310c69
 * instead of the recovery journal.
Packit Service 310c69
 *
Packit Service 310c69
 * @param recovery  The recovery completion
Packit Service 310c69
 *
Packit Service 310c69
 * @return VDO_SUCCESS or an error code
Packit Service 310c69
 **/
Packit Service 310c69
__attribute__((warn_unused_result))
Packit Service 310c69
static int findMissingDecrefs(RecoveryCompletion *recovery)
Packit Service 310c69
{
Packit Service 310c69
  IntMap *slotEntryMap = recovery->slotEntryMap;
Packit Service 310c69
  // This placeholder decref is used to mark lbns for which we have observed a
Packit Service 310c69
  // decref but not the paired incref (going backwards through the journal).
Packit Service 310c69
  MissingDecref foundDecref;
Packit Service 310c69
Packit Service 310c69
  // A buffer is allocated based on the number of incRef entries found, so use
Packit Service 310c69
  // the earliest head.
Packit Service 310c69
  SequenceNumber head = minSequenceNumber(recovery->blockMapHead,
Packit Service 310c69
                                          recovery->slabJournalHead);
Packit Service 310c69
  RecoveryPoint headPoint = {
Packit Service 310c69
    .sequenceNumber = head,
Packit Service 310c69
    .sectorCount    = 1,
Packit Service 310c69
    .entryCount     = 0,
Packit Service 310c69
  };
Packit Service 310c69
Packit Service 310c69
  // Set up for the first fake journal point that will be used for a
Packit Service 310c69
  // synthesized entry.
Packit Service 310c69
  recovery->nextSynthesizedJournalPoint = (JournalPoint) {
Packit Service 310c69
    .sequenceNumber = recovery->tail,
Packit Service 310c69
    .entryCount     = recovery->vdo->recoveryJournal->entriesPerBlock,
Packit Service 310c69
  };
Packit Service 310c69
Packit Service 310c69
  RecoveryPoint recoveryPoint = recovery->tailRecoveryPoint;
Packit Service 310c69
  while (beforeRecoveryPoint(&headPoint, &recoveryPoint)) {
Packit Service 310c69
    decrementRecoveryPoint(&recoveryPoint);
Packit Service 310c69
    RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint);
Packit Service 310c69
Packit Service 310c69
    if (!isIncrementOperation(entry.operation)) {
Packit Service 310c69
      // Observe that we've seen a decref before its incref, but only if
Packit Service 310c69
      // the IntMap does not contain an unpaired incref for this lbn.
Packit Service 310c69
      int result = intMapPut(slotEntryMap, slotAsNumber(entry.slot),
Packit Service 310c69
                             &foundDecref, false, NULL);
Packit Service 310c69
      if (result != VDO_SUCCESS) {
Packit Service 310c69
        return result;
Packit Service 310c69
      }
Packit Service 310c69
Packit Service 310c69
      continue;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    recovery->increfCount++;
Packit Service 310c69
Packit Service 310c69
    MissingDecref *decref
Packit Service 310c69
      = intMapRemove(slotEntryMap, slotAsNumber(entry.slot));
Packit Service 310c69
    if (entry.operation == BLOCK_MAP_INCREMENT) {
Packit Service 310c69
      if (decref != NULL) {
Packit Service 310c69
        return logErrorWithStringError(VDO_CORRUPT_JOURNAL,
Packit Service 310c69
                                       "decref found for block map block %"
Packit Service 310c69
                                       PRIu64 " with state %u",
Packit Service 310c69
                                       entry.mapping.pbn, entry.mapping.state);
Packit Service 310c69
      }
Packit Service 310c69
Packit Service 310c69
      // There are no decrefs for block map pages, so they can't be missing.
Packit Service 310c69
      continue;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    if (decref == &foundDecref) {
Packit Service 310c69
      // This incref already had a decref in the intmap, so we know it is
Packit Service 310c69
      // not missing its decref.
Packit Service 310c69
      continue;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    if (decref == NULL) {
Packit Service 310c69
      // This incref is missing a decref. Add a missing decref object.
Packit Service 310c69
      int result = makeMissingDecref(recovery, entry, &decref);
Packit Service 310c69
      if (result != VDO_SUCCESS) {
Packit Service 310c69
        return result;
Packit Service 310c69
      }
Packit Service 310c69
Packit Service 310c69
      result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), decref,
Packit Service 310c69
                         false, NULL);
Packit Service 310c69
      if (result != VDO_SUCCESS) {
Packit Service 310c69
        return result;
Packit Service 310c69
      }
Packit Service 310c69
Packit Service 310c69
      continue;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    /*
Packit Service 310c69
     * This MissingDecref was left here by an incref without a decref.
Packit Service 310c69
     * We now know what its penultimate mapping is, and all entries
Packit Service 310c69
     * before here in the journal are paired, decref before incref, so
Packit Service 310c69
     * we needn't remember it in the intmap any longer.
Packit Service 310c69
     */
Packit Service 310c69
    int result = recordMissingDecref(decref, entry.mapping,
Packit Service 310c69
                                     VDO_CORRUPT_JOURNAL);
Packit Service 310c69
    if (result != VDO_SUCCESS) {
Packit Service 310c69
      return result;
Packit Service 310c69
    }
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  return VDO_SUCCESS;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Process a fetched block map page for a missing decref. This callback is
Packit Service 310c69
 * registered in findSlabJournalEntries().
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The page completion which has just finished loading
Packit Service 310c69
 **/
Packit Service 310c69
static void processFetchedPage(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  MissingDecref      *currentDecref = completion->parent;
Packit Service 310c69
  RecoveryCompletion *recovery      = currentDecref->recovery;
Packit Service 310c69
  assertOnLogicalZoneThread(recovery->vdo, 0, __func__);
Packit Service 310c69
Packit Service 310c69
  const BlockMapPage *page = dereferenceReadableVDOPage(completion);
Packit Service 310c69
  DataLocation location
Packit Service 310c69
    = unpackBlockMapEntry(&page->entries[currentDecref->slot.slot]);
Packit Service 310c69
  releaseVDOPageCompletion(completion);
Packit Service 310c69
  recordMissingDecref(currentDecref, location, VDO_BAD_MAPPING);
Packit Service 310c69
  if (recovery->incompleteDecrefCount == 0) {
Packit Service 310c69
    completeCompletion(&recovery->subTaskCompletion);
Packit Service 310c69
  }
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Handle an error fetching a block map page for a missing decref.
Packit Service 310c69
 * This error handler is registered in findSlabJournalEntries().
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The page completion which has just finished loading
Packit Service 310c69
 **/
Packit Service 310c69
static void handleFetchError(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  MissingDecref      *decref   = completion->parent;
Packit Service 310c69
  RecoveryCompletion *recovery = decref->recovery;
Packit Service 310c69
  assertOnLogicalZoneThread(recovery->vdo, 0, __func__);
Packit Service 310c69
Packit Service 310c69
  // If we got a VDO_OUT_OF_RANGE error, it is because the pbn we read from
Packit Service 310c69
  // the journal was bad, so convert the error code
Packit Service 310c69
  setCompletionResult(&recovery->subTaskCompletion,
Packit Service 310c69
                      ((completion->result == VDO_OUT_OF_RANGE)
Packit Service 310c69
                       ? VDO_CORRUPT_JOURNAL : completion->result));
Packit Service 310c69
  releaseVDOPageCompletion(completion);
Packit Service 310c69
  if (--recovery->incompleteDecrefCount == 0) {
Packit Service 310c69
    completeCompletion(&recovery->subTaskCompletion);
Packit Service 310c69
  }
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * The waiter callback to requeue a missing decref and launch its page fetch.
Packit Service 310c69
 *
Packit Service 310c69
 * Implements WaiterCallback.
Packit Service 310c69
 **/
Packit Service 310c69
static void launchFetch(Waiter *waiter, void *context)
Packit Service 310c69
{
Packit Service 310c69
  MissingDecref      *decref   = asMissingDecref(waiter);
Packit Service 310c69
  RecoveryCompletion *recovery = decref->recovery;
Packit Service 310c69
  if (enqueueMissingDecref(&recovery->missingDecrefs[0], decref)
Packit Service 310c69
      != VDO_SUCCESS) {
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  if (decref->complete) {
Packit Service 310c69
    // We've already found the mapping for this decref, no fetch needed.
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  BlockMapZone *zone = context;
Packit Service 310c69
  initVDOPageCompletion(&decref->pageCompletion, zone->pageCache,
Packit Service 310c69
                        decref->slot.pbn, false, decref, processFetchedPage,
Packit Service 310c69
                        handleFetchError);
Packit Service 310c69
  getVDOPageAsync(&decref->pageCompletion.completion);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Find all entries which need to be replayed into the slab journals.
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The sub-task completion
Packit Service 310c69
 **/
Packit Service 310c69
static void findSlabJournalEntries(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
  VDO                *vdo      = recovery->vdo;
Packit Service 310c69
Packit Service 310c69
  // We need to be on logical zone 0's thread since we are going to use its
Packit Service 310c69
  // page cache.
Packit Service 310c69
  assertOnLogicalZoneThread(vdo, 0, __func__);
Packit Service 310c69
  int result = findMissingDecrefs(recovery);
Packit Service 310c69
  if (abortRecoveryOnError(result, recovery)) {
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  prepareSubTask(recovery, applyToDepot, finishParentCallback,
Packit Service 310c69
                 ZONE_TYPE_ADMIN);
Packit Service 310c69
Packit Service 310c69
  /*
Packit Service 310c69
   * Increment the incompleteDecrefCount so that the fetch callback can't
Packit Service 310c69
   * complete the sub-task while we are still processing the queue of missing
Packit Service 310c69
   * decrefs.
Packit Service 310c69
   */
Packit Service 310c69
  if (recovery->incompleteDecrefCount++ > 0) {
Packit Service 310c69
    // Fetch block map pages to fill in the incomplete missing decrefs.
Packit Service 310c69
    notifyAllWaiters(&recovery->missingDecrefs[0], launchFetch,
Packit Service 310c69
                     getBlockMapZone(getBlockMap(vdo), 0));
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  if (--recovery->incompleteDecrefCount == 0) {
Packit Service 310c69
    completeCompletion(completion);
Packit Service 310c69
  }
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Find the contiguous range of journal blocks.
Packit Service 310c69
 *
Packit Service 310c69
 * @param recovery  The recovery completion
Packit Service 310c69
 *
Packit Service 310c69
 * @return true if there were valid journal blocks
Packit Service 310c69
 **/
Packit Service 310c69
static bool findContiguousRange(RecoveryCompletion *recovery)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryJournal *journal = recovery->vdo->recoveryJournal;
Packit Service 310c69
  SequenceNumber head
Packit Service 310c69
    = minSequenceNumber(recovery->blockMapHead, recovery->slabJournalHead);
Packit Service 310c69
Packit Service 310c69
  bool foundEntries = false;
Packit Service 310c69
  for (SequenceNumber i = head; i <= recovery->highestTail; i++) {
Packit Service 310c69
    recovery->tail = i;
Packit Service 310c69
    recovery->tailRecoveryPoint = (RecoveryPoint) {
Packit Service 310c69
      .sequenceNumber = i,
Packit Service 310c69
      .sectorCount    = 0,
Packit Service 310c69
      .entryCount     = 0,
Packit Service 310c69
    };
Packit Service 310c69
Packit Service 310c69
    PackedJournalHeader *packedHeader
Packit Service 310c69
      = getJournalBlockHeader(journal, recovery->journalData, i);
Packit Service 310c69
    RecoveryBlockHeader header;
Packit Service 310c69
    unpackRecoveryBlockHeader(packedHeader, &header);
Packit Service 310c69
Packit Service 310c69
    if (!isExactRecoveryJournalBlock(journal, &header, i)
Packit Service 310c69
        || (header.entryCount > journal->entriesPerBlock)) {
Packit Service 310c69
      // A bad block header was found so this must be the end of the journal.
Packit Service 310c69
      break;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    JournalEntryCount blockEntries = header.entryCount;
Packit Service 310c69
    // Examine each sector in turn to determine the last valid sector.
Packit Service 310c69
    for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) {
Packit Service 310c69
      PackedJournalSector *sector = getJournalBlockSector(packedHeader, j);
Packit Service 310c69
Packit Service 310c69
      // A bad sector means that this block was torn.
Packit Service 310c69
      if (!isValidRecoveryJournalSector(&header, sector)) {
Packit Service 310c69
        break;
Packit Service 310c69
      }
Packit Service 310c69
Packit Service 310c69
      JournalEntryCount sectorEntries = minBlock(sector->entryCount,
Packit Service 310c69
                                                 blockEntries);
Packit Service 310c69
      if (sectorEntries > 0) {
Packit Service 310c69
        foundEntries = true;
Packit Service 310c69
        recovery->tailRecoveryPoint.sectorCount++;
Packit Service 310c69
        recovery->tailRecoveryPoint.entryCount = sectorEntries;
Packit Service 310c69
        blockEntries -= sectorEntries;
Packit Service 310c69
      }
Packit Service 310c69
Packit Service 310c69
      // If this sector is short, the later sectors can't matter.
Packit Service 310c69
      if ((sectorEntries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
Packit Service 310c69
          || (blockEntries == 0)) {
Packit Service 310c69
        break;
Packit Service 310c69
      }
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    // If this block was not filled, or if it tore, no later block can matter.
Packit Service 310c69
    if ((header.entryCount != journal->entriesPerBlock)
Packit Service 310c69
        || (blockEntries > 0)) {
Packit Service 310c69
      break;
Packit Service 310c69
    }
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  // Set the tail to the last valid tail block, if there is one.
Packit Service 310c69
  if (foundEntries && (recovery->tailRecoveryPoint.sectorCount == 0)) {
Packit Service 310c69
    recovery->tail--;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  return foundEntries;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Count the number of increment entries in the journal.
Packit Service 310c69
 *
Packit Service 310c69
 * @param recovery  The recovery completion
Packit Service 310c69
 **/
Packit Service 310c69
static int countIncrementEntries(RecoveryCompletion *recovery)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryPoint recoveryPoint = {
Packit Service 310c69
    .sequenceNumber = recovery->blockMapHead,
Packit Service 310c69
    .sectorCount    = 1,
Packit Service 310c69
    .entryCount     = 0,
Packit Service 310c69
  };
Packit Service 310c69
  while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) {
Packit Service 310c69
    RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint);
Packit Service 310c69
    int result = validateRecoveryJournalEntry(recovery->vdo, &entry);
Packit Service 310c69
    if (result != VDO_SUCCESS) {
Packit Service 310c69
      enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
Packit Service 310c69
      return result;
Packit Service 310c69
    }
Packit Service 310c69
    if (isIncrementOperation(entry.operation)) {
Packit Service 310c69
      recovery->increfCount++;
Packit Service 310c69
    }
Packit Service 310c69
    incrementRecoveryPoint(&recoveryPoint);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  return VDO_SUCCESS;
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**
Packit Service 310c69
 * Determine the limits of the valid recovery journal and prepare to replay
Packit Service 310c69
 * into the slab journals and block map.
Packit Service 310c69
 *
Packit Service 310c69
 * @param completion  The sub-task completion
Packit Service 310c69
 **/
Packit Service 310c69
static void prepareToApplyJournalEntries(VDOCompletion *completion)
Packit Service 310c69
{
Packit Service 310c69
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
Packit Service 310c69
  VDO                *vdo      = recovery->vdo;
Packit Service 310c69
  RecoveryJournal    *journal  = vdo->recoveryJournal;
Packit Service 310c69
  logInfo("Finished reading recovery journal");
Packit Service 310c69
  bool foundEntries = findHeadAndTail(journal, recovery->journalData,
Packit Service 310c69
                                      &recovery->highestTail,
Packit Service 310c69
                                      &recovery->blockMapHead,
Packit Service 310c69
                                      &recovery->slabJournalHead);
Packit Service 310c69
  if (foundEntries) {
Packit Service 310c69
    foundEntries = findContiguousRange(recovery);
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  // Both reap heads must be behind the tail.
Packit Service 310c69
  if ((recovery->blockMapHead > recovery->tail)
Packit Service 310c69
      || (recovery->slabJournalHead > recovery->tail)) {
Packit Service 310c69
    int result = logErrorWithStringError(VDO_CORRUPT_JOURNAL,
Packit Service 310c69
                                         "Journal tail too early. "
Packit Service 310c69
                                         "block map head: %" PRIu64
Packit Service 310c69
                                         ", slab journal head: %" PRIu64
Packit Service 310c69
                                         ", tail: %llu",
Packit Service 310c69
                                         recovery->blockMapHead,
Packit Service 310c69
                                         recovery->slabJournalHead,
Packit Service 310c69
                                         recovery->tail);
Packit Service 310c69
    finishCompletion(&recovery->completion, result);
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  if (!foundEntries) {
Packit Service 310c69
    // This message must be recognizable by VDOTest::RebuildBase.
Packit Service 310c69
    logInfo("Replaying 0 recovery entries into block map");
Packit Service 310c69
    // We still need to load the SlabDepot.
Packit Service 310c69
    FREE(recovery->journalData);
Packit Service 310c69
    recovery->journalData = NULL;
Packit Service 310c69
    prepareSubTask(recovery, finishParentCallback, finishParentCallback,
Packit Service 310c69
                   ZONE_TYPE_ADMIN);
Packit Service 310c69
    loadSlabDepot(getSlabDepot(vdo), ADMIN_STATE_LOADING_FOR_RECOVERY,
Packit Service 310c69
                  completion, recovery);
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  logInfo("Highest-numbered recovery journal block has sequence number"
Packit Service 310c69
          " %llu, and the highest-numbered usable block is %"
Packit Service 310c69
          PRIu64, recovery->highestTail, recovery->tail);
Packit Service 310c69
Packit Service 310c69
  if (isReplaying(vdo)) {
Packit Service 310c69
    // We need to know how many entries the block map rebuild completion will
Packit Service 310c69
    // need to hold.
Packit Service 310c69
    int result = countIncrementEntries(recovery);
Packit Service 310c69
    if (result != VDO_SUCCESS) {
Packit Service 310c69
      finishCompletion(&recovery->completion, result);
Packit Service 310c69
      return;
Packit Service 310c69
    }
Packit Service 310c69
Packit Service 310c69
    // We need to access the block map from a logical zone.
Packit Service 310c69
    prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback,
Packit Service 310c69
                   ZONE_TYPE_LOGICAL);
Packit Service 310c69
    loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion,
Packit Service 310c69
                  recovery);
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  int result = computeUsages(recovery);
Packit Service 310c69
  if (abortRecoveryOnError(result, recovery)) {
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  prepareSubTask(recovery, findSlabJournalEntries, finishParentCallback,
Packit Service 310c69
                 ZONE_TYPE_LOGICAL);
Packit Service 310c69
  invokeCallback(completion);
Packit Service 310c69
}
Packit Service 310c69
Packit Service 310c69
/**********************************************************************/
Packit Service 310c69
void launchRecovery(VDO *vdo, VDOCompletion *parent)
Packit Service 310c69
{
Packit Service 310c69
  // Note: This message must be recognizable by Permabit::VDODeviceBase.
Packit Service 310c69
  logWarning("Device was dirty, rebuilding reference counts");
Packit Service 310c69
Packit Service 310c69
  RecoveryCompletion *recovery;
Packit Service 310c69
  int result = makeRecoveryCompletion(vdo, &recovery);
Packit Service 310c69
  if (result != VDO_SUCCESS) {
Packit Service 310c69
    finishCompletion(parent, result);
Packit Service 310c69
    return;
Packit Service 310c69
  }
Packit Service 310c69
Packit Service 310c69
  VDOCompletion *completion = &recovery->completion;
Packit Service 310c69
  prepareCompletion(completion, finishRecovery, abortRecovery,
Packit Service 310c69
                    parent->callbackThreadID, parent);
Packit Service 310c69
  prepareSubTask(recovery, prepareToApplyJournalEntries, finishParentCallback,
Packit Service 310c69
                 ZONE_TYPE_ADMIN);
Packit Service 310c69
  loadJournalAsync(vdo->recoveryJournal, &recovery->subTaskCompletion,
Packit Service 310c69
                   &recovery->journalData);
Packit Service 310c69
}