/*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*
* $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.c#16 $
*/
#include "vdoRecoveryInternals.h"
#include "logger.h"
#include "memoryAlloc.h"
#include "blockAllocator.h"
#include "blockAllocatorInternals.h"
#include "blockMapInternals.h"
#include "blockMapPage.h"
#include "blockMapRecovery.h"
#include "completion.h"
#include "numUtils.h"
#include "packedRecoveryJournalBlock.h"
#include "recoveryJournal.h"
#include "recoveryUtils.h"
#include "slab.h"
#include "slabDepot.h"
#include "slabJournal.h"
#include "slabJournalInternals.h"
#include "vdoInternal.h"
#include "waitQueue.h"
// Sizing constants for recovery bookkeeping, both derived from
// MAXIMUM_USER_VIOS.
enum {
// The int map needs capacity of twice the number of VIOs in the system.
// This is the capacity passed to makeIntMap() when a recovery completion
// is created.
INT_MAP_CAPACITY = MAXIMUM_USER_VIOS * 2,
// There can be as many missing decrefs as there are VIOs in the system.
MAXIMUM_SYNTHESIZED_DECREFS = MAXIMUM_USER_VIOS,
};
/**
* Bookkeeping for one increment found in the recovery journal whose matching
* decrement was not found, so that the decrement can be synthesized.
*
* The waiter must remain the first member: asMissingDecref() statically
* asserts that its offset is zero and casts a Waiter pointer directly back
* to the enclosing MissingDecref.
**/
typedef struct missingDecref {
/** A waiter for queueing this object */
Waiter waiter;
/** The parent of this object */
RecoveryCompletion *recovery;
/** Whether this decref is complete (set once its penultimate mapping has
* been recorded by recordMissingDecref()) */
bool complete;
/** The slot for which the last decref was lost */
BlockMapSlot slot;
/** The penultimate block map entry for this LBN */
DataLocation penultimateMapping;
/** The page completion used to fetch the block map page for this LBN */
VDOPageCompletion pageCompletion;
/** The journal point which will be used for this entry */
JournalPoint journalPoint;
/** The slab journal to which this entry will be applied */
SlabJournal *slabJournal;
} MissingDecref;
/**
 * Recover the MissingDecref which embeds a given Waiter.
 *
 * The conversion is a plain cast, which is only valid because the waiter is
 * the first member of MissingDecref (checked at compile time below).
 *
 * @param waiter  The Waiter embedded in a MissingDecref
 *
 * @return The MissingDecref containing the Waiter
 **/
__attribute__((warn_unused_result))
static inline MissingDecref *asMissingDecref(Waiter *waiter)
{
  STATIC_ASSERT(offsetof(MissingDecref, waiter) == 0);
  MissingDecref *decref = (MissingDecref *) waiter;
  return decref;
}
/**
 * Put a MissingDecref on a wait queue.
 *
 * If the enqueue fails, the VDO is put into read-only mode, the failure is
 * recorded on the parent recovery completion, and the decref is freed; the
 * caller must not touch the decref again in that case.
 *
 * @param queue   The queue on which to enqueue the decref
 * @param decref  The MissingDecref to enqueue
 *
 * @return VDO_SUCCESS or an error
 **/
static int enqueueMissingDecref(WaitQueue *queue, MissingDecref *decref)
{
  int result = enqueueWaiter(queue, &decref->waiter);
  if (result == VDO_SUCCESS) {
    return VDO_SUCCESS;
  }

  // The decref could not be queued, so nothing else will ever free it.
  RecoveryCompletion *recovery = decref->recovery;
  enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
  setCompletionResult(&recovery->completion, result);
  FREE(decref);
  return result;
}
/**
 * Fold a BlockMapSlot into a single uint64_t key.
 *
 * The page's PBN is shifted left by 10 bits and the slot index is added in.
 * NOTE(review): the key is only unique if slot indexes are below 1024 and
 * the PBN fits in 54 bits -- confirm against the block map page geometry.
 *
 * @param slot  The block map slot to convert
 *
 * @return A number identifying the slot
 **/
static uint64_t slotAsNumber(BlockMapSlot slot)
{
  uint64_t pageBits = ((uint64_t) slot.pbn) << 10;
  return (pageBits + slot.slot);
}
/**
 * Allocate a MissingDecref for an increment which has no matching decrement
 * and put it on the first missing decref queue, where it waits for its
 * penultimate mapping to be determined.
 *
 * @param [in]  recovery   The parent recovery completion
 * @param [in]  entry      The recovery journal entry for the increment which
 *                         is missing a decref
 * @param [out] decrefPtr  A pointer to hold the new MissingDecref; it is only
 *                         written on success
 *
 * @return VDO_SUCCESS or an error code
 **/
__attribute__((warn_unused_result))
static int makeMissingDecref(RecoveryCompletion *recovery,
                             RecoveryJournalEntry entry,
                             MissingDecref **decrefPtr)
{
  MissingDecref *newDecref;
  int result = ALLOCATE(1, MissingDecref, __func__, &newDecref);
  if (result != VDO_SUCCESS) {
    return result;
  }

  // The parent must be set before enqueueing since the enqueue failure path
  // reports through it (and frees the decref).
  newDecref->recovery = recovery;
  result = enqueueMissingDecref(&recovery->missingDecrefs[0], newDecref);
  if (result != VDO_SUCCESS) {
    return result;
  }

  /*
   * Every synthesized decref must have its own journal point; without one,
   * a crash would leave us unable to tell which synthesized decrefs had
   * already been committed to the slab journals. Rather than consuming real
   * recovery journal space, fake journal points are used which lie between
   * the last currently valid entry in the tail block and the first entry of
   * the next block. The entry count can't overflow because the number of
   * synthesized decrefs is bounded by the DataVIO limit.
   *
   * A given missing decref must always receive the same fake journal point,
   * since a failed recovery may be retried with a different number of zones
   * after some slab journal blocks have been written. Because the missing
   * decrefs are always read from the journal in the same order, assigning
   * the point at read time achieves this. Later use of these points ensures
   * that, within any one slab journal, the decrefs are applied in the order
   * the points dictate.
   */
  newDecref->slot         = entry.slot;
  newDecref->journalPoint = recovery->nextSynthesizedJournalPoint;
  recovery->nextSynthesizedJournalPoint.entryCount += 1;
  recovery->missingDecrefCount    += 1;
  recovery->incompleteDecrefCount += 1;

  *decrefPtr = newDecref;
  return VDO_SUCCESS;
}
/**
 * Step a recovery point forward by one entry, rolling over to the next
 * sector or to sector 1 of the next block as needed.
 *
 * @param point  The recovery point to alter
 **/
static void incrementRecoveryPoint(RecoveryPoint *point)
{
  point->entryCount++;

  // The last sector of a block holds a different number of entries; filling
  // it moves the point to the first entry sector of the following block.
  if (point->sectorCount == (SECTORS_PER_BLOCK - 1)) {
    if (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR) {
      point->sequenceNumber++;
      point->sectorCount = 1;
      point->entryCount  = 0;
    }
  }

  // Filling any sector moves the point to the start of the next sector.
  if (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) {
    point->sectorCount++;
    point->entryCount = 0;
  }
}
/**
 * Step a recovery point backward by one entry, rolling back to the previous
 * sector or to the last sector of the previous block as needed.
 *
 * @param point  The recovery point to alter
 **/
static void decrementRecoveryPoint(RecoveryPoint *point)
{
  STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR > 0);

  // The common case: stay within the current sector.
  if (point->entryCount != 0) {
    point->entryCount--;
    return;
  }

  // At the start of a later sector: move to the end of the previous sector.
  if (point->sectorCount > 1) {
    point->sectorCount--;
    point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR - 1;
    return;
  }

  // At the start of a block: move to the final entry of the previous block.
  point->sequenceNumber--;
  point->sectorCount = SECTORS_PER_BLOCK - 1;
  point->entryCount  = RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR - 1;
}
/**
 * Check whether one recovery point strictly precedes another. Points are
 * ordered by sequence number, then sector, then entry.
 *
 * @param first   The first recovery point
 * @param second  The second recovery point
 *
 * @return true if the first point precedes the second point
 **/
__attribute__((warn_unused_result))
static bool beforeRecoveryPoint(const RecoveryPoint *first,
                                const RecoveryPoint *second)
{
  if (first->sequenceNumber != second->sequenceNumber) {
    return (first->sequenceNumber < second->sequenceNumber);
  }

  if (first->sectorCount != second->sectorCount) {
    return (first->sectorCount < second->sectorCount);
  }

  return (first->entryCount < second->entryCount);
}
/**
 * Set up the sub-task completion for the next step of recovery, choosing the
 * thread on which it will run from the requested zone type.
 *
 * @param recovery      The RecoveryCompletion whose sub-task completion is to
 *                      be prepared
 * @param callback      The callback to register for the next sub-task
 * @param errorHandler  The error handler for the next sub-task
 * @param zoneType      The type of zone on which the callback or errorHandler
 *                      should run; anything other than logical or physical
 *                      selects the admin thread
 **/
static void prepareSubTask(RecoveryCompletion *recovery,
                           VDOAction callback,
                           VDOAction errorHandler,
                           ZoneType zoneType)
{
  const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo);

  ThreadID threadID;
  if (zoneType == ZONE_TYPE_LOGICAL) {
    // All blockmap access is done on single thread, so use logical zone 0.
    threadID = getLogicalZoneThread(threadConfig, 0);
  } else if (zoneType == ZONE_TYPE_PHYSICAL) {
    threadID = recovery->allocator->threadID;
  } else {
    // ZONE_TYPE_ADMIN, and the fallback for any other zone type.
    threadID = getAdminThread(threadConfig);
  }

  prepareCompletion(&recovery->subTaskCompletion, callback, errorHandler,
                    threadID, recovery);
}
/**********************************************************************/
/*
 * Allocate a RecoveryCompletion with one missing decref queue per physical
 * zone, initialize its two completions, and create its slot-to-entry map.
 * On any failure after allocation, whatever was built is torn down by
 * freeRecoveryCompletion() and *recoveryPtr is left untouched.
 */
int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr)
{
  const ThreadConfig *threadConfig = getThreadConfig(vdo);
  RecoveryCompletion *recovery;
  int result = ALLOCATE_EXTENDED(RecoveryCompletion,
                                 threadConfig->physicalZoneCount, RingNode,
                                 __func__, &recovery);
  if (result != VDO_SUCCESS) {
    return result;
  }

  // The VDO must be set first: freeRecoveryCompletion() needs it to find
  // the thread configuration.
  recovery->vdo = vdo;
  for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) {
    initializeWaitQueue(&recovery->missingDecrefs[zone]);
  }

  // Run the remaining setup steps in order, stopping at the first failure.
  result = initializeEnqueueableCompletion(&recovery->completion,
                                           RECOVERY_COMPLETION, vdo->layer);
  if (result == VDO_SUCCESS) {
    result = initializeEnqueueableCompletion(&recovery->subTaskCompletion,
                                             SUB_TASK_COMPLETION, vdo->layer);
  }

  if (result == VDO_SUCCESS) {
    result = makeIntMap(INT_MAP_CAPACITY, 0, &recovery->slotEntryMap);
  }

  if (result != VDO_SUCCESS) {
    freeRecoveryCompletion(&recovery);
    return result;
  }

  *recoveryPtr = recovery;
  return VDO_SUCCESS;
}
/**
 * A waiter callback which frees the MissingDecref containing the waiter.
 * The context argument is ignored.
 *
 * Implements WaiterCallback.
 **/
static void freeMissingDecref(Waiter *waiter,
                              void *context __attribute__((unused)))
{
  MissingDecref *decref = asMissingDecref(waiter);
  FREE(decref);
}
/**********************************************************************/
/*
 * Free a RecoveryCompletion along with everything it owns (the slot map,
 * any MissingDecrefs still queued, the journal data, and the entry array),
 * then null out the caller's pointer. A NULL completion is a no-op.
 */
void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr)
{
  RecoveryCompletion *doomed = *recoveryPtr;
  if (doomed == NULL) {
    return;
  }

  freeIntMap(&doomed->slotEntryMap);

  // Any decrefs still waiting on a per-zone queue are freed as they are
  // notified.
  const ThreadConfig *threadConfig = getThreadConfig(doomed->vdo);
  for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) {
    notifyAllWaiters(&doomed->missingDecrefs[zone], freeMissingDecref, NULL);
  }

  FREE(doomed->journalData);
  FREE(doomed->entries);
  destroyEnqueueable(&doomed->subTaskCompletion);
  destroyEnqueueable(&doomed->completion);
  FREE(doomed);
  *recoveryPtr = NULL;
}
/**
 * Finish a successful recovery: count it, reinitialize the recovery journal,
 * free the recovery completion, allocate slab reference counts, and finish
 * the parent with the result of that allocation.
 *
 * @param completion  The recovery completion
 **/
static void finishRecovery(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion);
  VDOCompletion      *parent   = completion->parent;
  VDO                *vdo      = recovery->vdo;

  uint64_t completedCount = ++vdo->completeRecoveries;
  initializeRecoveryJournalPostRecovery(vdo->recoveryJournal,
                                        completedCount,
                                        recovery->highestTail);

  // Everything still needed (parent, vdo) was captured above, so the
  // completion may now be released.
  freeRecoveryCompletion(&recovery);
  logInfo("Rebuild complete.");

  // Now that we've freed the recovery completion and its vast array of
  // journal entries, we can allocate refcounts.
  finishCompletion(parent, allocateSlabRefCounts(vdo->depot));
}
/**
 * Handle a recovery error: free the recovery completion and finish the
 * parent with the error which was recorded on the completion.
 *
 * @param completion  The recovery completion
 **/
static void abortRecovery(VDOCompletion *completion)
{
  // Capture what is needed from the completion before it is freed.
  RecoveryCompletion *recovery = asRecoveryCompletion(completion);
  int                 result   = completion->result;
  VDOCompletion      *parent   = completion->parent;

  freeRecoveryCompletion(&recovery);
  logWarning("Recovery aborted");
  finishCompletion(parent, result);
}
/**
 * Finish the recovery completion with the given result if that result is an
 * error; do nothing on success.
 *
 * @param result    The result to check
 * @param recovery  The recovery completion
 *
 * @return true if the result was an error
 **/
__attribute__((warn_unused_result))
static bool abortRecoveryOnError(int result, RecoveryCompletion *recovery)
{
  bool failed = (result != VDO_SUCCESS);
  if (failed) {
    finishCompletion(&recovery->completion, result);
  }

  return failed;
}
/**
* Unpack the recovery journal entry associated with the given recovery point.
*
* @param recovery The recovery completion
* @param point The recovery point
*
* @return The unpacked contents of the matching recovery journal entry
**/
static RecoveryJournalEntry getEntry(const RecoveryCompletion *recovery,
const RecoveryPoint *point)
{
RecoveryJournal *journal = recovery->vdo->recoveryJournal;
PhysicalBlockNumber blockNumber
= getRecoveryJournalBlockNumber(journal, point->sequenceNumber);
off_t sectorOffset
= (blockNumber * VDO_BLOCK_SIZE) + (point->sectorCount * VDO_SECTOR_SIZE);
PackedJournalSector *sector
= (PackedJournalSector *) &recovery->journalData[sectorOffset];
return unpackRecoveryJournalEntry(§or->entries[point->entryCount]);
}
/**
 * Build the array of all valid increment entries, in journal order starting
 * from the block map head, and store it in the recovery completion. Each
 * stored mapping is numbered with its position in the array.
 *
 * Any invalid entry puts the VDO into read-only mode and ends the extraction.
 *
 * @param recovery  The recovery completion
 *
 * @return VDO_SUCCESS or an error code
 **/
static int extractJournalEntries(RecoveryCompletion *recovery)
{
  // Allocate a NumberedBlockMapping array just large enough to transcribe
  // every increment PackedRecoveryJournalEntry from every valid journal block.
  int result = ALLOCATE(recovery->increfCount, NumberedBlockMapping, __func__,
                        &recovery->entries);
  if (result != VDO_SUCCESS) {
    return result;
  }

  for (RecoveryPoint point = {
         .sequenceNumber = recovery->blockMapHead,
         .sectorCount    = 1,
         .entryCount     = 0,
       };
       beforeRecoveryPoint(&point, &recovery->tailRecoveryPoint);
       incrementRecoveryPoint(&point)) {
    RecoveryJournalEntry entry = getEntry(recovery, &point);
    result = validateRecoveryJournalEntry(recovery->vdo, &entry);
    if (result != VDO_SUCCESS) {
      enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
      return result;
    }

    // Only increments are transcribed.
    if (!isIncrementOperation(entry.operation)) {
      continue;
    }

    NumberedBlockMapping *mapping = &recovery->entries[recovery->entryCount];
    *mapping = (NumberedBlockMapping) {
      .blockMapSlot  = entry.slot,
      .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state),
      .number        = recovery->entryCount,
    };
    recovery->entryCount++;
  }

  result = ASSERT((recovery->entryCount <= recovery->increfCount),
                  "approximate incref count is an upper bound");
  if (result != VDO_SUCCESS) {
    enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
  }

  return result;
}
/**
 * Extract the journal's increment entries and hand them to the block map
 * recovery, which will finish the recovery completion when it is done. This
 * callback is registered in startSuperBlockSave().
 *
 * @param completion  The sub-task completion
 **/
static void launchBlockMapRecovery(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
  assertOnLogicalZoneThread(recovery->vdo, 0, __func__);

  // Extract the journal entries for the block map recovery.
  if (abortRecoveryOnError(extractJournalEntries(recovery), recovery)) {
    return;
  }

  prepareToFinishParent(completion, &recovery->completion);
  recoverBlockMap(recovery->vdo, recovery->entryCount, recovery->entries,
                  completion);
}
/**
 * Mark the VDO as replaying and start an asynchronous save of the VDO
 * components, after which block map recovery will be launched. This callback
 * is registered in finishRecoveringDepot(), so it runs once the slab depot
 * drain has completed.
 *
 * @param completion  The sub-task completion
 **/
static void startSuperBlockSave(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
  assertOnAdminThread(recovery->vdo, __func__);

  logInfo("Saving recovery progress");
  recovery->vdo->state = VDO_REPLAYING;

  // The block map access which follows the super block save must be done
  // on a logical thread.
  prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback,
                 ZONE_TYPE_LOGICAL);
  saveVDOComponentsAsync(recovery->vdo, completion);
}
/**
 * The callback from loading the slab depot. It logs replay statistics,
 * copies the recovered logical block and block map data block counts into
 * the recovery journal, and then drains the slab depot in order to commit
 * the recovered slab journals. It is registered in applyToDepot().
 *
 * @param completion  The sub-task completion
 **/
static void finishRecoveringDepot(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
  VDO                *vdo      = recovery->vdo;
  assertOnAdminThread(vdo, __func__);

  logInfo("Replayed %zu journal entries into slab journals",
          recovery->entriesAddedToSlabJournals);
  logInfo("Synthesized %zu missing journal entries",
          recovery->missingDecrefCount);

  RecoveryJournal *journal    = vdo->recoveryJournal;
  journal->logicalBlocksUsed  = recovery->logicalBlocksUsed;
  journal->blockMapDataBlocks = recovery->blockMapDataBlocks;

  // Once the drain is done, save the super block on the admin thread.
  prepareSubTask(recovery, startSuperBlockSave, finishParentCallback,
                 ZONE_TYPE_ADMIN);
  drainSlabDepot(vdo->depot, ADMIN_STATE_RECOVERING, completion);
}
/**
 * The error handler for recovering slab journals. It abandons any remaining
 * replay for the allocator currently being recovered by reporting the
 * completion's error as that allocator's slab journal recovery result. It is
 * registered in addSlabJournalEntries() and addSynthesizedEntries().
 *
 * @param completion  The completion of the block allocator being recovered
 **/
static void handleAddSlabJournalEntryError(VDOCompletion *completion)
{
  int                 result   = completion->result;
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
  notifySlabJournalsAreRecovered(recovery->allocator, result);
}
/**
 * Apply the synthesized decrefs queued for the current allocator's zone to
 * their slab journals. If a replay attempt cannot proceed, this function
 * returns with itself registered as the completion's callback, leaving the
 * decref at the head of the queue to be retried.
 *
 * @param completion  The allocator completion
 **/
static void addSynthesizedEntries(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);

  // Get ready in case we need to enqueue again
  prepareCompletion(completion, addSynthesizedEntries,
                    handleAddSlabJournalEntryError,
                    completion->callbackThreadID, recovery);

  WaitQueue *zoneQueue
    = &recovery->missingDecrefs[recovery->allocator->zoneNumber];
  while (hasWaiters(zoneQueue)) {
    MissingDecref *next = asMissingDecref(getFirstWaiter(zoneQueue));
    bool replayed
      = attemptReplayIntoSlabJournal(next->slabJournal,
                                     next->penultimateMapping.pbn,
                                     DATA_DECREMENT, &next->journalPoint,
                                     completion);
    if (!replayed) {
      // The decref stays at the head of the queue for the next attempt.
      return;
    }

    dequeueNextWaiter(zoneQueue);
    FREE(next);
  }

  notifySlabJournalsAreRecovered(recovery->allocator, VDO_SUCCESS);
}
/**
 * Compute the logical blocks used and block map data block counts as of the
 * end of the journal (not including any changes from entries which will be
 * synthesized later). The counts start from the values in the tail block's
 * header and are then adjusted for each mapped entry in the tail block.
 *
 * @param recovery  The recovery completion
 *
 * @return VDO_SUCCESS or an error
 **/
static int computeUsages(RecoveryCompletion *recovery)
{
  RecoveryJournal     *journal = recovery->vdo->recoveryJournal;
  PackedJournalHeader *tailHeader
    = getJournalBlockHeader(journal, recovery->journalData, recovery->tail);

  RecoveryBlockHeader unpacked;
  unpackRecoveryBlockHeader(tailHeader, &unpacked);
  recovery->logicalBlocksUsed  = unpacked.logicalBlocksUsed;
  recovery->blockMapDataBlocks = unpacked.blockMapDataBlocks;

  for (RecoveryPoint point = {
         .sequenceNumber = recovery->tail,
         .sectorCount    = 1,
         .entryCount     = 0,
       };
       beforeRecoveryPoint(&point, &recovery->tailRecoveryPoint);
       incrementRecoveryPoint(&point)) {
    RecoveryJournalEntry entry = getEntry(recovery, &point);

    // Entries for unmapped locations don't affect either count.
    if (!isMappedLocation(&entry.mapping)) {
      continue;
    }

    switch (entry.operation) {
    case DATA_INCREMENT:
      recovery->logicalBlocksUsed += 1;
      break;

    case DATA_DECREMENT:
      recovery->logicalBlocksUsed -= 1;
      break;

    case BLOCK_MAP_INCREMENT:
      recovery->blockMapDataBlocks += 1;
      break;

    default:
      return logErrorWithStringError(VDO_CORRUPT_JOURNAL,
                                     "Recovery journal entry at "
                                     "sequence number %" PRIu64
                                     ", sector %u, entry %u had invalid "
                                     "operation %u",
                                     point.sequenceNumber,
                                     point.sectorCount,
                                     point.entryCount,
                                     entry.operation);
    }
  }

  return VDO_SUCCESS;
}
/**
 * Move both the next recovery point and the next journal point of a
 * recovery forward by one entry.
 *
 * @param recovery         The RecoveryCompletion whose points are to be
 *                         advanced
 * @param entriesPerBlock  The number of entries in a recovery journal block
 **/
static void advancePoints(RecoveryCompletion *recovery,
                          JournalEntryCount entriesPerBlock)
{
  RecoveryPoint *recoveryPoint = &recovery->nextRecoveryPoint;
  incrementRecoveryPoint(recoveryPoint);

  JournalPoint *journalPoint = &recovery->nextJournalPoint;
  advanceJournalPoint(journalPoint, entriesPerBlock);
}
/**
 * Replay recovery journal entries into the slab journals of the allocator
 * currently being recovered, waiting for slab journal tailblock space when
 * necessary. This method is its own callback: when a replay attempt cannot
 * proceed it returns without advancing, so the same entry is retried on the
 * next invocation. Once every entry has been replayed, the synthesized
 * entries are added.
 *
 * @param completion  The allocator completion
 **/
static void addSlabJournalEntries(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
  VDO                *vdo      = recovery->vdo;
  RecoveryJournal    *journal  = vdo->recoveryJournal;

  // Get ready in case we need to enqueue again.
  prepareCompletion(completion, addSlabJournalEntries,
                    handleAddSlabJournalEntryError,
                    completion->callbackThreadID, recovery);

  RecoveryPoint *point = &recovery->nextRecoveryPoint;
  while (beforeRecoveryPoint(point, &recovery->tailRecoveryPoint)) {
    RecoveryJournalEntry entry = getEntry(recovery, point);
    int result = validateRecoveryJournalEntry(vdo, &entry);
    if (result != VDO_SUCCESS) {
      enterReadOnlyMode(journal->readOnlyNotifier, result);
      finishCompletion(completion, result);
      return;
    }

    // Entries for the zero block are never replayed.
    if (entry.mapping.pbn != ZERO_BLOCK) {
      Slab *slab = getSlab(vdo->depot, entry.mapping.pbn);

      // Only entries belonging to the allocator being recovered are
      // replayed here.
      if (slab->allocator == recovery->allocator) {
        if (!attemptReplayIntoSlabJournal(slab->journal, entry.mapping.pbn,
                                          entry.operation,
                                          &recovery->nextJournalPoint,
                                          completion)) {
          return;
        }

        recovery->entriesAddedToSlabJournals++;
      }
    }

    advancePoints(recovery, journal->entriesPerBlock);
  }

  logInfo("Recreating missing journal entries for zone %u",
          recovery->allocator->zoneNumber);
  addSynthesizedEntries(completion);
}
/**********************************************************************/
/*
 * Begin replaying recovery journal entries into the slab journals of one
 * allocator. If no journal data was loaded, or the VDO is already replaying,
 * the allocator is immediately told that its slab journals are recovered.
 * Otherwise, replay starts from the first entry of the slab journal head
 * block.
 */
void replayIntoSlabJournals(BlockAllocator *allocator,
                            VDOCompletion *completion,
                            void *context)
{
  RecoveryCompletion *recovery = context;
  assertOnPhysicalZoneThread(recovery->vdo, allocator->zoneNumber, __func__);

  bool nothingToReplay
    = ((recovery->journalData == NULL) || isReplaying(recovery->vdo));
  if (nothingToReplay) {
    notifySlabJournalsAreRecovered(allocator, VDO_SUCCESS);
    return;
  }

  SequenceNumber head = recovery->slabJournalHead;
  recovery->allocator = allocator;
  recovery->nextRecoveryPoint = (RecoveryPoint) {
    .sequenceNumber = head,
    .sectorCount    = 1,
    .entryCount     = 0,
  };
  recovery->nextJournalPoint = (JournalPoint) {
    .sequenceNumber = head,
    .entryCount     = 0,
  };

  logInfo("Replaying entries into slab journals for zone %u",
          allocator->zoneNumber);
  completion->parent = recovery;
  addSlabJournalEntries(completion);
}
/**
 * A waiter callback which moves a MissingDecref onto the queue for the
 * physical zone in which it will be applied. The logical blocks used count
 * is decremented if the penultimate mapping is a mapped location, and
 * decrefs of the zero block are simply freed.
 *
 * Implements WaiterCallback.
 *
 * @param waiter   The waiter of the MissingDecref
 * @param context  The SlabDepot
 **/
static void queueOnPhysicalZone(Waiter *waiter, void *context)
{
  MissingDecref      *decref   = asMissingDecref(waiter);
  RecoveryCompletion *recovery = decref->recovery;
  DataLocation        mapping  = decref->penultimateMapping;

  if (isMappedLocation(&mapping)) {
    recovery->logicalBlocksUsed--;
  }

  if (mapping.pbn != ZERO_BLOCK) {
    SlabDepot *depot = context;
    decref->slabJournal = getSlabJournal(depot, mapping.pbn);
    ZoneCount zoneNumber = decref->slabJournal->slab->allocator->zoneNumber;
    // A failed enqueue records its error on the recovery completion.
    enqueueMissingDecref(&recovery->missingDecrefs[zoneNumber], decref);
    return;
  }

  // Decrefs of zero are not applied to slab journals.
  FREE(decref);
}
/**
 * Distribute the missing decrefs to the queues of the physical zones in
 * which they will be applied, then load the slab depot for recovery. This
 * callback is registered in findSlabJournalEntries().
 *
 * @param completion  The sub-task completion
 **/
static void applyToDepot(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
  assertOnAdminThread(recovery->vdo, __func__);
  prepareSubTask(recovery, finishRecoveringDepot, finishParentCallback,
                 ZONE_TYPE_ADMIN);

  SlabDepot *depot = getSlabDepot(recovery->vdo);
  notifyAllWaiters(&recovery->missingDecrefs[0], queueOnPhysicalZone, depot);

  // An enqueue failure during the distribution above is recorded on the
  // recovery completion rather than returned.
  int queueingResult = recovery->completion.result;
  if (abortRecoveryOnError(queueingResult, recovery)) {
    return;
  }

  loadSlabDepot(depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery);
}
/**
 * Validate the penultimate mapping found for a MissingDecref. The count of
 * incomplete decrefs is decremented in every case. If the location is valid
 * and refers to a physical data block, it is stored in the decref, which is
 * then marked complete. Otherwise the VDO enters read-only mode and the
 * error is recorded on the recovery completion and logged.
 *
 * @param decref     The decref whose penultimate mapping has just been found
 * @param location   The penultimate mapping
 * @param errorCode  The error code to use if the location is invalid
 *
 * @return VDO_SUCCESS, or errorCode if the location is invalid
 **/
static int recordMissingDecref(MissingDecref *decref,
                               DataLocation location,
                               int errorCode)
{
  RecoveryCompletion *recovery = decref->recovery;
  recovery->incompleteDecrefCount--;

  bool usable = (isValidLocation(&location)
                 && isPhysicalDataBlock(recovery->vdo->depot, location.pbn));
  if (!usable) {
    // The location was invalid
    enterReadOnlyMode(recovery->vdo->readOnlyNotifier, errorCode);
    setCompletionResult(&recovery->completion, errorCode);
    logErrorWithStringError(errorCode,
                            "Invalid mapping for pbn %llu with state %u",
                            location.pbn, location.state);
    return errorCode;
  }

  decref->penultimateMapping = location;
  decref->complete           = true;
  return VDO_SUCCESS;
}
/**
 * Find the block map slots with missing decrefs.
 *
 * The journal is walked backwards from the tail, so that a decref is seen
 * before the incref it is paired with; an incref encountered without a
 * previously seen decref is therefore known to be missing its decref.
 *
 * While walking, an attempt is also made to determine what each missing
 * decref should be. If a slot with a missing decref has at least two
 * increfs, the PBN to decref is the one from the penultimate incref.
 * Otherwise there is only one incref for the slot, and the decref must be
 * synthesized from the block map rather than from the recovery journal.
 *
 * The number of increfs seen is accumulated in recovery->increfCount.
 *
 * @param recovery  The recovery completion
 *
 * @return VDO_SUCCESS or an error code
 **/
__attribute__((warn_unused_result))
static int findMissingDecrefs(RecoveryCompletion *recovery)
{
  IntMap *slotEntryMap = recovery->slotEntryMap;

  // This placeholder decref is used to mark lbns for which we have observed a
  // decref but not the paired incref (going backwards through the journal).
  // Only its address is ever used.
  MissingDecref foundDecref;

  // A buffer is allocated based on the number of incRef entries found, so use
  // the earliest head.
  RecoveryPoint headPoint = {
    .sequenceNumber = minSequenceNumber(recovery->blockMapHead,
                                        recovery->slabJournalHead),
    .sectorCount    = 1,
    .entryCount     = 0,
  };

  // Set up for the first fake journal point that will be used for a
  // synthesized entry.
  recovery->nextSynthesizedJournalPoint = (JournalPoint) {
    .sequenceNumber = recovery->tail,
    .entryCount     = recovery->vdo->recoveryJournal->entriesPerBlock,
  };

  RecoveryPoint cursor = recovery->tailRecoveryPoint;
  while (beforeRecoveryPoint(&headPoint, &cursor)) {
    decrementRecoveryPoint(&cursor);
    RecoveryJournalEntry entry = getEntry(recovery, &cursor);
    uint64_t key = slotAsNumber(entry.slot);
    int result;

    if (!isIncrementOperation(entry.operation)) {
      // Observe that we've seen a decref before its incref, but only if
      // the IntMap does not contain an unpaired incref for this lbn.
      result = intMapPut(slotEntryMap, key, &foundDecref, false, NULL);
      if (result != VDO_SUCCESS) {
        return result;
      }

      continue;
    }

    recovery->increfCount++;
    MissingDecref *decref = intMapRemove(slotEntryMap, key);

    if (entry.operation == BLOCK_MAP_INCREMENT) {
      if (decref == NULL) {
        // There are no decrefs for block map pages, so they can't be missing.
        continue;
      }

      return logErrorWithStringError(VDO_CORRUPT_JOURNAL,
                                     "decref found for block map block %"
                                     PRIu64 " with state %u",
                                     entry.mapping.pbn, entry.mapping.state);
    }

    if (decref == &foundDecref) {
      // This incref already had a decref in the intmap, so we know it is
      // not missing its decref.
      continue;
    }

    if (decref != NULL) {
      /*
       * This MissingDecref was left here by an incref without a decref.
       * We now know what its penultimate mapping is, and all entries
       * before here in the journal are paired, decref before incref, so
       * we needn't remember it in the intmap any longer.
       */
      result = recordMissingDecref(decref, entry.mapping,
                                   VDO_CORRUPT_JOURNAL);
      if (result != VDO_SUCCESS) {
        return result;
      }

      continue;
    }

    // This incref is missing a decref. Add a missing decref object and
    // remember it in case an earlier incref for the same slot turns up.
    result = makeMissingDecref(recovery, entry, &decref);
    if (result != VDO_SUCCESS) {
      return result;
    }

    result = intMapPut(slotEntryMap, key, decref, false, NULL);
    if (result != VDO_SUCCESS) {
      return result;
    }
  }

  return VDO_SUCCESS;
}
/**
 * Process a fetched block map page for a missing decref: read the current
 * mapping for the decref's slot from the page, record it as the decref's
 * penultimate mapping, and complete the sub-task if no incomplete decrefs
 * remain. This callback is registered in launchFetch().
 *
 * @param completion  The page completion which has just finished loading
 **/
static void processFetchedPage(VDOCompletion *completion)
{
  MissingDecref      *decref   = completion->parent;
  RecoveryCompletion *recovery = decref->recovery;
  assertOnLogicalZoneThread(recovery->vdo, 0, __func__);

  // Copy the mapping out of the page before the page is released.
  const BlockMapPage *page = dereferenceReadableVDOPage(completion);
  DataLocation location
    = unpackBlockMapEntry(&page->entries[decref->slot.slot]);
  releaseVDOPageCompletion(completion);

  // An invalid mapping is reported through the recovery completion, so the
  // return value is not needed here.
  (void) recordMissingDecref(decref, location, VDO_BAD_MAPPING);
  if (recovery->incompleteDecrefCount != 0) {
    return;
  }

  completeCompletion(&recovery->subTaskCompletion);
}
/**
 * Handle an error fetching a block map page for a missing decref. The error
 * is recorded on the sub-task completion, which is completed if this was
 * the last incomplete decref. This error handler is registered in
 * launchFetch().
 *
 * @param completion  The page completion which has just failed to load
 **/
static void handleFetchError(VDOCompletion *completion)
{
  MissingDecref      *decref   = completion->parent;
  RecoveryCompletion *recovery = decref->recovery;
  assertOnLogicalZoneThread(recovery->vdo, 0, __func__);

  // If we got a VDO_OUT_OF_RANGE error, it is because the pbn we read from
  // the journal was bad, so convert the error code
  int result = completion->result;
  if (result == VDO_OUT_OF_RANGE) {
    result = VDO_CORRUPT_JOURNAL;
  }

  setCompletionResult(&recovery->subTaskCompletion, result);
  releaseVDOPageCompletion(completion);

  recovery->incompleteDecrefCount--;
  if (recovery->incompleteDecrefCount == 0) {
    completeCompletion(&recovery->subTaskCompletion);
  }
}
/**
 * The waiter callback to requeue a missing decref on the first missing
 * decref queue and, if its penultimate mapping is not yet known, launch the
 * fetch of the block map page which holds its slot.
 *
 * Implements WaiterCallback.
 *
 * @param waiter   The waiter of the MissingDecref
 * @param context  The BlockMapZone whose page cache is to be used
 **/
static void launchFetch(Waiter *waiter, void *context)
{
  MissingDecref *decref = asMissingDecref(waiter);
  int result
    = enqueueMissingDecref(&decref->recovery->missingDecrefs[0], decref);

  // On an enqueue failure the decref has been freed and must not be read;
  // the short-circuit below guarantees that. Otherwise, a decref whose
  // mapping has already been found needs no fetch.
  if ((result != VDO_SUCCESS) || decref->complete) {
    return;
  }

  BlockMapZone *zone = context;
  initVDOPageCompletion(&decref->pageCompletion, zone->pageCache,
                        decref->slot.pbn, false, decref, processFetchedPage,
                        handleFetchError);
  getVDOPageAsync(&decref->pageCompletion.completion);
}
/**
 * Find all entries which need to be replayed into the slab journals: locate
 * the missing decrefs in the journal, then fetch block map pages for any
 * whose penultimate mapping could not be determined from the journal alone.
 * The sub-task completes (continuing in applyToDepot()) once no incomplete
 * decrefs remain.
 *
 * @param completion  The sub-task completion
 **/
static void findSlabJournalEntries(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
  VDO                *vdo      = recovery->vdo;

  // We need to be on logical zone 0's thread since we are going to use its
  // page cache.
  assertOnLogicalZoneThread(vdo, 0, __func__);

  int result = findMissingDecrefs(recovery);
  if (abortRecoveryOnError(result, recovery)) {
    return;
  }

  prepareSubTask(recovery, applyToDepot, finishParentCallback,
                 ZONE_TYPE_ADMIN);

  /*
   * Hold an extra count on incompleteDecrefCount while the queue of missing
   * decrefs is being processed, so that a fetch callback can't complete the
   * sub-task out from under us.
   */
  bool fetchesNeeded = (recovery->incompleteDecrefCount > 0);
  recovery->incompleteDecrefCount++;
  if (fetchesNeeded) {
    // Fetch block map pages to fill in the incomplete missing decrefs.
    notifyAllWaiters(&recovery->missingDecrefs[0], launchFetch,
                     getBlockMapZone(getBlockMap(vdo), 0));
  }

  // Drop the extra count; if nothing is outstanding, we are done.
  recovery->incompleteDecrefCount--;
  if (recovery->incompleteDecrefCount == 0) {
    completeCompletion(completion);
  }
}
/**
 * Find the contiguous range of valid journal blocks, starting at the earlier
 * of the block map head and the slab journal head, and set the recovery's
 * tail and tail recovery point to the end of that range.
 *
 * @param recovery  The recovery completion
 *
 * @return true if there were valid journal blocks
 **/
static bool findContiguousRange(RecoveryCompletion *recovery)
{
  RecoveryJournal *journal = recovery->vdo->recoveryJournal;
  SequenceNumber   head
    = minSequenceNumber(recovery->blockMapHead, recovery->slabJournalHead);

  bool foundEntries = false;
  for (SequenceNumber sequence = head;
       sequence <= recovery->highestTail;
       sequence++) {
    // Tentatively treat this block as the tail, with no valid sectors yet.
    recovery->tail = sequence;
    recovery->tailRecoveryPoint = (RecoveryPoint) {
      .sequenceNumber = sequence,
      .sectorCount    = 0,
      .entryCount     = 0,
    };

    PackedJournalHeader *packedHeader
      = getJournalBlockHeader(journal, recovery->journalData, sequence);
    RecoveryBlockHeader header;
    unpackRecoveryBlockHeader(packedHeader, &header);

    bool headerUsable
      = (isExactRecoveryJournalBlock(journal, &header, sequence)
         && !(header.entryCount > journal->entriesPerBlock));
    if (!headerUsable) {
      // A bad block header was found so this must be the end of the journal.
      break;
    }

    // The number of the header's entries not yet accounted for by a sector.
    JournalEntryCount unaccounted = header.entryCount;

    // Examine each sector in turn to determine the last valid sector.
    for (uint8_t sectorNumber = 1;
         sectorNumber < SECTORS_PER_BLOCK;
         sectorNumber++) {
      PackedJournalSector *sector
        = getJournalBlockSector(packedHeader, sectorNumber);

      // A bad sector means that this block was torn.
      if (!isValidRecoveryJournalSector(&header, sector)) {
        break;
      }

      JournalEntryCount sectorEntries = minBlock(sector->entryCount,
                                                 unaccounted);
      if (sectorEntries > 0) {
        foundEntries = true;
        recovery->tailRecoveryPoint.sectorCount++;
        recovery->tailRecoveryPoint.entryCount = sectorEntries;
        unaccounted -= sectorEntries;
      }

      // If this sector is short, the later sectors can't matter.
      if ((sectorEntries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
          || (unaccounted == 0)) {
        break;
      }
    }

    // If this block was not filled, or if it tore, no later block can matter.
    if ((header.entryCount != journal->entriesPerBlock)
        || (unaccounted > 0)) {
      break;
    }
  }

  // Set the tail to the last valid tail block, if there is one.
  if (foundEntries && (recovery->tailRecoveryPoint.sectorCount == 0)) {
    recovery->tail--;
  }

  return foundEntries;
}
/**
 * Tally the increment entries in the journal, from the block map head up to
 * (but not including) the tail recovery point, validating each entry along
 * the way. The tally is accumulated in recovery->increfCount.
 *
 * @param recovery  The recovery completion
 *
 * @return VDO_SUCCESS, or the validation error for the first invalid entry
 *         (in which case the VDO is also put into read-only mode)
 **/
static int countIncrementEntries(RecoveryCompletion *recovery)
{
  RecoveryPoint cursor = {
    .sequenceNumber = recovery->blockMapHead,
    .sectorCount = 1,
    .entryCount = 0,
  };

  for (; beforeRecoveryPoint(&cursor, &recovery->tailRecoveryPoint);
       incrementRecoveryPoint(&cursor)) {
    RecoveryJournalEntry entry = getEntry(recovery, &cursor);
    int status = validateRecoveryJournalEntry(recovery->vdo, &entry);
    if (status != VDO_SUCCESS) {
      // An invalid entry is fatal: go read-only and report the error.
      enterReadOnlyMode(recovery->vdo->readOnlyNotifier, status);
      return status;
    }

    if (isIncrementOperation(entry.operation)) {
      recovery->increfCount++;
    }
  }

  return VDO_SUCCESS;
}
/**
 * Determine the limits of the valid recovery journal and prepare to replay
 * into the slab journals and block map.
 *
 * Depending on what is found, this either fails the recovery (the reap heads
 * are beyond the usable tail), loads the slab depot with nothing to replay,
 * launches block map recovery (when replaying), or moves on to finding the
 * slab journal entries.
 *
 * @param completion  The sub-task completion
 **/
static void prepareToApplyJournalEntries(VDOCompletion *completion)
{
  RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent);
  VDO *vdo = recovery->vdo;
  RecoveryJournal *journal = vdo->recoveryJournal;
  logInfo("Finished reading recovery journal");

  bool foundEntries = findHeadAndTail(journal, recovery->journalData,
                                      &recovery->highestTail,
                                      &recovery->blockMapHead,
                                      &recovery->slabJournalHead);
  if (foundEntries) {
    // Narrow the range down to the blocks which are actually usable.
    foundEntries = findContiguousRange(recovery);
  }

  // Both reap heads must be behind the tail.
  if ((recovery->blockMapHead > recovery->tail)
      || (recovery->slabJournalHead > recovery->tail)) {
    // All three values are sequence numbers, so they are all formatted with
    // the same PRIu64 conversion.
    int result = logErrorWithStringError(VDO_CORRUPT_JOURNAL,
                                         "Journal tail too early. "
                                         "block map head: %" PRIu64
                                         ", slab journal head: %" PRIu64
                                         ", tail: %" PRIu64,
                                         recovery->blockMapHead,
                                         recovery->slabJournalHead,
                                         recovery->tail);
    finishCompletion(&recovery->completion, result);
    return;
  }

  if (!foundEntries) {
    // This message must be recognizable by VDOTest::RebuildBase.
    logInfo("Replaying 0 recovery entries into block map");
    // We still need to load the SlabDepot.
    FREE(recovery->journalData);
    recovery->journalData = NULL;
    prepareSubTask(recovery, finishParentCallback, finishParentCallback,
                   ZONE_TYPE_ADMIN);
    loadSlabDepot(getSlabDepot(vdo), ADMIN_STATE_LOADING_FOR_RECOVERY,
                  completion, recovery);
    return;
  }

  logInfo("Highest-numbered recovery journal block has sequence number"
          " %" PRIu64 ", and the highest-numbered usable block is %"
          PRIu64, recovery->highestTail, recovery->tail);

  if (isReplaying(vdo)) {
    // We need to know how many entries the block map rebuild completion will
    // need to hold.
    int result = countIncrementEntries(recovery);
    if (result != VDO_SUCCESS) {
      finishCompletion(&recovery->completion, result);
      return;
    }

    // We need to access the block map from a logical zone.
    prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback,
                   ZONE_TYPE_LOGICAL);
    loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion,
                  recovery);
    return;
  }

  int result = computeUsages(recovery);
  if (abortRecoveryOnError(result, recovery)) {
    return;
  }

  // Continue with finding the slab journal entries on a logical zone thread.
  prepareSubTask(recovery, findSlabJournalEntries, finishParentCallback,
                 ZONE_TYPE_LOGICAL);
  invokeCallback(completion);
}
/**********************************************************************/
void launchRecovery(VDO *vdo, VDOCompletion *parent)
{
  // Note: This message must be recognizable by Permabit::VDODeviceBase.
  logWarning("Device was dirty, rebuilding reference counts");

  RecoveryCompletion *recovery;
  int status = makeRecoveryCompletion(vdo, &recovery);
  if (status != VDO_SUCCESS) {
    // Without a recovery completion nothing can proceed; fail the parent.
    finishCompletion(parent, status);
    return;
  }

  // The recovery completion reports back to the parent on the parent's own
  // callback thread.
  prepareCompletion(&recovery->completion, finishRecovery, abortRecovery,
                    parent->callbackThreadID, parent);

  // The first sub-task reads the journal, then works out what to replay.
  prepareSubTask(recovery, prepareToApplyJournalEntries, finishParentCallback,
                 ZONE_TYPE_ADMIN);
  loadJournalAsync(vdo->recoveryJournal, &recovery->subTaskCompletion,
                   &recovery->journalData);
}