/*
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.c#7 $
 */

#include "blockMapRecovery.h"

#include "logger.h"
#include "memoryAlloc.h"

#include "blockMapInternals.h"
#include "blockMapPage.h"
#include "heap.h"
#include "numUtils.h"
#include "refCounts.h"
#include "slabDepot.h"
#include "types.h"
#include "vdoInternal.h"
#include "vdoPageCache.h"

/**
 * A completion to manage recovering the block map from the recovery journal.
 * Note that the page completions kept in this structure are not immediately
 * freed, so the corresponding pages will be locked down in the page cache
 * until the recovery frees them.
 *
 * Journal entries are consumed from the end of the journalEntries array
 * toward its start: currentEntry begins at the last element and is only ever
 * decremented, and "done" is represented by a pointer that has moved below
 * journalEntries.
 **/
typedef struct {
  /**
   * completion header. This must remain the first member:
   * asBlockMapRecoveryCompletion() statically asserts that its offset is zero
   * and then casts a VDOCompletion pointer to this structure.
   **/
  VDOCompletion         completion;
  /**
   * the completion for flushing the block map; finishIfDone() launches it
   * with flushBlockMap() on adminThread
   **/
  VDOCompletion         subTaskCompletion;
  /** the thread from which the block map may be flushed */
  ThreadID              adminThread;
  /** the thread on which all block map operations must be done */
  ThreadID              logicalThreadID;
  /** the block map */
  BlockMap             *blockMap;
  /** whether this recovery has been aborted (set by abortRecovery()) */
  bool                  aborted;
  /**
   * whether we are currently launching the initial round of requests; while
   * this is set, pageLoaded() does not process pages and finishIfDone()
   * reports that the recovery is not done
   **/
  bool                  launching;

  // Fields for the journal entries.
  /**
   * the journal entries to apply; the array is supplied by the caller of
   * recoverBlockMap() and is not freed by this file
   **/
  NumberedBlockMapping *journalEntries;
  /**
   * a heap wrapping journalEntries. It re-orders and sorts journal entries in
   * ascending LBN order, then original journal order. This permits efficient
   * iteration over the journal entries in order.
   **/
  Heap                  replayHeap;

  // Fields tracking progress through the journal entries.
  /** a pointer to the next journal entry to apply */
  NumberedBlockMapping *currentEntry;
  /** the next entry for which the block map page has not been requested */
  NumberedBlockMapping *currentUnfetchedEntry;

  // Fields tracking requested pages.
  /** the absolute PBN of the current page being processed */
  PhysicalBlockNumber   pbn;
  /**
   * number of pending (non-ready) requests; incremented by fetchPage() and
   * decremented by pageLoaded() and handlePageLoadError()
   **/
  PageCount             outstanding;
  /** number of page completions */
  PageCount             pageCount;
  /**
   * array of requested, potentially ready page completions; pageCount
   * elements are requested from ALLOCATE_EXTENDED in makeRecoveryCompletion()
   **/
  VDOPageCompletion     pageCompletions[];
} BlockMapRecoveryCompletion;

/**
 * This is a HeapComparator function that orders NumberedBlockMappings using
 * the 'blockMapSlot' field as the primary key and the mapping 'number' field
 * as the secondary key. Using the mapping number preserves the journal order
 * of entries for the same slot, allowing us to sort by slot while still
 * ensuring we replay all entries with the same slot in the exact order as they
 * appeared in the journal.
 *
 *

The comparator order is reversed from the usual sense since Heap is a * max-heap, returning larger elements before smaller ones, but we want to pop * entries off the heap in ascending LBN order. **/ static int compareMappings(const void *item1, const void *item2) { const NumberedBlockMapping *mapping1 = (const NumberedBlockMapping *) item1; const NumberedBlockMapping *mapping2 = (const NumberedBlockMapping *) item2; if (mapping1->blockMapSlot.pbn != mapping2->blockMapSlot.pbn) { return ((mapping1->blockMapSlot.pbn < mapping2->blockMapSlot.pbn) ? 1 : -1); } if (mapping1->blockMapSlot.slot != mapping2->blockMapSlot.slot) { return ((mapping1->blockMapSlot.slot < mapping2->blockMapSlot.slot) ? 1 : -1); } if (mapping1->number != mapping2->number) { return ((mapping1->number < mapping2->number) ? 1 : -1); } return 0; } /** * Swap two NumberedBlockMapping structures. Implements HeapSwapper. **/ static void swapMappings(void *item1, void *item2) { NumberedBlockMapping *mapping1 = item1; NumberedBlockMapping *mapping2 = item2; NumberedBlockMapping temp = *mapping1; *mapping1 = *mapping2; *mapping2 = temp; } /** * Convert a VDOCompletion to a BlockMapRecoveryCompletion. * * @param completion The completion to convert * * @return The completion as a BlockMapRecoveryCompletion **/ __attribute__((warn_unused_result)) static inline BlockMapRecoveryCompletion * asBlockMapRecoveryCompletion(VDOCompletion *completion) { STATIC_ASSERT(offsetof(BlockMapRecoveryCompletion, completion) == 0); assertCompletionType(completion->type, BLOCK_MAP_RECOVERY_COMPLETION); return (BlockMapRecoveryCompletion *) completion; } /** * Free a BlockMapRecoveryCompletion and null out the reference to it. 
* * @param completionPtr a pointer to the completion to free **/ static void freeRecoveryCompletion(VDOCompletion **completionPtr) { VDOCompletion *completion = *completionPtr; if (completion == NULL) { return; } BlockMapRecoveryCompletion *recovery = asBlockMapRecoveryCompletion(*completionPtr); destroyEnqueueable(completion); destroyEnqueueable(&recovery->subTaskCompletion); FREE(recovery); *completionPtr = NULL; } /** * Free the BlockMapRecoveryCompletion and notify the parent that the block map * recovery is done. This callback is registered in makeRecoveryCompletion(). * * @param completion The BlockMapRecoveryCompletion **/ static void finishBlockMapRecovery(VDOCompletion *completion) { int result = completion->result; VDOCompletion *parent = completion->parent; freeRecoveryCompletion(&completion); finishCompletion(parent, result); } /** * Make a new block map recovery completion. * * @param [in] vdo The VDO * @param [in] entryCount The number of journal entries * @param [in] journalEntries An array of journal entries to process * @param [in] parent The parent of the recovery completion * @param [out] recoveryPtr The new block map recovery completion * * @return a success or error code **/ static int makeRecoveryCompletion(VDO *vdo, BlockCount entryCount, NumberedBlockMapping *journalEntries, VDOCompletion *parent, BlockMapRecoveryCompletion **recoveryPtr) { BlockMap *blockMap = getBlockMap(vdo); PageCount pageCount = minPageCount(getConfiguredCacheSize(vdo) >> 1, MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); BlockMapRecoveryCompletion *recovery; int result = ALLOCATE_EXTENDED(BlockMapRecoveryCompletion, pageCount, VDOPageCompletion, __func__, &recovery); if (result != UDS_SUCCESS) { return result; } result = initializeEnqueueableCompletion(&recovery->completion, BLOCK_MAP_RECOVERY_COMPLETION, vdo->layer); if (result != VDO_SUCCESS) { VDOCompletion *completion = &recovery->completion; freeRecoveryCompletion(&completion); return result; } result = 
initializeEnqueueableCompletion(&recovery->subTaskCompletion, SUB_TASK_COMPLETION, vdo->layer); if (result != VDO_SUCCESS) { VDOCompletion *completion = &recovery->completion; freeRecoveryCompletion(&completion); return result; } recovery->blockMap = blockMap; recovery->journalEntries = journalEntries; recovery->pageCount = pageCount; recovery->currentEntry = &recovery->journalEntries[entryCount - 1]; const ThreadConfig *threadConfig = getThreadConfig(vdo); recovery->adminThread = getAdminThread(threadConfig); recovery->logicalThreadID = getLogicalZoneThread(threadConfig, 0); // Organize the journal entries into a binary heap so we can iterate over // them in sorted order incrementally, avoiding an expensive sort call. initializeHeap(&recovery->replayHeap, compareMappings, swapMappings, journalEntries, entryCount, sizeof(NumberedBlockMapping)); buildHeap(&recovery->replayHeap, entryCount); ASSERT_LOG_ONLY((getCallbackThreadID() == recovery->logicalThreadID), "%s must be called on logical thread %u (not %u)", __func__, recovery->logicalThreadID, getCallbackThreadID()); prepareCompletion(&recovery->completion, finishBlockMapRecovery, finishBlockMapRecovery, recovery->logicalThreadID, parent); // This message must be recognizable by VDOTest::RebuildBase. logInfo("Replaying %zu recovery entries into block map", recovery->replayHeap.count); *recoveryPtr = recovery; return VDO_SUCCESS; } /**********************************************************************/ static void flushBlockMap(VDOCompletion *completion) { logInfo("Flushing block map changes"); BlockMapRecoveryCompletion *recovery = asBlockMapRecoveryCompletion(completion->parent); ASSERT_LOG_ONLY((completion->callbackThreadID == recovery->adminThread), "flushBlockMap() called on admin thread"); prepareToFinishParent(completion, completion->parent); drainBlockMap(recovery->blockMap, ADMIN_STATE_RECOVERING, completion); } /** * Check whether the recovery is done. 
If so, finish it by either flushing the * block map (if the recovery was successful), or by cleaning up (if it * wasn't). * * @param recovery The recovery completion * * @return true if the recovery or recovery is complete **/ static bool finishIfDone(BlockMapRecoveryCompletion *recovery) { // Pages are still being launched or there is still work to do if (recovery->launching || (recovery->outstanding > 0) || (!recovery->aborted && (recovery->currentEntry >= recovery->journalEntries))) { return false; } if (recovery->aborted) { /* * We need to be careful here to only free completions that exist. But * since we know none are outstanding, we just go through the ready ones. */ for (size_t i = 0; i < recovery->pageCount; i++) { VDOPageCompletion *pageCompletion = &recovery->pageCompletions[i]; if (recovery->pageCompletions[i].ready) { releaseVDOPageCompletion(&pageCompletion->completion); } } completeCompletion(&recovery->completion); } else { launchCallbackWithParent(&recovery->subTaskCompletion, flushBlockMap, recovery->adminThread, &recovery->completion); } return true; } /** * Note that there has been an error during the recovery and finish it if there * is nothing else outstanding. * * @param recovery The BlockMapRecoveryCompletion * @param result The error result to use, if one is not already saved **/ static void abortRecovery(BlockMapRecoveryCompletion *recovery, int result) { recovery->aborted = true; setCompletionResult(&recovery->completion, result); finishIfDone(recovery); } /** * Find the first journal entry after a given entry which is not on the same * block map page. * * @param recovery the BlockMapRecoveryCompletion * @param currentEntry the entry to search from * @param needsSort Whether sorting is needed to proceed * * @return Pointer to the first later journal entry on a different block map * page, or a pointer to just before the journal entries if no * subsequent entry is on a different block map page. 
**/ static NumberedBlockMapping * findEntryStartingNextPage(BlockMapRecoveryCompletion *recovery, NumberedBlockMapping *currentEntry, bool needsSort) { // If currentEntry is invalid, return immediately. if (currentEntry < recovery->journalEntries) { return currentEntry; } size_t currentPage = currentEntry->blockMapSlot.pbn; // Decrement currentEntry until it's out of bounds or on a different page. while ((currentEntry >= recovery->journalEntries) && (currentEntry->blockMapSlot.pbn == currentPage)) { if (needsSort) { NumberedBlockMapping *justSortedEntry = sortNextHeapElement(&recovery->replayHeap); ASSERT_LOG_ONLY(justSortedEntry < currentEntry, "heap is returning elements in an unexpected order"); } currentEntry--; } return currentEntry; } /** * Apply a range of journal entries to a block map page. * * @param page The block map page being modified * @param startingEntry The first journal entry to apply * @param endingEntry The entry just past the last journal entry to apply **/ static void applyJournalEntriesToPage(BlockMapPage *page, NumberedBlockMapping *startingEntry, NumberedBlockMapping *endingEntry) { NumberedBlockMapping *currentEntry = startingEntry; while (currentEntry != endingEntry) { page->entries[currentEntry->blockMapSlot.slot] = currentEntry->blockMapEntry; currentEntry--; } } /**********************************************************************/ static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, VDOCompletion *completion); /** * Note that a page is now ready and attempt to process pages. This callback is * registered in fetchPage(). * * @param completion The VDOPageCompletion for the fetched page **/ static void pageLoaded(VDOCompletion *completion) { BlockMapRecoveryCompletion *recovery = asBlockMapRecoveryCompletion(completion->parent); recovery->outstanding--; if (!recovery->launching) { recoverReadyPages(recovery, completion); } } /** * Handle an error loading a page. 
* * @param completion The VDOPageCompletion **/ static void handlePageLoadError(VDOCompletion *completion) { BlockMapRecoveryCompletion *recovery = asBlockMapRecoveryCompletion(completion->parent); recovery->outstanding--; abortRecovery(recovery, completion->result); } /** * Fetch a page from the block map. * * @param recovery the BlockMapRecoveryCompletion * @param completion the page completion to use **/ static void fetchPage(BlockMapRecoveryCompletion *recovery, VDOCompletion *completion) { if (recovery->currentUnfetchedEntry < recovery->journalEntries) { // Nothing left to fetch. return; } // Fetch the next page we haven't yet requested. PhysicalBlockNumber newPBN = recovery->currentUnfetchedEntry->blockMapSlot.pbn; recovery->currentUnfetchedEntry = findEntryStartingNextPage(recovery, recovery->currentUnfetchedEntry, true); initVDOPageCompletion(((VDOPageCompletion *) completion), recovery->blockMap->zones[0].pageCache, newPBN, true, &recovery->completion, pageLoaded, handlePageLoadError); recovery->outstanding++; getVDOPageAsync(completion); } /** * Get the next page completion to process. If it isn't ready, we'll try again * when it is. * * @param recovery The recovery completion * @param completion The current page completion * * @return The next page completion to process **/ static VDOPageCompletion * getNextPageCompletion(BlockMapRecoveryCompletion *recovery, VDOPageCompletion *completion) { completion++; if (completion == (&recovery->pageCompletions[recovery->pageCount])) { completion = &recovery->pageCompletions[0]; } return completion; } /** * Recover from as many pages as possible. 
* * @param recovery The recovery completion * @param completion The first page completion to process **/ static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, VDOCompletion *completion) { if (finishIfDone(recovery)) { return; } VDOPageCompletion *pageCompletion = (VDOPageCompletion *) completion; if (recovery->pbn != pageCompletion->pbn) { return; } while (pageCompletion->ready) { BlockMapPage *page = dereferenceWritableVDOPage(completion); int result = ASSERT(page != NULL, "page available"); if (result != VDO_SUCCESS) { abortRecovery(recovery, result); return; } NumberedBlockMapping *startOfNextPage = findEntryStartingNextPage(recovery, recovery->currentEntry, false); applyJournalEntriesToPage(page, recovery->currentEntry, startOfNextPage); recovery->currentEntry = startOfNextPage; requestVDOPageWrite(completion); releaseVDOPageCompletion(completion); if (finishIfDone(recovery)) { return; } recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; fetchPage(recovery, completion); pageCompletion = getNextPageCompletion(recovery, pageCompletion); completion = &pageCompletion->completion; } } /**********************************************************************/ void recoverBlockMap(VDO *vdo, BlockCount entryCount, NumberedBlockMapping *journalEntries, VDOCompletion *parent) { BlockMapRecoveryCompletion *recovery; int result = makeRecoveryCompletion(vdo, entryCount, journalEntries, parent, &recovery); if (result != VDO_SUCCESS) { finishCompletion(parent, result); return; } if (isHeapEmpty(&recovery->replayHeap)) { finishCompletion(&recovery->completion, VDO_SUCCESS); return; } NumberedBlockMapping *firstSortedEntry = sortNextHeapElement(&recovery->replayHeap); ASSERT_LOG_ONLY(firstSortedEntry == recovery->currentEntry, "heap is returning elements in an unexpected order"); // Prevent any page from being processed until all pages have been launched. 
recovery->launching = true; recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; recovery->currentUnfetchedEntry = recovery->currentEntry; for (PageCount i = 0; i < recovery->pageCount; i++) { if (recovery->currentUnfetchedEntry < recovery->journalEntries) { break; } fetchPage(recovery, &recovery->pageCompletions[i].completion); } recovery->launching = false; // Process any ready pages. recoverReadyPages(recovery, &recovery->pageCompletions[0].completion); }