Blob Blame History Raw
/*
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA. 
 *
 * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.c#7 $
 */

#include "blockMapRecovery.h"

#include "logger.h"
#include "memoryAlloc.h"

#include "blockMapInternals.h"
#include "blockMapPage.h"
#include "heap.h"
#include "numUtils.h"
#include "refCounts.h"
#include "slabDepot.h"
#include "types.h"
#include "vdoInternal.h"
#include "vdoPageCache.h"

/**
 * A completion to manage recovering the block map from the recovery journal.
 * Note that the page completions kept in this structure are not immediately
 * freed, so the corresponding pages will be locked down in the page cache
 * until the recovery frees them.
 **/
typedef struct {
  /** completion header */
  VDOCompletion         completion;
  /** the completion for flushing the block map */
  VDOCompletion         subTaskCompletion;
  /** the thread from which the block map may be flushed */
  ThreadID              adminThread;
  /** the thread on which all block map operations must be done */
  ThreadID              logicalThreadID;
  /** the block map */
  BlockMap             *blockMap;
  /** whether this recovery has been aborted */
  bool                  aborted;
  /** whether we are currently launching the initial round of requests */
  bool                  launching;

  // Fields for the journal entries.
  /** the journal entries to apply */
  NumberedBlockMapping *journalEntries;
  /**
   * a heap wrapping journalEntries. It re-orders and sorts journal entries in
   * ascending LBN order, then original journal order. This permits efficient
   * iteration over the journal entries in order.
   **/
  Heap                  replayHeap;

  // Fields tracking progress through the journal entries.
  /** a pointer to the next journal entry to apply */
  NumberedBlockMapping *currentEntry;
  /** the next entry for which the block map page has not been requested */
  NumberedBlockMapping *currentUnfetchedEntry;

  // Fields tracking requested pages.
  /** the absolute PBN of the current page being processed */
  PhysicalBlockNumber   pbn;
  /** number of pending (non-ready) requests */
  PageCount             outstanding;
  /** number of page completions */
  PageCount             pageCount;
  /** array of requested, potentially ready page completions */
  VDOPageCompletion     pageCompletions[];
} BlockMapRecoveryCompletion;

/**
 * This is a HeapComparator function that orders NumberedBlockMappings using
 * the 'blockMapSlot' field as the primary key and the mapping 'number' field
 * as the secondary key. Using the mapping number preserves the journal order
 * of entries for the same slot, allowing us to sort by slot while still
 * ensuring we replay all entries with the same slot in the exact order as they
 * appeared in the journal.
 *
 * <p>The comparator order is reversed from the usual sense since Heap is a
 * max-heap, returning larger elements before smaller ones, but we want to pop
 * entries off the heap in ascending LBN order.
 **/
static int compareMappings(const void *item1, const void *item2)
{
  const NumberedBlockMapping *mapping1 = (const NumberedBlockMapping *) item1;
  const NumberedBlockMapping *mapping2 = (const NumberedBlockMapping *) item2;

  if (mapping1->blockMapSlot.pbn != mapping2->blockMapSlot.pbn) {
    return
      ((mapping1->blockMapSlot.pbn < mapping2->blockMapSlot.pbn) ? 1 : -1);
  }

  if (mapping1->blockMapSlot.slot != mapping2->blockMapSlot.slot) {
    return
      ((mapping1->blockMapSlot.slot < mapping2->blockMapSlot.slot) ? 1 : -1);
  }

  if (mapping1->number != mapping2->number) {
    return ((mapping1->number < mapping2->number) ? 1 : -1);
  }

  return 0;
}

/**
 * Swap two NumberedBlockMapping structures. Implements HeapSwapper.
 **/
static void swapMappings(void *item1, void *item2)
{
  NumberedBlockMapping *mapping1 = item1;
  NumberedBlockMapping *mapping2 = item2;
  NumberedBlockMapping temp = *mapping1;
  *mapping1 = *mapping2;
  *mapping2 = temp;
}

/**
 * Convert a VDOCompletion to a BlockMapRecoveryCompletion.
 *
 * @param completion  The completion to convert
 *
 * @return The completion as a BlockMapRecoveryCompletion
 **/
__attribute__((warn_unused_result))
static inline BlockMapRecoveryCompletion *
asBlockMapRecoveryCompletion(VDOCompletion *completion)
{
  STATIC_ASSERT(offsetof(BlockMapRecoveryCompletion, completion) == 0);
  assertCompletionType(completion->type, BLOCK_MAP_RECOVERY_COMPLETION);
  return (BlockMapRecoveryCompletion *) completion;
}

/**
 * Free a BlockMapRecoveryCompletion and null out the reference to it.
 *
 * @param completionPtr  a pointer to the completion to free
 **/
static void freeRecoveryCompletion(VDOCompletion **completionPtr)
{
  VDOCompletion *completion = *completionPtr;
  if (completion == NULL) {
    return;
  }

  BlockMapRecoveryCompletion *recovery
    = asBlockMapRecoveryCompletion(*completionPtr);
  destroyEnqueueable(completion);
  destroyEnqueueable(&recovery->subTaskCompletion);
  FREE(recovery);
  *completionPtr = NULL;
}

/**
 * Free the BlockMapRecoveryCompletion and notify the parent that the block map
 * recovery is done. This callback is registered in makeRecoveryCompletion().
 *
 * @param completion  The BlockMapRecoveryCompletion
 **/
static void finishBlockMapRecovery(VDOCompletion *completion)
{
  int            result = completion->result;
  VDOCompletion *parent = completion->parent;
  freeRecoveryCompletion(&completion);
  finishCompletion(parent, result);
}

/**
 * Make a new block map recovery completion.
 *
 * @param [in]  vdo             The VDO
 * @param [in]  entryCount      The number of journal entries
 * @param [in]  journalEntries  An array of journal entries to process
 * @param [in]  parent          The parent of the recovery completion
 * @param [out] recoveryPtr     The new block map recovery completion
 *
 * @return a success or error code
 **/
static int makeRecoveryCompletion(VDO                         *vdo,
                                  BlockCount                   entryCount,
                                  NumberedBlockMapping        *journalEntries,
                                  VDOCompletion               *parent,
                                  BlockMapRecoveryCompletion **recoveryPtr)
{
  BlockMap *blockMap = getBlockMap(vdo);
  PageCount pageCount
    = minPageCount(getConfiguredCacheSize(vdo) >> 1,
                   MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS);

  BlockMapRecoveryCompletion *recovery;
  int result = ALLOCATE_EXTENDED(BlockMapRecoveryCompletion, pageCount,
                                 VDOPageCompletion, __func__, &recovery);
  if (result != UDS_SUCCESS) {
    return result;
  }

  result = initializeEnqueueableCompletion(&recovery->completion,
                                           BLOCK_MAP_RECOVERY_COMPLETION,
                                           vdo->layer);
  if (result != VDO_SUCCESS) {
    VDOCompletion *completion = &recovery->completion;
    freeRecoveryCompletion(&completion);
    return result;
  }

  result = initializeEnqueueableCompletion(&recovery->subTaskCompletion,
                                           SUB_TASK_COMPLETION, vdo->layer);
  if (result != VDO_SUCCESS) {
    VDOCompletion *completion = &recovery->completion;
    freeRecoveryCompletion(&completion);
    return result;
  }

  recovery->blockMap       = blockMap;
  recovery->journalEntries = journalEntries;
  recovery->pageCount      = pageCount;
  recovery->currentEntry   = &recovery->journalEntries[entryCount - 1];

  const ThreadConfig *threadConfig = getThreadConfig(vdo);
  recovery->adminThread     = getAdminThread(threadConfig);
  recovery->logicalThreadID = getLogicalZoneThread(threadConfig, 0);

  // Organize the journal entries into a binary heap so we can iterate over
  // them in sorted order incrementally, avoiding an expensive sort call.
  initializeHeap(&recovery->replayHeap, compareMappings, swapMappings,
                 journalEntries, entryCount, sizeof(NumberedBlockMapping));
  buildHeap(&recovery->replayHeap, entryCount);

  ASSERT_LOG_ONLY((getCallbackThreadID() == recovery->logicalThreadID),
                  "%s must be called on logical thread %u (not %u)", __func__,
                  recovery->logicalThreadID, getCallbackThreadID());
  prepareCompletion(&recovery->completion, finishBlockMapRecovery,
                    finishBlockMapRecovery, recovery->logicalThreadID, parent);

  // This message must be recognizable by VDOTest::RebuildBase.
  logInfo("Replaying %zu recovery entries into block map",
          recovery->replayHeap.count);

  *recoveryPtr = recovery;
  return VDO_SUCCESS;
}

/**********************************************************************/
static void flushBlockMap(VDOCompletion *completion)
{
  logInfo("Flushing block map changes");
  BlockMapRecoveryCompletion *recovery
    = asBlockMapRecoveryCompletion(completion->parent);
  ASSERT_LOG_ONLY((completion->callbackThreadID == recovery->adminThread),
                  "flushBlockMap() called on admin thread");

  prepareToFinishParent(completion, completion->parent);
  drainBlockMap(recovery->blockMap, ADMIN_STATE_RECOVERING, completion);
}

/**
 * Check whether the recovery is done. If so, finish it by either flushing the
 * block map (if the recovery was successful), or by cleaning up (if it
 * wasn't).
 *
 * @param recovery  The recovery completion
 *
 * @return <code>true</code> if the recovery or recovery is complete
 **/
static bool finishIfDone(BlockMapRecoveryCompletion *recovery)
{
  // Pages are still being launched or there is still work to do
  if (recovery->launching || (recovery->outstanding > 0)
      || (!recovery->aborted
          && (recovery->currentEntry >= recovery->journalEntries))) {
    return false;
  }

  if (recovery->aborted) {
    /*
     * We need to be careful here to only free completions that exist. But
     * since we know none are outstanding, we just go through the ready ones.
     */
    for (size_t i = 0; i < recovery->pageCount; i++) {
      VDOPageCompletion *pageCompletion = &recovery->pageCompletions[i];
      if (recovery->pageCompletions[i].ready) {
        releaseVDOPageCompletion(&pageCompletion->completion);
      }
    }
    completeCompletion(&recovery->completion);
  } else {
    launchCallbackWithParent(&recovery->subTaskCompletion, flushBlockMap,
                             recovery->adminThread, &recovery->completion);
  }

  return true;
}

/**
 * Note that there has been an error during the recovery and finish it if there
 * is nothing else outstanding.
 *
 * @param recovery  The BlockMapRecoveryCompletion
 * @param result    The error result to use, if one is not already saved
 **/
static void abortRecovery(BlockMapRecoveryCompletion *recovery, int result)
{
  recovery->aborted = true;
  setCompletionResult(&recovery->completion, result);
  finishIfDone(recovery);
}

/**
 * Find the first journal entry after a given entry which is not on the same
 * block map page.
 *
 * @param recovery      the BlockMapRecoveryCompletion
 * @param currentEntry  the entry to search from
 * @param needsSort     Whether sorting is needed to proceed
 *
 * @return Pointer to the first later journal entry on a different block map
 *         page, or a pointer to just before the journal entries if no
 *         subsequent entry is on a different block map page.
 **/
static NumberedBlockMapping *
findEntryStartingNextPage(BlockMapRecoveryCompletion *recovery,
                          NumberedBlockMapping       *currentEntry,
                          bool                        needsSort)
{
  // If currentEntry is invalid, return immediately.
  if (currentEntry < recovery->journalEntries) {
    return currentEntry;
  }
  size_t currentPage = currentEntry->blockMapSlot.pbn;

  // Decrement currentEntry until it's out of bounds or on a different page.
  while ((currentEntry >= recovery->journalEntries)
         && (currentEntry->blockMapSlot.pbn == currentPage)) {
    if (needsSort) {
      NumberedBlockMapping *justSortedEntry
        = sortNextHeapElement(&recovery->replayHeap);
      ASSERT_LOG_ONLY(justSortedEntry < currentEntry,
                      "heap is returning elements in an unexpected order");
    }
    currentEntry--;
  }
  return currentEntry;
}

/**
 * Apply a range of journal entries to a block map page.
 *
 * @param page           The block map page being modified
 * @param startingEntry  The first journal entry to apply
 * @param endingEntry    The entry just past the last journal entry to apply
 **/
static void applyJournalEntriesToPage(BlockMapPage         *page,
                                      NumberedBlockMapping *startingEntry,
                                      NumberedBlockMapping *endingEntry)
{
  NumberedBlockMapping *currentEntry  = startingEntry;
  while (currentEntry != endingEntry) {
    page->entries[currentEntry->blockMapSlot.slot]
      = currentEntry->blockMapEntry;
    currentEntry--;
  }
}

/**********************************************************************/
static void recoverReadyPages(BlockMapRecoveryCompletion *recovery,
                              VDOCompletion              *completion);

/**
 * Note that a page is now ready and attempt to process pages. This callback is
 * registered in fetchPage().
 *
 * @param completion  The VDOPageCompletion for the fetched page
 **/
static void pageLoaded(VDOCompletion *completion)
{
  BlockMapRecoveryCompletion *recovery
    = asBlockMapRecoveryCompletion(completion->parent);
  recovery->outstanding--;
  if (!recovery->launching) {
    recoverReadyPages(recovery, completion);
  }
}

/**
 * Handle an error loading a page.
 *
 * @param completion  The VDOPageCompletion
 **/
static void handlePageLoadError(VDOCompletion *completion)
{
  BlockMapRecoveryCompletion *recovery
    = asBlockMapRecoveryCompletion(completion->parent);
  recovery->outstanding--;
  abortRecovery(recovery, completion->result);
}

/**
 * Fetch a page from the block map.
 *
 * @param recovery    the BlockMapRecoveryCompletion
 * @param completion  the page completion to use
 **/
static void fetchPage(BlockMapRecoveryCompletion *recovery,
                      VDOCompletion              *completion)
{
  if (recovery->currentUnfetchedEntry < recovery->journalEntries) {
    // Nothing left to fetch.
    return;
  }

  // Fetch the next page we haven't yet requested.
  PhysicalBlockNumber newPBN
    = recovery->currentUnfetchedEntry->blockMapSlot.pbn;
  recovery->currentUnfetchedEntry
    = findEntryStartingNextPage(recovery, recovery->currentUnfetchedEntry,
                                true);
  initVDOPageCompletion(((VDOPageCompletion *) completion),
                        recovery->blockMap->zones[0].pageCache,
                        newPBN, true, &recovery->completion,
                        pageLoaded, handlePageLoadError);
  recovery->outstanding++;
  getVDOPageAsync(completion);
}

/**
 * Get the next page completion to process. If it isn't ready, we'll try again
 * when it is.
 *
 * @param recovery    The recovery completion
 * @param completion  The current page completion
 *
 * @return The next page completion to process
 **/
static VDOPageCompletion *
getNextPageCompletion(BlockMapRecoveryCompletion *recovery,
                      VDOPageCompletion          *completion)
{
  completion++;
  if (completion == (&recovery->pageCompletions[recovery->pageCount])) {
    completion = &recovery->pageCompletions[0];
  }
  return completion;
}

/**
 * Recover from as many pages as possible.
 *
 * @param recovery    The recovery completion
 * @param completion  The first page completion to process
 **/
static void recoverReadyPages(BlockMapRecoveryCompletion *recovery,
                              VDOCompletion              *completion)
{
  if (finishIfDone(recovery)) {
    return;
  }

  VDOPageCompletion *pageCompletion = (VDOPageCompletion *) completion;
  if (recovery->pbn != pageCompletion->pbn) {
    return;
  }

  while (pageCompletion->ready) {
    BlockMapPage *page   = dereferenceWritableVDOPage(completion);
    int           result = ASSERT(page != NULL, "page available");
    if (result != VDO_SUCCESS) {
      abortRecovery(recovery, result);
      return;
    }

    NumberedBlockMapping *startOfNextPage
      = findEntryStartingNextPage(recovery, recovery->currentEntry, false);
    applyJournalEntriesToPage(page, recovery->currentEntry, startOfNextPage);
    recovery->currentEntry = startOfNextPage;
    requestVDOPageWrite(completion);
    releaseVDOPageCompletion(completion);

    if (finishIfDone(recovery)) {
      return;
    }

    recovery->pbn = recovery->currentEntry->blockMapSlot.pbn;
    fetchPage(recovery, completion);
    pageCompletion = getNextPageCompletion(recovery, pageCompletion);
    completion     = &pageCompletion->completion;
  }
}

/**********************************************************************/
void recoverBlockMap(VDO                  *vdo,
                     BlockCount            entryCount,
                     NumberedBlockMapping *journalEntries,
                     VDOCompletion        *parent)
{
  BlockMapRecoveryCompletion *recovery;
  int result = makeRecoveryCompletion(vdo, entryCount, journalEntries, parent,
                                      &recovery);
  if (result != VDO_SUCCESS) {
    finishCompletion(parent, result);
    return;
  }

  if (isHeapEmpty(&recovery->replayHeap)) {
    finishCompletion(&recovery->completion, VDO_SUCCESS);
    return;
  }

  NumberedBlockMapping *firstSortedEntry
    = sortNextHeapElement(&recovery->replayHeap);
  ASSERT_LOG_ONLY(firstSortedEntry == recovery->currentEntry,
                  "heap is returning elements in an unexpected order");

  // Prevent any page from being processed until all pages have been launched.
  recovery->launching = true;
  recovery->pbn       = recovery->currentEntry->blockMapSlot.pbn;
  recovery->currentUnfetchedEntry = recovery->currentEntry;
  for (PageCount i = 0; i < recovery->pageCount; i++) {
    if (recovery->currentUnfetchedEntry < recovery->journalEntries) {
      break;
    }

    fetchPage(recovery, &recovery->pageCompletions[i].completion);
  }
  recovery->launching = false;

  // Process any ready pages.
  recoverReadyPages(recovery, &recovery->pageCompletions[0].completion);
}