/*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*
* $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.h#5 $
*/
#ifndef RECOVERY_JOURNAL_H
#define RECOVERY_JOURNAL_H
#include "buffer.h"
#include "adminState.h"
#include "completion.h"
#include "fixedLayout.h"
#include "flush.h"
#include "readOnlyNotifier.h"
#include "statistics.h"
#include "trace.h"
#include "types.h"
/**
* The RecoveryJournal provides a log of all block mapping changes
* which have not yet been stably written to the block map. It exists
* to help provide resiliency guarantees by allowing synchronous
* writes to be acknowledged as soon as the corresponding journal
* entry is committed instead of having to wait for the block map
* update. For asynchronous writes, the journal aids in meeting the
* five second data loss window by ensuring that writes will not be
* lost as long as they are committed to the journal before the window
* expires. This should be less work than committing all of the
* required block map pages.
*
* The journal consists of a set of on-disk blocks arranged as a
* circular log with monotonically increasing sequence numbers. Three
* sequence numbers serve to define the active extent of the
* journal. The 'head' is the oldest active block in the journal. The
* 'tail' is the end of the half-open interval containing the active
* blocks. 'active' is the number of the block actively receiving
* entries. In an empty journal, head == active == tail. Once any
* entries are added, tail = active + 1, and head may be any value in
* the interval [tail - size, active].
*
* The journal also contains a set of in-memory blocks which are used
* to buffer up entries until they can be committed. In general the
* number of in-memory blocks ('tailBufferCount') will be less than
* the on-disk size. Each in-memory block is also a VDOCompletion.
* Each in-memory block has a VDOExtent which is used to commit that
* block to disk. The extent's data is a PackedJournalBlock (which is a
* formatted journal block). In addition each in-memory block has a
* buffer which is used to accumulate entries while a partial commit
* of the block is in progress. In-memory blocks are kept on two
* rings. Free blocks live on the 'freeTailBlocks' ring. When a block
* becomes active (see below) it is moved to the 'activeTailBlocks'
* ring. When a block is fully committed, it is moved back to the
* 'freeTailBlocks' ring.
*
* When entries are added to the journal, they are added to the active
* in-memory block, as indicated by the 'activeBlock' field. If the
* caller wishes to wait for the entry to be committed, the requesting
* VIO will be attached to the in-memory block to which the caller's
* entry was added. If the caller does wish to wait, or if the entry
* filled the active block, an attempt will be made to commit that
* block to disk. If there is already another commit in progress, the
* attempt will be ignored and then automatically retried when the
* in-progress commit completes. If there is no commit in progress,
* any VIOs waiting on the block are transferred to the extent. The
* extent is then written, automatically waking all of the waiters
* when it completes. When the extent completes, any entries which
* accumulated in the block are copied to the extent's data buffer.
*
* Finally, the journal maintains a set of counters, one for each
* on-disk journal block. These counters are used as locks to prevent
* premature reaping of journal blocks. Each time a new sequence
* number is used, the counter for the corresponding block is
* incremented. The counter is subsequently decremented when that
* block is filled and then committed for the last time. This prevents
* blocks from being reaped while they are still being updated. The
* counter is also incremented once for each entry added to a block,
* and decremented once each time the block map is updated in memory
* for that request. This prevents blocks from being reaped while
* their VIOs are still active. Finally, each in-memory block map page
* tracks the oldest journal block that contains entries corresponding to
* uncommitted updates to that block map page. Each time an in-memory block
* map page is updated, it checks if the journal block for the VIO
* is earlier than the one it references, in which case it increments
* the count on the earlier journal block and decrements the count on the
* later journal block, maintaining a lock on the oldest journal block
* containing entries for that page. When a block map page has been flushed
* from the cache, the counter for the journal block it references is
* decremented. Whenever the counter for the head block goes to 0, the
* head is advanced until it comes to a block whose counter is not 0
* or until it reaches the active block. This is the mechanism for
* reclaiming journal space on disk.
*
* If there is no in-memory space when a VIO attempts to add an entry,
* the VIO will be attached to the 'commitCompletion' and will be
* woken the next time a full block has committed. If there is no
* on-disk space when a VIO attempts to add an entry, the VIO will be
* attached to the 'reapCompletion', and will be woken the next time a
* journal block is reaped.
**/
/**
 * Check whether a JournalOperation is one of the increment types.
 *
 * @param operation  The operation to classify
 *
 * @return true if the operation is DATA_INCREMENT or BLOCK_MAP_INCREMENT,
 *         false for any other operation
 **/
static inline bool isIncrementOperation(JournalOperation operation)
{
  if (operation == DATA_INCREMENT) {
    return true;
  }

  return (operation == BLOCK_MAP_INCREMENT);
}
/**
* Get the name of a journal operation.
*
* @param operation The operation to name
*
* @return The name of the operation. The string is returned through a
* pointer to const and so must not be modified by the caller.
* The compiler warns if the result is ignored.
**/
const char *getJournalOperationName(JournalOperation operation)
__attribute__((warn_unused_result));
/**
* Create a recovery journal.
*
* @param [in] nonce The nonce of the VDO
* @param [in] layer The physical layer for the journal
* @param [in] partition The partition for the journal
* @param [in] recoveryCount The VDO's number of completed recoveries
* @param [in] journalSize The number of blocks in the journal on disk
* @param [in] tailBufferSize The number of blocks for the tail buffer
* @param [in] readOnlyNotifier The read-only mode notifier
* @param [in] threadConfig The thread configuration of the VDO
* @param [out] journalPtr The pointer to hold the new recovery journal
*
* @return a success or error code; the compiler warns if the result is
* ignored
**/
int makeRecoveryJournal(Nonce nonce,
PhysicalLayer *layer,
Partition *partition,
uint64_t recoveryCount,
BlockCount journalSize,
BlockCount tailBufferSize,
ReadOnlyNotifier *readOnlyNotifier,
const ThreadConfig *threadConfig,
RecoveryJournal **journalPtr)
__attribute__((warn_unused_result));
/**
* Free a recovery journal and null out the reference to it.
*
* @param [in,out] journalPtr The reference to the recovery journal to free;
* the referenced pointer is nulled out by this call
**/
void freeRecoveryJournal(RecoveryJournal **journalPtr);
/**
* Move the backing partition pointer of the recovery journal.
* Assumes that the data in the old and the new partitions is identical;
* this precondition is the caller's responsibility.
*
* @param journal The journal being moved
* @param partition The new journal partition
**/
void setRecoveryJournalPartition(RecoveryJournal *journal,
Partition *partition);
/**
* Initialize the journal after a recovery.
*
* @param journal The journal in question
* @param recoveryCount The number of completed recoveries
* @param tail The new tail block sequence number
**/
void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal,
uint64_t recoveryCount,
SequenceNumber tail);
/**
* Initialize the journal after a rebuild. In addition to the values set
* after a recovery, a rebuild also supplies new block usage counts.
*
* @param journal The journal in question
* @param recoveryCount The number of completed recoveries
* @param tail The new tail block sequence number
* @param logicalBlocksUsed The new number of logical blocks used
* @param blockMapDataBlocks The new number of block map data blocks
**/
void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal,
uint64_t recoveryCount,
SequenceNumber tail,
BlockCount logicalBlocksUsed,
BlockCount blockMapDataBlocks);
/**
* Get the number of block map pages, allocated from data blocks, currently
* in use.
*
* @param journal The journal in question
*
* @return The number of block map pages allocated from slabs; the compiler
* warns if the result is ignored
**/
BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal)
__attribute__((warn_unused_result));
/**
* Set the number of block map pages, allocated from data blocks, currently
* in use.
*
* @param journal The journal in question
* @param pages The new number of block map pages allocated from slabs
**/
void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal,
BlockCount pages);
/**
* Get the ID of a recovery journal's thread.
*
* @param journal The journal to query
*
* @return The ID of the journal's thread; the compiler warns if the result
* is ignored
**/
ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal)
__attribute__((warn_unused_result));
/**
* Prepare the journal for new entries.
*
* @param journal The journal in question
* @param depot The slab depot for this VDO
* @param blockMap The block map for this VDO
**/
void openRecoveryJournal(RecoveryJournal *journal,
SlabDepot *depot,
BlockMap *blockMap);
/**
* Obtain the recovery journal's current sequence number. Exposed only so
* the block map can be initialized therefrom.
*
* NOTE(review): every other value-returning function declared in this header
* is marked warn_unused_result, but this one is not -- confirm whether the
* omission is intentional.
*
* @param journal The journal in question
*
* @return the sequence number of the tail block
**/
SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal);
/**
* Get the number of usable recovery journal blocks. This is computed from
* a journal size alone and does not require a journal instance.
*
* @param journalSize The size of the recovery journal in blocks
*
* @return the number of recovery journal blocks usable for entries; the
* compiler warns if the result is ignored
**/
BlockCount getRecoveryJournalLength(BlockCount journalSize)
__attribute__((warn_unused_result));
/**
* Get the size of the encoded state of a recovery journal. The function
* takes no arguments, so the size does not depend on a particular journal.
*
* @return the encoded size of the journal's state; the compiler warns if
* the result is ignored
**/
size_t getRecoveryJournalEncodedSize(void)
__attribute__((warn_unused_result));
/**
* Encode the state of a recovery journal into a buffer.
*
* @param journal The recovery journal
* @param buffer The buffer to encode into
*
* @return VDO_SUCCESS or an error code; the compiler warns if the result is
* ignored
**/
int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer)
__attribute__((warn_unused_result));
/**
* Decode the state of a recovery journal saved in a buffer.
*
* @param journal The recovery journal
* @param buffer The buffer containing the saved state
*
* @return VDO_SUCCESS or an error code; the compiler warns if the result is
* ignored
**/
int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer)
__attribute__((warn_unused_result));
/**
* Decode the state of a Sodium recovery journal saved in a buffer.
*
* @param journal The recovery journal
* @param buffer The buffer containing the saved state
*
* @return VDO_SUCCESS or an error code; the compiler warns if the result is
* ignored
**/
int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer)
__attribute__((warn_unused_result));
/**
* Add an entry to a recovery journal. This method is asynchronous: it
* returns nothing, and the DataVIO will not be called back until the entry
* is committed to the on-disk journal.
*
* @param journal The journal in which to make an entry
* @param dataVIO The DataVIO for which to add the entry. The entry will be
* taken from the logical and newMapped fields of the
* DataVIO. The DataVIO's recoverySequenceNumber field will
* be set to the sequence number of the journal block in
* which the entry was made.
**/
void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO);
/**
* Acquire a reference to a recovery journal block from somewhere other than
* the journal itself. Each acquisition is expected to be paired with a later
* call to releaseRecoveryJournalBlockReference().
*
* @param journal The recovery journal
* @param sequenceNumber The journal sequence number of the referenced block
* @param zoneType The type of the zone making the adjustment
* @param zoneID The ID of the zone making the adjustment
**/
void acquireRecoveryJournalBlockReference(RecoveryJournal *journal,
SequenceNumber sequenceNumber,
ZoneType zoneType,
ZoneCount zoneID);
/**
* Release a reference to a recovery journal block from somewhere other than
* the journal itself. If this is the last reference for a given zone type,
* an attempt will be made to reap the journal.
*
* @param journal The recovery journal
* @param sequenceNumber The journal sequence number of the referenced block
* @param zoneType The type of the zone making the adjustment
* @param zoneID The ID of the zone making the adjustment
**/
void releaseRecoveryJournalBlockReference(RecoveryJournal *journal,
SequenceNumber sequenceNumber,
ZoneType zoneType,
ZoneCount zoneID);
/**
* Release a single per-entry reference count for a recovery journal block.
* This method may be called from any zone, but should not be called from the
* journal zone as doing so would be inefficient.
*
* @param journal The recovery journal
* @param sequenceNumber The journal sequence number of the referenced block
**/
void releasePerEntryLockFromOtherZone(RecoveryJournal *journal,
SequenceNumber sequenceNumber);
/**
* Drain recovery journal I/O. All uncommitted entries will be written out.
* The function itself returns nothing; the parent completion is finished
* once the journal is drained.
*
* @param journal The journal to drain
* @param operation The drain operation (suspend or save)
* @param parent The completion to finish once the journal is drained
**/
void drainRecoveryJournal(RecoveryJournal *journal,
AdminStateCode operation,
VDOCompletion *parent);
/**
* Resume a recovery journal which has been drained. The function itself
* returns nothing; the parent completion is finished once the journal is
* resumed.
*
* @param journal The journal to resume
* @param parent The completion to finish once the journal is resumed
**/
void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent);
/**
* Get the number of logical blocks in use by the VDO.
*
* @param journal The journal to query; it is taken as a pointer to const
*
* @return the number of logical blocks in use by the VDO; the compiler
* warns if the result is ignored
**/
BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal)
__attribute__((warn_unused_result));
/**
* Get the current statistics from the recovery journal.
*
* @param journal The recovery journal to query; it is taken as a pointer
* to const
*
* @return a copy of the current statistics for the journal, returned by
* value; the compiler warns if the result is ignored
**/
RecoveryJournalStatistics
getRecoveryJournalStatistics(const RecoveryJournal *journal)
__attribute__((warn_unused_result));
/**
* Dump some current statistics and other debug info from the recovery
* journal.
*
* @param journal The recovery journal to dump; it is taken as a pointer
* to const
**/
void dumpRecoveryJournalStatistics(const RecoveryJournal *journal);
#endif // RECOVERY_JOURNAL_H