/* * Copyright (c) 2020 Red Hat, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA. * * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.h#5 $ */ #ifndef RECOVERY_JOURNAL_H #define RECOVERY_JOURNAL_H #include "buffer.h" #include "adminState.h" #include "completion.h" #include "fixedLayout.h" #include "flush.h" #include "readOnlyNotifier.h" #include "statistics.h" #include "trace.h" #include "types.h" /** * The RecoveryJournal provides a log of all block mapping changes * which have not yet been stably written to the block map. It exists * to help provide resiliency guarantees by allowing synchronous * writes to be acknowledged as soon as the corresponding journal * entry is committed instead of having to wait for the block map * update. For asynchronous writes, the journal aids in meeting the * five second data loss window by ensuring that writes will not be * lost as long as they are committed to the journal before the window * expires. This should be less work than committing all of the * required block map pages. * * The journal consists of a set of on-disk blocks arranged as a * circular log with monotonically increasing sequence numbers. Three * sequence numbers serve to define the active extent of the * journal. The 'head' is the oldest active block in the journal. The * 'tail' is the end of the half-open interval containing the active * blocks. 'active' is the number of the block actively receiving * entries. In an empty journal, head == active == tail. Once any * entries are added, tail = active + 1, and head may be any value in * the interval [tail - size, active]. * * The journal also contains a set of in-memory blocks which are used * to buffer up entries until they can be committed. In general the * number of in-memory blocks ('tailBufferCount') will be less than * the on-disk size. Each in-memory block is also a VDOCompletion. * Each in-memory block has a VDOExtent which is used to commit that * block to disk. The extent's data is a PackedJournalBlock (which is a * formatted journal block). In addition each in-memory block has a * buffer which is used to accumulate entries while a partial commit * of the block is in progress. In-memory blocks are kept on two * rings. Free blocks live on the 'freeTailBlocks' ring. When a block * becomes active (see below) it is moved to the 'activeTailBlocks' * ring. When a block is fully committed, it is moved back to the * 'freeTailBlocks' ring. * * When entries are added to the journal, they are added to the active * in-memory block, as indicated by the 'activeBlock' field. If the * caller wishes to wait for the entry to be committed, the requesting * VIO will be attached to the in-memory block to which the caller's * entry was added. If the caller does wish to wait, or if the entry * filled the active block, an attempt will be made to commit that * block to disk. If there is already another commit in progress, the * attempt will be ignored and then automatically retried when the * in-progress commit completes. If there is no commit in progress, * any VIOs waiting on the block are transferred to the extent. The * extent is then written, automatically waking all of the waiters * when it completes. When the extent completes, any entries which * accumulated in the block are copied to the extent's data buffer. * * Finally, the journal maintains a set of counters, one for each on * disk journal block. These counters are used as locks to prevent * premature reaping of journal blocks. Each time a new sequence * number is used, the counter for the corresponding block is * incremented. The counter is subsequently decremented when that * block is filled and then committed for the last time. This prevents * blocks from being reaped while they are still being updated. The * counter is also incremented once for each entry added to a block, * and decremented once each time the block map is updated in memory * for that request. This prevents blocks from being reaped while * their VIOs are still active. Finally, each in-memory block map page * tracks the oldest journal block that contains entries corresponding to * uncommitted updates to that block map page. Each time an in-memory block * map page is updated, it checks if the journal block for the VIO * is earlier than the one it references, in which case it increments * the count on the earlier journal block and decrements the count on the * later journal block, maintaining a lock on the oldest journal block * containing entries for that page. When a block map page has been flushed * from the cache, the counter for the journal block it references is * decremented. Whenever the counter for the head block goes to 0, the * head is advanced until it comes to a block whose counter is not 0 * or until it reaches the active block. This is the mechanism for * reclaiming journal space on disk. * * If there is no in-memory space when a VIO attempts to add an entry, * the VIO will be attached to the 'commitCompletion' and will be * woken the next time a full block has committed. If there is no * on-disk space when a VIO attempts to add an entry, the VIO will be * attached to the 'reapCompletion', and will be woken the next time a * journal block is reaped. **/ /** * Return whether a given JournalOperation is an increment type. * * @param operation The operation in question * * @return true if the type is an increment type **/ static inline bool isIncrementOperation(JournalOperation operation) { return ((operation == DATA_INCREMENT) || (operation == BLOCK_MAP_INCREMENT)); } /** * Get the name of a journal operation. * * @param operation The operation to name * * @return The name of the operation **/ const char *getJournalOperationName(JournalOperation operation) __attribute__((warn_unused_result)); /** * Create a recovery journal. * * @param [in] nonce the nonce of the VDO * @param [in] layer the physical layer for the journal * @param [in] partition the partition for the journal * @param [in] recoveryCount The VDO's number of completed recoveries * @param [in] journalSize the number of blocks in the journal on disk * @param [in] tailBufferSize the number of blocks for tail buffer * @param [in] readOnlyNotifier the read-only mode notifier * @param [in] threadConfig the thread configuration of the VDO * @param [out] journalPtr the pointer to hold the new recovery journal * * @return a success or error code **/ int makeRecoveryJournal(Nonce nonce, PhysicalLayer *layer, Partition *partition, uint64_t recoveryCount, BlockCount journalSize, BlockCount tailBufferSize, ReadOnlyNotifier *readOnlyNotifier, const ThreadConfig *threadConfig, RecoveryJournal **journalPtr) __attribute__((warn_unused_result)); /** * Free a recovery journal and null out the reference to it. * * @param [in,out] journalPtr The reference to the recovery journal to free **/ void freeRecoveryJournal(RecoveryJournal **journalPtr); /** * Move the backing partition pointer of the recovery journal. * Assumes that the data in the old and the new partitions is identical. * * @param journal the journal being moved * @param partition the new journal partition **/ void setRecoveryJournalPartition(RecoveryJournal *journal, Partition *partition); /** * Initialize the journal after a recovery. * * @param journal The journal in question * @param recoveryCount The number of completed recoveries * @param tail The new tail block sequence number **/ void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal, uint64_t recoveryCount, SequenceNumber tail); /** * Initialize the journal after a rebuild. * * @param journal The journal in question * @param recoveryCount The number of completed recoveries * @param tail The new tail block sequence number * @param logicalBlocksUsed The new number of logical blocks used * @param blockMapDataBlocks The new number of block map data blocks **/ void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal, uint64_t recoveryCount, SequenceNumber tail, BlockCount logicalBlocksUsed, BlockCount blockMapDataBlocks); /** * Get the number of block map pages, allocated from data blocks, currently * in use. * * @param journal The journal in question * * @return The number of block map pages allocated from slabs **/ BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal) __attribute__((warn_unused_result)); /** * Set the number of block map pages, allocated from data blocks, currently * in use. * * @param journal The journal in question * @param pages The number of block map pages allocated from slabs **/ void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal, BlockCount pages); /** * Get the ID of a recovery journal's thread. * * @param journal The journal to query * * @return The ID of the journal's thread. **/ ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal) __attribute__((warn_unused_result)); /** * Prepare the journal for new entries. * * @param journal The journal in question * @param depot The slab depot for this VDO * @param blockMap The block map for this VDO **/ void openRecoveryJournal(RecoveryJournal *journal, SlabDepot *depot, BlockMap *blockMap); /** * Obtain the recovery journal's current sequence number. Exposed only so * the block map can be initialized therefrom. * * @param journal The journal in question * * @return the sequence number of the tail block **/ SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal); /** * Get the number of usable recovery journal blocks. * * @param journalSize The size of the recovery journal in blocks * * @return the number of recovery journal blocks usable for entries **/ BlockCount getRecoveryJournalLength(BlockCount journalSize) __attribute__((warn_unused_result)); /** * Get the size of the encoded state of a recovery journal. * * @return the encoded size of the journal's state **/ size_t getRecoveryJournalEncodedSize(void) __attribute__((warn_unused_result)); /** * Encode the state of a recovery journal. * * @param journal the recovery journal * @param buffer the buffer to encode into * * @return VDO_SUCCESS or an error code **/ int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) __attribute__((warn_unused_result)); /** * Decode the state of a recovery journal saved in a buffer. * * @param journal the recovery journal * @param buffer the buffer containing the saved state * * @return VDO_SUCCESS or an error code **/ int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) __attribute__((warn_unused_result)); /** * Decode the state of a Sodium recovery journal saved in a buffer. * * @param journal the recovery journal * @param buffer the buffer containing the saved state * * @return VDO_SUCCESS or an error code **/ int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) __attribute__((warn_unused_result)); /** * Add an entry to a recovery journal. This method is asynchronous. The DataVIO * will not be called back until the entry is committed to the on-disk journal. * * @param journal The journal in which to make an entry * @param dataVIO The DataVIO for which to add the entry. The entry will be * taken from the logical and newMapped fields of the * DataVIO. The DataVIO's recoverySequenceNumber field will * be set to the sequence number of the journal block in * which the entry was made. **/ void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO); /** * Acquire a reference to a recovery journal block from somewhere other than * the journal itself. * * @param journal The recovery journal * @param sequenceNumber The journal sequence number of the referenced block * @param zoneType The type of the zone making the adjustment * @param zoneID The ID of the zone making the adjustment **/ void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, SequenceNumber sequenceNumber, ZoneType zoneType, ZoneCount zoneID); /** * Release a reference to a recovery journal block from somewhere other than * the journal itself. If this is the last reference for a given zone type, * an attempt will be made to reap the journal. * * @param journal The recovery journal * @param sequenceNumber The journal sequence number of the referenced block * @param zoneType The type of the zone making the adjustment * @param zoneID The ID of the zone making the adjustment **/ void releaseRecoveryJournalBlockReference(RecoveryJournal *journal, SequenceNumber sequenceNumber, ZoneType zoneType, ZoneCount zoneID); /** * Release a single per-entry reference count for a recovery journal block. This * method may be called from any zone (but shouldn't be called from the journal * zone as it would be inefficient). * * @param journal The recovery journal * @param sequenceNumber The journal sequence number of the referenced block **/ void releasePerEntryLockFromOtherZone(RecoveryJournal *journal, SequenceNumber sequenceNumber); /** * Drain recovery journal I/O. All uncommitted entries will be written out. * * @param journal The journal to drain * @param operation The drain operation (suspend or save) * @param parent The completion to finish once the journal is drained **/ void drainRecoveryJournal(RecoveryJournal *journal, AdminStateCode operation, VDOCompletion *parent); /** * Resume a recovery journal which has been drained. * * @param journal The journal to resume * @param parent The completion to finish once the journal is resumed * * @return VDO_SUCCESS or an error **/ void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent); /** * Get the number of logical blocks in use by the VDO * * @param journal the journal * * @return the number of logical blocks in use by the VDO **/ BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal) __attribute__((warn_unused_result)); /** * Get the current statistics from the recovery journal. * * @param journal The recovery journal to query * * @return a copy of the current statistics for the journal **/ RecoveryJournalStatistics getRecoveryJournalStatistics(const RecoveryJournal *journal) __attribute__((warn_unused_result)); /** * Dump some current statistics and other debug info from the recovery * journal. * * @param journal The recovery journal to dump **/ void dumpRecoveryJournalStatistics(const RecoveryJournal *journal); #endif // RECOVERY_JOURNAL_H