/*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*
* $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalInternals.h#8 $
*/
#ifndef SLAB_JOURNAL_INTERNALS_H
#define SLAB_JOURNAL_INTERNALS_H
#include "slabJournal.h"
#include "numeric.h"
#include "blockAllocatorInternals.h"
#include "blockMapEntry.h"
#include "journalPoint.h"
#include "slab.h"
#include "slabSummary.h"
#include "statistics.h"
#include "waitQueue.h"
/**
* Slab journal blocks may have one of two formats, depending upon whether or
* not any of the entries in the block are block map increments. Since the
* steady state for a VDO is that all of the necessary block map pages will
* be allocated, most slab journal blocks will have only data entries. Such
* blocks can hold more entries, hence the two formats.
**/
/**
 * A single slab journal entry in its unpacked, in-memory form. The on-disk
 * form is PackedSlabJournalEntry; unpackSlabJournalEntry() converts from it.
 **/
struct slabJournalEntry {
/** The slab block number the entry refers to */
SlabBlockNumber sbn;
/** The type of the entry */
JournalOperation operation;
};
/**
 * A single slab journal entry in its on-disk form: three bytes holding a
 * 23-bit slab block number and a one-bit increment flag.
 **/
typedef union {
struct __attribute__((packed)) {
/** Bits 0-7 of the slab block number */
uint8_t offsetLow8;
/** Bits 8-15 of the slab block number */
uint8_t offsetMid8;
// The declaration order of the two bit-fields below depends on the byte
// order. NOTE(review): presumably this keeps the flag in the same bit of
// the third byte on either kind of machine; it relies on how the compiler
// lays out bit-fields.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/** Bits 16-22 of the slab block number */
unsigned offsetHigh7 : 7;
/** Set for an increment entry, clear for a decrement entry */
unsigned increment : 1;
#else
/** Set for an increment entry, clear for a decrement entry */
unsigned increment : 1;
/** Bits 16-22 of the slab block number */
unsigned offsetHigh7 : 7;
#endif
} fields;
// A raw view of the packed encoding.
uint8_t raw[3];
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
// This view is only valid on little-endian machines and is only present for
// ease of directly examining packed entries in GDB.
struct __attribute__((packed)) {
unsigned offset : 23;
unsigned increment : 1;
} littleEndian;
#endif
} __attribute__((packed)) PackedSlabJournalEntry;
/**
 * The unpacked representation of the header of a slab journal block.
 *
 * Converted to and from PackedSlabJournalBlockHeader by
 * packSlabJournalBlockHeader() and unpackSlabJournalBlockHeader(). Note that
 * the field order here is not the same as the order of the packed layout.
 **/
typedef struct {
/** Sequence number for head of journal */
SequenceNumber head;
/** Sequence number for this block */
SequenceNumber sequenceNumber;
/** The nonce for a given VDO instance */
Nonce nonce;
/** Recovery journal point for last entry */
JournalPoint recoveryPoint;
/** Metadata type */
VDOMetadataType metadataType;
/** Whether this block contains block map increments */
bool hasBlockMapIncrements;
/** The number of entries in the block */
JournalEntryCount entryCount;
} SlabJournalBlockHeader;
/**
 * The packed, on-disk representation of a slab journal block header.
 * All fields are kept in little-endian byte order.
 *
 * The union offers three views of the same bytes: the byte-array fields used
 * by the pack/unpack functions, a raw byte view, and (on little-endian
 * machines only) a natively typed view for debugging.
 **/
typedef union __attribute__((packed)) {
struct __attribute__((packed)) {
/** 64-bit sequence number for head of journal */
byte head[8];
/** 64-bit sequence number for this block */
byte sequenceNumber[8];
/** Recovery journal point for last entry, packed into 64 bits */
PackedJournalPoint recoveryPoint;
/** The 64-bit nonce for a given VDO instance */
byte nonce[8];
/** 8-bit metadata type (should always be two, for the slab journal) */
uint8_t metadataType;
/** Whether this block contains block map increments */
bool hasBlockMapIncrements;
/** 16-bit count of the entries encoded in the block */
byte entryCount[2];
} fields;
// A raw view of the packed encoding. The terms of the size are, in order:
// head, sequenceNumber, recoveryPoint, nonce, metadataType,
// hasBlockMapIncrements, and entryCount.
uint8_t raw[8 + 8 + 8 + 8 + 1 + 1 + 2];
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
// This view is only valid on little-endian machines and is only present for
// ease of directly examining packed entries in GDB.
struct __attribute__((packed)) {
SequenceNumber head;
SequenceNumber sequenceNumber;
PackedJournalPoint recoveryPoint;
Nonce nonce;
VDOMetadataType metadataType;
bool hasBlockMapIncrements;
JournalEntryCount entryCount;
} littleEndian;
#endif
} PackedSlabJournalBlockHeader;
/** Compile-time sizing of the contents of a slab journal block. */
enum {
/** The number of bytes left in a block after the packed header */
SLAB_JOURNAL_PAYLOAD_SIZE
= VDO_BLOCK_SIZE - sizeof(PackedSlabJournalBlockHeader),
/**
 * The number of entries in a block which has block map increments. Each
 * entry is budgeted 25 bits, which matches a three-byte packed entry plus
 * one bit in the entry type bit map.
 **/
SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE * 8) / 25,
/** The bytes needed to hold one bit per full entry, rounded up */
SLAB_JOURNAL_ENTRY_TYPES_SIZE = ((SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK - 1)
/ 8) + 1,
/** The number of entries in a block which needs no entry type bit map */
SLAB_JOURNAL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE
/ sizeof(PackedSlabJournalEntry)),
};
/**
 * The payload of a slab journal block which has block map increments: the
 * packed entries, followed by a bit map sized to hold one bit per entry.
 **/
typedef struct {
/* The entries themselves */
PackedSlabJournalEntry entries[SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK];
/* The bit map indicating which entries are block map increments */
byte entryTypes[SLAB_JOURNAL_ENTRY_TYPES_SIZE];
} __attribute__((packed)) FullSlabJournalEntries;
/**
 * The payload of a slab journal block, viewable in either of the two entry
 * formats. The block header's hasBlockMapIncrements flag records whether the
 * block contains block map increments.
 **/
typedef union {
/* Entries which include block map increments */
FullSlabJournalEntries fullEntries;
/* Entries which are only data updates */
PackedSlabJournalEntry entries[SLAB_JOURNAL_ENTRIES_PER_BLOCK];
/* Ensure the payload fills to the end of the block */
byte space[SLAB_JOURNAL_PAYLOAD_SIZE];
} __attribute__((packed)) SlabJournalPayload;
/**
 * A complete slab journal block in its on-disk form: the packed header
 * followed by the payload, which takes up the remainder of the block.
 **/
typedef struct {
/** The packed block header */
PackedSlabJournalBlockHeader header;
/** The entries, in one of the two payload formats */
SlabJournalPayload payload;
} __attribute__((packed)) PackedSlabJournalBlock;
/**
 * A lock on a slab journal block. A slab journal keeps one of these for each
 * of its on-disk blocks (see the locks array of struct slabJournal).
 **/
typedef struct {
// NOTE(review): presumably the number of outstanding references holding
// this lock -- not shown in this header; confirm against the users.
uint16_t count;
// NOTE(review): presumably a recovery journal sequence number associated
// with the locked block -- confirm against the users.
SequenceNumber recoveryStart;
} JournalLock;
/**
 * The in-memory state of the journal of a single slab.
 *
 * The structure ends with a flexible array of locks, so it must be allocated
 * with room for however many locks the journal needs.
 **/
struct slabJournal {
/** A waiter object for getting a VIO pool entry */
Waiter resourceWaiter;
/** A waiter object for updating the slab summary */
Waiter slabSummaryWaiter;
/** A waiter object for getting an extent with which to flush */
Waiter flushWaiter;
/** The queue of VIOs waiting to make an entry */
WaitQueue entryWaiters;
/** The parent slab reference of this journal */
Slab *slab;
/** Whether a tail block commit is pending */
bool waitingToCommit;
/** Whether the journal is updating the slab summary */
bool updatingSlabSummary;
/** Whether the journal is adding entries from the entryWaiters queue */
bool addingEntries;
/** Whether a partial write is in progress */
bool partialWriteInProgress;
/** The oldest block in the journal on disk */
SequenceNumber head;
/** The oldest block in the journal which may not be reaped */
SequenceNumber unreapable;
/** The end of the half-open interval of the active journal */
SequenceNumber tail;
/** The next journal block to be committed */
SequenceNumber nextCommit;
/** The tail sequence number that is written in the slab summary */
SequenceNumber summarized;
/** The tail sequence number that was last summarized in slab summary */
SequenceNumber lastSummarized;
/** The sequence number of the recovery journal lock */
SequenceNumber recoveryLock;
/**
* The number of entries which fit in a single block. Can't use the constant
* because unit tests change this number.
**/
JournalEntryCount entriesPerBlock;
/**
* The number of full entries which fit in a single block. Can't use the
* constant because unit tests change this number.
**/
JournalEntryCount fullEntriesPerBlock;
/** The recovery journal of the VDO (slab journal holds locks on it) */
RecoveryJournal *recoveryJournal;
/** The slab summary to update tail block location */
SlabSummaryZone *summary;
/** The statistics shared by all slab journals in our physical zone */
AtomicSlabJournalStatistics *events;
/** A ring of the VIO pool entries for outstanding journal block writes */
RingNode uncommittedBlocks;
/**
* The current tail block header state. This will be packed into
* the block just before it is written.
**/
SlabJournalBlockHeader tailHeader;
/** A pointer to a block-sized buffer holding the packed block data */
PackedSlabJournalBlock *block;
/** The number of blocks in the on-disk journal */
BlockCount size;
/** The number of blocks at which to start pushing reference blocks */
BlockCount flushingThreshold;
/** The number of blocks at which all reference blocks should be writing */
BlockCount flushingDeadline;
/** The number of blocks at which to wait for reference blocks to write */
BlockCount blockingThreshold;
/** The number of blocks at which to scrub the slab before coming online */
BlockCount scrubbingThreshold;
/** This node is for BlockAllocator to keep a queue of dirty journals */
RingNode dirtyNode;
/** The lock for the oldest unreaped block of the journal */
JournalLock *reapLock;
/** The locks for each on disk block (flexible array member; must be last) */
JournalLock locks[];
};
/**
 * Get the slab journal block offset of the given sequence number.
 *
 * The journal is only read, so it is taken by pointer-to-const; callers
 * holding a non-const pointer are unaffected.
 *
 * @param journal   The slab journal; its size (the number of blocks in the
 *                  on-disk journal) must be non-zero
 * @param sequence  The sequence number
 *
 * @return the offset corresponding to the sequence number, which is the
 *         sequence number modulo the size of the journal
 **/
__attribute__((warn_unused_result))
static inline TailBlockOffset
getSlabJournalBlockOffset(const SlabJournal *journal, SequenceNumber sequence)
{
  return (sequence % journal->size);
}
/**
 * Encode a slab journal entry (exposed for unit tests).
 *
 * NOTE(review): the definition is not in this header. Both the header and
 * the payload are passed by non-const pointer, so presumably both are
 * updated -- confirm against the implementation.
 *
 * @param tailHeader  The unpacked header for the block
 * @param payload     The journal block payload to hold the entry
 * @param sbn         The slab block number of the entry to encode
 * @param operation   The type of the entry
 **/
void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader,
SlabJournalPayload *payload,
SlabBlockNumber sbn,
JournalOperation operation);
/**
 * Decode a slab journal entry.
 *
 * @param block       The journal block holding the entry
 * @param entryCount  The number of the entry within the block
 *                    (NOTE(review): presumably zero-based -- confirm
 *                    against the implementation)
 *
 * @return The decoded entry; the result must not be ignored
 **/
SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block,
JournalEntryCount entryCount)
__attribute__((warn_unused_result));
/**
 * Fill in the three-byte packed form of a slab journal entry.
 *
 * Only the low 23 bits of the slab block number are stored; any higher bits
 * are masked off.
 *
 * @param packed       The packed entry to fill in
 * @param sbn          The slab block number of the entry to encode
 * @param isIncrement  true if the entry is an increment, false otherwise
 **/
static inline void packSlabJournalEntry(PackedSlabJournalEntry *packed,
                                        SlabBlockNumber         sbn,
                                        bool                    isIncrement)
{
  // The flag and the top seven offset bits are the two bit-fields of the
  // packed entry; the two lower offset bytes are stored whole.
  packed->fields.increment   = (isIncrement ? 1 : 0);
  packed->fields.offsetHigh7 = ((sbn & 0x7F0000) >> 16);
  packed->fields.offsetMid8  = ((sbn & 0x00FF00) >> 8);
  packed->fields.offsetLow8  = (sbn & 0x0000FF);
}
/**
 * Convert a packed slab journal entry back to its in-memory form.
 *
 * A packed entry only carries a single increment bit, so the decoded
 * operation is always either DATA_INCREMENT or DATA_DECREMENT.
 *
 * @param packed  The packed entry to decode
 *
 * @return The decoded slab journal entry
 **/
__attribute__((warn_unused_result))
static inline
SlabJournalEntry unpackSlabJournalEntry(const PackedSlabJournalEntry *packed)
{
  SlabJournalEntry unpacked;

  // Reassemble the slab block number from its most significant piece down.
  unpacked.sbn = packed->fields.offsetHigh7;
  unpacked.sbn = ((unpacked.sbn << 8) | packed->fields.offsetMid8);
  unpacked.sbn = ((unpacked.sbn << 8) | packed->fields.offsetLow8);

  if (packed->fields.increment) {
    unpacked.operation = DATA_INCREMENT;
  } else {
    unpacked.operation = DATA_DECREMENT;
  }

  return unpacked;
}
/**
 * Write the on-disk encoding of a slab journal block header.
 *
 * @param header  The unpacked header supplying the values to encode
 * @param packed  The packed header to fill in
 **/
static inline
void packSlabJournalBlockHeader(const SlabJournalBlockHeader *header,
                                PackedSlabJournalBlockHeader *packed)
{
  // Each field is stored independently; they are listed here in the order
  // in which they appear in the packed layout.
  storeUInt64LE(packed->fields.head, header->head);
  storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber);
  packJournalPoint(&header->recoveryPoint, &packed->fields.recoveryPoint);
  storeUInt64LE(packed->fields.nonce, header->nonce);

  // The single-byte fields need no byte-order conversion.
  packed->fields.metadataType          = header->metadataType;
  packed->fields.hasBlockMapIncrements = header->hasBlockMapIncrements;

  storeUInt16LE(packed->fields.entryCount, header->entryCount);
}
/**
 * Read a slab journal block header out of its on-disk encoding.
 *
 * Every field of the destination is overwritten.
 *
 * @param packed  The packed header to decode
 * @param header  The header which receives the unpacked values
 **/
static inline
void unpackSlabJournalBlockHeader(const PackedSlabJournalBlockHeader *packed,
                                  SlabJournalBlockHeader             *header)
{
  // Build the result locally. The recovery point is not named in the
  // initializer, so it starts out zeroed and is decoded separately below.
  SlabJournalBlockHeader unpacked = {
    .head                  = getUInt64LE(packed->fields.head),
    .sequenceNumber        = getUInt64LE(packed->fields.sequenceNumber),
    .nonce                 = getUInt64LE(packed->fields.nonce),
    .metadataType          = packed->fields.metadataType,
    .hasBlockMapIncrements = packed->fields.hasBlockMapIncrements,
    .entryCount            = getUInt16LE(packed->fields.entryCount),
  };

  unpackJournalPoint(&packed->fields.recoveryPoint, &unpacked.recoveryPoint);
  *header = unpacked;
}
#endif // SLAB_JOURNAL_INTERNALS_H