/*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*
* $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.c#9 $
*/
/*
* This file contains almost all of the VDO write path, which begins with
* writeExtent(). The progression through the callbacks which make up the
* write path depends upon whether or not the write policy is synchronous or
* asynchronous. The paths would proceed as outlined in the pseudo-code here
* if this were normal, synchronous code without callbacks. Complications
* involved in waiting on locks are not included.
*
* ######################################################################
* writeExtentSynchronous(extent)
* {
* foreach (vio in extent) {
* launchWriteVIO()
* # allocateBlockForWrite()
* if (!trim and !zero-block) {
* allocate block
* if (vio is compressed) {
* completeCompressedBlockWrite()
* finishVIO()
* return
* }
* writeBlock()
* }
* finishBlockWrite()
* addJournalEntry() # Increment
* if (vio->newMapped is not ZERO_BLOCK) {
* journalIncrementForWrite()
* }
* acknowledgeWriteCallback()
* readOldBlockMapping()
* journalUnmappingForWrite()
* if (vio->mapped is not ZERO_BLOCK) {
* journalDecrementForWrite()
* }
* updateBlockMapForWrite()
* if (trim || zero-block) {
* finishVIO()
* return
* }
*
* prepareForDedupe()
* hashData()
* resolveHashZone()
* acquireHashLock()
* attemptDedupe() (query albireo)
* if (isDuplicate) {
* verifyAdvice() (read verify)
* if (isDuplicate and canAddReference) {
* shareBlock()
* addJournalEntryForDedupe()
* incrementForDedupe()
* journalUnmappingForDedupe()
* if (vio->mapped is not ZERO_BLOCK) {
* decrementForDedupe()
* }
* updateBlockMapForDedupe()
* finishVIO()
* return
* }
* }
*
* if (not canAddReference) {
* layer->updateAlbireo()
* }
* # compressData()
* if (compressing and not mooted and has no waiters) {
* layer->compressVIO()
* packCompressedData()
* if (compressed) {
* journalCompressedBlocks()
* incrementForDedupe()
* readOldBlockMappingForDedupe()
* journalUnmappingForDedupe()
* if (vio->mapped is not ZERO_BLOCK) {
* decrementForDedupe()
* }
* updateBlockMapForDedupe()
* }
* }
*
* finishVIO()
* }
* }
*
* ######################################################################
* writeExtentAsynchronous(extent)
* {
* foreach (vio in extent) {
* launchWriteVIO()
* # allocateBlockForWrite()
* if (trim || zero-block) {
* acknowledgeWrite()
* } else {
* allocateAndLockBlock()
* if (vio is compressed) {
* writeBlock()
* completeCompressedBlockWrite()
* finishVIO()
* return
* }
*
* acknowledgeWrite()
* prepareForDedupe()
* hashData()
* resolveHashZone()
* acquireHashLock()
* attemptDedupe() (query albireo)
* if (isDuplicate) {
* verifyAdvice() (read verify)
* if (isDuplicate and canAddReference) {
* shareBlock()
* addJournalEntryForDedupe()
* incrementForDedupe()
* readOldBlockMappingForDedupe()
* journalUnmappingForDedupe()
* if (vio->mapped is not ZERO_BLOCK) {
* decrementForDedupe()
* }
* updateBlockMapForDedupe()
* finishVIO()
* return
* }
* }
*
* if (not canAddReference) {
* layer->updateAlbireo()
* }
* # compressData()
* if (compressing and not mooted and has no waiters) {
* layer->compressVIO()
* packCompressedData()
* if (compressed) {
* journalCompressedBlocks()
* journalIncrementForDedupe()
* readOldBlockMappingForDedupe()
* journalUnmappingForDedupe()
* if (vio->mapped is not ZERO_BLOCK) {
* decrementForDedupe()
* }
* updateBlockMapForDedupe()
* finishVIO()
* return
* }
* }
*
* writeBlock()
* }
*
* finishBlockWrite()
* addJournalEntry() # Increment
* if (vio->newMapped is not ZERO_BLOCK) {
* journalIncrementForWrite()
* }
* readOldBlockMappingForWrite()
* journalUnmappingForWrite()
* if (vio->mapped is not ZERO_BLOCK) {
* journalDecrementForWrite()
* }
* updateBlockMapForWrite()
* finishVIO()
* }
* }
*/
#include "vioWrite.h"
#include "logger.h"
#include "allocatingVIO.h"
#include "atomic.h"
#include "blockMap.h"
#include "compressionState.h"
#include "dataVIO.h"
#include "hashLock.h"
#include "recoveryJournal.h"
#include "referenceOperation.h"
#include "slab.h"
#include "slabDepot.h"
#include "slabJournal.h"
#include "vdoInternal.h"
#include "vioRead.h"
/**
* The steps taken cleaning up a VIO, in the order they are performed.
**/
typedef enum dataVIOCleanupStage {
VIO_CLEANUP_START = 0,
VIO_RELEASE_ALLOCATED = VIO_CLEANUP_START,
VIO_RELEASE_RECOVERY_LOCKS,
VIO_RELEASE_HASH_LOCK,
VIO_RELEASE_LOGICAL,
VIO_CLEANUP_DONE
} DataVIOCleanupStage;
/**
* Actions to take on error used by abortOnError().
**/
typedef enum {
NOT_READ_ONLY,
READ_ONLY_IF_ASYNC,
READ_ONLY,
} ReadOnlyAction;
// Forward declarations required because of circular function references.
static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage);
static void writeBlock(DataVIO *dataVIO);
/**
* Check whether we are in async mode.
*
* @param dataVIO A DataVIO containing a pointer to the VDO whose write
* policy we want to check
*
* @return <code>true</code> if we are in async mode
**/
static inline bool isAsync(DataVIO *dataVIO)
{
return (getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC);
}
/**
* Release the PBN lock and/or the reference on the allocated block at the
* end of processing a DataVIO.
*
* @param completion The DataVIO
**/
static void releaseAllocatedLock(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInAllocatedZone(dataVIO);
releaseAllocationLock(dataVIOAsAllocatingVIO(dataVIO));
performCleanupStage(dataVIO, VIO_RELEASE_RECOVERY_LOCKS);
}
/**
* Release the logical block lock and flush generation lock at the end of
* processing a DataVIO.
*
* @param completion The DataVIO
**/
static void releaseLogicalLock(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInLogicalZone(dataVIO);
releaseLogicalBlockLock(dataVIO);
releaseFlushGenerationLock(dataVIO);
performCleanupStage(dataVIO, VIO_CLEANUP_DONE);
}
/**
* Release the hash lock at the end of processing a DataVIO.
*
* @param completion The DataVIO
**/
static void cleanHashLock(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInHashZone(dataVIO);
releaseHashLock(dataVIO);
performCleanupStage(dataVIO, VIO_RELEASE_LOGICAL);
}
/**
* Make some assertions about a DataVIO which has finished cleaning up
* and do its final callback.
*
* @param dataVIO The DataVIO which has finished cleaning up
**/
static void finishCleanup(DataVIO *dataVIO)
{
ASSERT_LOG_ONLY(dataVIOAsAllocatingVIO(dataVIO)->allocationLock == NULL,
"complete DataVIO has no allocation lock");
ASSERT_LOG_ONLY(dataVIO->hashLock == NULL,
"complete DataVIO has no hash lock");
vioDoneCallback(dataVIOAsCompletion(dataVIO));
}
/**
* Perform the next step in the process of cleaning up a DataVIO.
*
* @param dataVIO The DataVIO to clean up
* @param stage The cleanup stage to perform
**/
static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage)
{
switch (stage) {
case VIO_RELEASE_ALLOCATED:
if (hasAllocation(dataVIO)) {
launchAllocatedZoneCallback(dataVIO, releaseAllocatedLock,
THIS_LOCATION("$F;cb=releaseAllocLock"));
return;
}
// fall through
case VIO_RELEASE_RECOVERY_LOCKS:
if ((dataVIO->recoverySequenceNumber > 0)
&& !isOrWillBeReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier)
&& (dataVIOAsCompletion(dataVIO)->result != VDO_READ_ONLY)) {
logWarning("VDO not read-only when cleaning DataVIO with RJ lock");
}
// fall through
case VIO_RELEASE_HASH_LOCK:
if (dataVIO->hashLock != NULL) {
launchHashZoneCallback(dataVIO, cleanHashLock,
THIS_LOCATION("$F;cb=cleanHashLock"));
return;
}
// fall through
case VIO_RELEASE_LOGICAL:
if (!isCompressedWriteDataVIO(dataVIO)) {
launchLogicalCallback(dataVIO, releaseLogicalLock,
THIS_LOCATION("$F;cb=releaseLL"));
return;
}
// fall through
default:
finishCleanup(dataVIO);
}
}
/**
* Return a DataVIO that encountered an error to its hash lock so it can
* update the hash lock state accordingly. This continuation is registered in
* abortOnError(), and must be called in the hash zone of the DataVIO.
*
* @param completion The completion of the DataVIO to return to its hash lock
**/
static void finishWriteDataVIOWithError(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInHashZone(dataVIO);
continueHashLockOnError(dataVIO);
}
/**
* Check whether a result is an error, and if so abort the DataVIO associated
* with the error.
*
* @param result The result to check
* @param dataVIO The DataVIO
* @param readOnlyAction The conditions under which the VDO should be put
* into read-only mode if the result is an error
*
* @return <code>true</code> if the result is an error
**/
static bool abortOnError(int result,
DataVIO *dataVIO,
ReadOnlyAction readOnlyAction)
{
if (result == VDO_SUCCESS) {
return false;
}
if ((result == VDO_READ_ONLY)
|| (readOnlyAction == READ_ONLY)
|| ((readOnlyAction == READ_ONLY_IF_ASYNC) && isAsync(dataVIO))) {
ReadOnlyNotifier *notifier = dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier;
if (!isReadOnly(notifier)) {
if (result != VDO_READ_ONLY) {
logErrorWithStringError(result, "Preparing to enter read-only mode:"
" DataVIO for LBN %llu (becoming mapped"
" to %llu, previously mapped"
" to %llu, allocated %llu) is"
" completing with a fatal error after"
" operation %s", dataVIO->logical.lbn,
dataVIO->newMapped.pbn, dataVIO->mapped.pbn,
getDataVIOAllocation(dataVIO),
getOperationName(dataVIO));
}
enterReadOnlyMode(notifier, result);
}
}
if (dataVIO->hashLock != NULL) {
launchHashZoneCallback(dataVIO, finishWriteDataVIOWithError,
THIS_LOCATION(NULL));
} else {
finishDataVIO(dataVIO, result);
}
return true;
}
/**
* Return a DataVIO that finished writing, compressing, or deduplicating to
* its hash lock so it can share the result with any DataVIOs waiting in the
* hash lock, or update albireo, or simply release its share of the lock. This
* continuation is registered in updateBlockMapForWrite(),
* updateBlockMapForDedupe(), and abortDeduplication(), and must be called in
* the hash zone of the DataVIO.
*
* @param completion The completion of the DataVIO to return to its hash lock
**/
static void finishWriteDataVIO(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInHashZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
return;
}
continueHashLock(dataVIO);
}
/**
* Abort the data optimization process.
*
* @param dataVIO The DataVIO which does not deduplicate or compress
**/
static void abortDeduplication(DataVIO *dataVIO)
{
if (!hasAllocation(dataVIO)) {
// There was no space to write this block and we failed to deduplicate
// or compress it.
finishDataVIO(dataVIO, VDO_NO_SPACE);
return;
}
if (isAsync(dataVIO)) {
// We failed to deduplicate or compress an async DataVIO, so now we need
// to actually write the data.
writeBlock(dataVIO);
return;
}
if (dataVIO->hashLock == NULL) {
// We failed to compress a synchronous DataVIO that is a hash collision,
// which means it can't dedpe or be used for dedupe, so it's done now.
finishDataVIO(dataVIO, VDO_SUCCESS);
return;
}
/*
* This synchronous DataVIO failed to compress and so is finished, but must
* now return to its hash lock so other DataVIOs with the same data can
* deduplicate against the uncompressed block it wrote.
*/
launchHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL));
}
/**
* Update the block map now that we've added an entry in the recovery journal
* for a block we have just shared. This is the callback registered in
* decrementForDedupe().
*
* @param completion The completion of the write in progress
**/
static void updateBlockMapForDedupe(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInLogicalZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
if (dataVIO->hashLock != NULL) {
setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL));
} else {
completion->callback = completeDataVIO;
}
dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK_FOR_DEDUPE;
putMappedBlockAsync(dataVIO);
}
/**
* Make a recovery journal increment.
*
* @param dataVIO The DataVIO
* @param lock The PBNLock on the block being incremented
**/
static void journalIncrement(DataVIO *dataVIO, PBNLock *lock)
{
setUpReferenceOperationWithLock(DATA_INCREMENT, dataVIO->newMapped.pbn,
dataVIO->newMapped.state, lock,
&dataVIO->operation);
addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal,
dataVIO);
}
/**
* Make a recovery journal decrement entry.
*
* @param dataVIO The DataVIO
**/
static void journalDecrement(DataVIO *dataVIO)
{
setUpReferenceOperationWithZone(DATA_DECREMENT, dataVIO->mapped.pbn,
dataVIO->mapped.state, dataVIO->mapped.zone,
&dataVIO->operation);
addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal,
dataVIO);
}
/**
* Make a reference count change.
*
* @param dataVIO The DataVIO
**/
static void updateReferenceCount(DataVIO *dataVIO)
{
SlabDepot *depot = getVDOFromDataVIO(dataVIO)->depot;
PhysicalBlockNumber pbn = dataVIO->operation.pbn;
int result = ASSERT(isPhysicalDataBlock(depot, pbn),
"Adding slab journal entry for impossible PBN %" PRIu64
"for LBN %llu", pbn, dataVIO->logical.lbn);
if (abortOnError(result, dataVIO, READ_ONLY)) {
return;
}
addSlabJournalEntry(getSlabJournal(depot, pbn), dataVIO);
}
/**
* Do the decref after a successful dedupe or compression. This is the callback
* registered by journalUnmappingForDedupe().
*
* @param completion The completion of the write in progress
**/
static void decrementForDedupe(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInMappedZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO);
if (allocatingVIO->allocation == dataVIO->mapped.pbn) {
/*
* If we are about to release the reference on the allocated block,
* we must release the PBN lock on it first so that the allocator will
* not allocate a write-locked block.
*/
releaseAllocationLock(allocatingVIO);
}
setLogicalCallback(dataVIO, updateBlockMapForDedupe,
THIS_LOCATION("$F;js=dec"));
dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_DEDUPE;
updateReferenceCount(dataVIO);
}
/**
* Write the appropriate journal entry for removing the mapping of logical to
* mapped, for dedupe or compression. This is the callback registered in
* readOldBlockMappingForDedupe().
*
* @param completion The completion of the write in progress
**/
static void journalUnmappingForDedupe(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInJournalZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
if (dataVIO->mapped.pbn == ZERO_BLOCK) {
setLogicalCallback(dataVIO, updateBlockMapForDedupe,
THIS_LOCATION("$F;j=dedupe;js=unmap;cb=updateBM"));
} else {
setMappedZoneCallback(dataVIO, decrementForDedupe,
THIS_LOCATION("$F;j=dedupe;js=unmap;cb=decDedupe"));
}
dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_DEDUPE;
journalDecrement(dataVIO);
}
/**
* Get the previous PBN mapped to this LBN from the block map, so as to make
* an appropriate journal entry referencing the removal of this LBN->PBN
* mapping, for dedupe or compression. This callback is registered in
* incrementForDedupe().
*
* @param completion The completion of the write in progress
**/
static void readOldBlockMappingForDedupe(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInLogicalZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_DEDUPE;
setJournalCallback(dataVIO, journalUnmappingForDedupe,
THIS_LOCATION("$F;cb=journalUnmapDedupe"));
getMappedBlockAsync(dataVIO);
}
/**
* Do the incref after compression. This is the callback registered by
* addRecoveryJournalEntryForCompression().
*
* @param completion The completion of the write in progress
**/
static void incrementForCompression(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInNewMappedZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state),
"Impossible attempt to update reference counts for a block "
"which was not compressed (logical block %llu)",
dataVIO->logical.lbn);
/*
* If we are synchronous and allocated a block, we know the one we
* allocated is the block we need to decrement, so there is no need
* to look in the block map.
*/
if (isAsync(dataVIO) || !hasAllocation(dataVIO)) {
setLogicalCallback(dataVIO, readOldBlockMappingForDedupe,
THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe"));
} else {
setJournalCallback(dataVIO, journalUnmappingForDedupe,
THIS_LOCATION("$F;cb=journalUnmappingForDedupe"));
}
dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_COMPRESSION;
updateReferenceCount(dataVIO);
}
/**
* Add a recovery journal entry for the increment resulting from compression.
*
* @param completion The DataVIO which has been compressed
**/
static void addRecoveryJournalEntryForCompression(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInJournalZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
return;
}
if (!isCompressed(dataVIO->newMapped.state)) {
abortDeduplication(dataVIO);
return;
}
setNewMappedZoneCallback(dataVIO, incrementForCompression,
THIS_LOCATION("$F($dup);js=map/$dup;"
"cb=incCompress($dup)"));
dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_COMPRESSION;
journalIncrement(dataVIO, getDuplicateLock(dataVIO));
}
/**
* Attempt to pack the compressed DataVIO into a block. This is the callback
* registered in compressData().
*
* @param completion The completion of a compressed DataVIO
**/
static void packCompressedData(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInPackerZone(dataVIO);
// XXX this is a callback, so there should probably be an error check here
// even if we think compression can't currently return one.
if (!mayPackDataVIO(dataVIO)) {
abortDeduplication(dataVIO);
return;
}
setJournalCallback(dataVIO, addRecoveryJournalEntryForCompression,
THIS_LOCATION("$F;cb=update(compress)"));
dataVIO->lastAsyncOperation = PACK_COMPRESSED_BLOCK;
attemptPacking(dataVIO);
}
/**********************************************************************/
void compressData(DataVIO *dataVIO)
{
ASSERT_LOG_ONLY(!dataVIO->isDuplicate,
"compressing a non-duplicate block");
if (!mayCompressDataVIO(dataVIO)) {
abortDeduplication(dataVIO);
return;
}
dataVIO->lastAsyncOperation = COMPRESS_DATA;
setPackerCallback(dataVIO, packCompressedData, THIS_LOCATION("$F;cb=pack"));
dataVIOAsCompletion(dataVIO)->layer->compressDataVIO(dataVIO);
}
/**
* Do the incref after deduplication. This is the callback registered by
* addRecoveryJournalEntryForDedupe().
*
* @param completion The completion of the write in progress
**/
static void incrementForDedupe(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInNewMappedZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
ASSERT_LOG_ONLY(dataVIO->isDuplicate,
"Impossible attempt to update reference counts for a block "
"which was not a duplicate (logical block %llu)",
dataVIO->logical.lbn);
/*
* If we are synchronous and allocated a block, we know the one we
* allocated is the block we need to decrement, so there is no need
* to look in the block map.
*/
if (isAsync(dataVIO) || !hasAllocation(dataVIO)) {
setLogicalCallback(dataVIO, readOldBlockMappingForDedupe,
THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe"));
} else {
setJournalCallback(dataVIO, journalUnmappingForDedupe,
THIS_LOCATION("$F;cb=journalUnmappingForDedupe"));
}
dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_DEDUPE;
updateReferenceCount(dataVIO);
}
/**
* Add a recovery journal entry for the increment resulting from deduplication.
* This callback is registered in shareBlock().
*
* @param completion The DataVIO which has been deduplicated
**/
static void addRecoveryJournalEntryForDedupe(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInJournalZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
return;
}
setNewMappedZoneCallback(dataVIO, incrementForDedupe,
THIS_LOCATION("$F($dup);js=map/$dup;"
"cb=incDedupe($dup)"));
dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_DEDUPE;
journalIncrement(dataVIO, getDuplicateLock(dataVIO));
}
/**
* Share a block in the block map if it is a duplicate. This is the lock
* callback registered in acquirePBNReadLock(). This is only public so
* test code can compare the function to the current callback in a completion.
*
* @param completion The completion of the write in progress
**/
void shareBlock(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInDuplicateZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
return;
}
if (!dataVIO->isDuplicate) {
compressData(dataVIO);
return;
}
dataVIO->newMapped = dataVIO->duplicate;
launchJournalCallback(dataVIO, addRecoveryJournalEntryForDedupe,
THIS_LOCATION("$F;cb=addJournalEntryDup"));
}
/**
* Route the DataVIO to the HashZone responsible for the chunk name to acquire
* a hash lock on that name, or join with a existing hash lock managing
* concurrent dedupe for that name. This is the callback registered in
* resolveHashZone().
*
* @param completion The DataVIO to lock
**/
static void lockHashInZone(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInHashZone(dataVIO);
// Shouldn't have had any errors since all we did was switch threads.
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
int result = acquireHashLock(dataVIO);
if (abortOnError(result, dataVIO, READ_ONLY)) {
return;
}
if (dataVIO->hashLock == NULL) {
// It's extremely unlikely, but in the case of a hash collision, the
// DataVIO will not obtain a reference to the lock and cannot deduplicate.
compressData(dataVIO);
return;
}
enterHashLock(dataVIO);
}
/**
* Set the hash zone (and flag the chunk name as set) while still on the
* thread that just hashed the data to set the chunk name. This is the
* callback registered by prepareForDedupe().
*
* @param completion The DataVIO whose chunk name was just generated, as a
* completion
**/
static void resolveHashZone(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
// We don't care what thread we are on.
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, "zero blocks should not be hashed");
dataVIO->hashZone
= selectHashZone(getVDOFromDataVIO(dataVIO), &dataVIO->chunkName);
dataVIO->lastAsyncOperation = ACQUIRE_HASH_LOCK;
launchHashZoneCallback(dataVIO, lockHashInZone, THIS_LOCATION(NULL));
}
/**
* Prepare for the dedupe path after a synchronous write or an asynchronous
* allocation. This callback is registered in updateBlockMapForWrite() for
* sync, and continueWriteAfterAllocation() (via acknowledgeWrite()) for
* async. It is also called directly from the latter when allocation fails.
*
* @param completion The completion of the write in progress
**/
static void prepareForDedupe(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
// We don't care what thread we are on
dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL));
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
if (!isAsync(dataVIO)) {
// Remember which block we wrote so we will decrement the reference to it
// if we deduplicate. This avoids having to look it up in the block map.
dataVIO->mapped = dataVIO->newMapped;
}
ASSERT_LOG_ONLY(!dataVIO->isZeroBlock,
"must not prepare to dedupe zero blocks");
// Before we can dedupe, we need to know the chunk name, so the first step
// is to hash the block data.
dataVIO->lastAsyncOperation = HASH_DATA;
// XXX this is the wrong thread to run this callback, but we don't yet have
// a mechanism for running it on the CPU thread immediately after hashing.
setAllocatedZoneCallback(dataVIO, resolveHashZone, THIS_LOCATION(NULL));
completion->layer->hashData(dataVIO);
}
/**
* Update the block map after a data write (or directly for a ZERO_BLOCK write
* or trim). This callback is registered in decrementForWrite() and
* journalUnmappingForWrite().
*
* @param completion The completion of the write in progress
**/
static void updateBlockMapForWrite(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInLogicalZone(dataVIO);
dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL));
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) {
completion->callback = completeDataVIO;
} else if (!isAsync(dataVIO)) {
// Synchronous DataVIOs branch off to the hash/dedupe path after finishing
// the uncompressed write of their data.
completion->callback = prepareForDedupe;
} else if (dataVIO->hashLock != NULL) {
// Async writes will be finished, but must return to the hash lock to
// allow other DataVIOs with the same data to dedupe against the write.
setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL));
} else {
// Async writes without a hash lock (hash collisions) will be finished.
completion->callback = completeDataVIO;
}
dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK;
putMappedBlockAsync(dataVIO);
}
/**
* Do the decref after a successful block write. This is the callback
* by journalUnmappingForWrite() if the old mapping was not the zero block.
*
* @param completion The completion of the write in progress
**/
static void decrementForWrite(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInMappedZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_WRITE;
setLogicalCallback(dataVIO, updateBlockMapForWrite, THIS_LOCATION(NULL));
updateReferenceCount(dataVIO);
}
/**
* Write the appropriate journal entry for unmapping logical to mapped for a
* write. This is the callback registered in readOldBlockMappingForWrite().
*
* @param completion The completion of the write in progress
**/
static void journalUnmappingForWrite(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInJournalZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
if (dataVIO->mapped.pbn == ZERO_BLOCK) {
setLogicalCallback(dataVIO, updateBlockMapForWrite,
THIS_LOCATION("$F;js=unmap;cb=updateBMwrite"));
} else {
setMappedZoneCallback(dataVIO, decrementForWrite,
THIS_LOCATION("$F;js=unmap;cb=decWrite"));
}
dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_WRITE;
journalDecrement(dataVIO);
}
/**
* Get the previous PBN mapped to this LBN from the block map for a write, so
* as to make an appropriate journal entry referencing the removal of this
* LBN->PBN mapping. This callback is registered in finishBlockWrite() in the
* async path, and is registered in acknowledgeWrite() in the sync path.
*
* @param completion The completion of the write in progress
**/
static void readOldBlockMappingForWrite(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInLogicalZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
setJournalCallback(dataVIO, journalUnmappingForWrite,
THIS_LOCATION("$F;cb=journalUnmapWrite"));
dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_WRITE;
getMappedBlockAsync(dataVIO);
}
/**
* Acknowledge a write to the requestor.
*
* @param dataVIO The DataVIO being acknowledged
**/
static void acknowledgeWrite(DataVIO *dataVIO)
{
ASSERT_LOG_ONLY(dataVIO->hasFlushGenerationLock,
"write VIO to be acknowledged has a flush generation lock");
dataVIO->lastAsyncOperation = ACKNOWLEDGE_WRITE;
dataVIOAsCompletion(dataVIO)->layer->acknowledgeDataVIO(dataVIO);
}
/**
* Acknowledge a write now that we have made an entry in the recovery
* journal. This is the callback registered in finishBlockWrite() in
* synchronous mode.
*
* @param completion The completion of the write in progress
**/
static void acknowledgeWriteCallback(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
return;
}
setLogicalCallback(dataVIO, readOldBlockMappingForWrite,
THIS_LOCATION(NULL));
acknowledgeWrite(dataVIO);
}
/**********************************************************************/
static VDOAction *getWriteIncrementCallback(DataVIO *dataVIO)
{
return (isAsync(dataVIO)
? readOldBlockMappingForWrite : acknowledgeWriteCallback);
}
/**
* Do the incref after a successful block write. This is the callback
* registered by finishBlockWrite().
*
* @param completion The completion of the write in progress
**/
static void incrementForWrite(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInAllocatedZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
return;
}
/*
* Now that the data has been written, it's safe to deduplicate against the
* block. Downgrade the allocation lock to a read lock so it can be used
* later by the hash lock (which we don't have yet in sync mode).
*/
downgradePBNWriteLock(dataVIOAsAllocatingVIO(dataVIO)->allocationLock);
dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_WRITE;
setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO),
THIS_LOCATION(NULL));
updateReferenceCount(dataVIO);
}
/**
* Add an entry in the recovery journal after a successful block write. This is
* the callback registered by writeBlock(). It is also registered in
* allocateBlockForWrite().
*
* @param completion The completion of the write in progress
**/
static void finishBlockWrite(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
assertInJournalZone(dataVIO);
if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
return;
}
if (dataVIO->newMapped.pbn == ZERO_BLOCK) {
setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO),
THIS_LOCATION("$F;js=writeZero"));
} else {
setAllocatedZoneCallback(dataVIO, incrementForWrite,
THIS_LOCATION("$F;js=mapWrite"));
}
dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_WRITE;
journalIncrement(dataVIO, dataVIOAsAllocatingVIO(dataVIO)->allocationLock);
}
/**
* Write data to the underlying storage.
*
* @param dataVIO The DataVIO to write
**/
static void writeBlock(DataVIO *dataVIO)
{
dataVIO->lastAsyncOperation = WRITE_DATA;
setJournalCallback(dataVIO, finishBlockWrite,
THIS_LOCATION("$F(data);cb=finishWrite"));
dataVIOAsCompletion(dataVIO)->layer->writeData(dataVIO);
}
/**
* Continue the write path for a DataVIO now that block allocation is complete
* (the DataVIO may or may not have actually received an allocation). This
* callback is registered in continueWriteWithBlockMapSlot().
*
* @param allocatingVIO The DataVIO which has finished the allocation process
* (as an AllocatingVIO)
**/
static void continueWriteAfterAllocation(AllocatingVIO *allocatingVIO)
{
DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO);
if (abortOnError(dataVIOAsCompletion(dataVIO)->result, dataVIO,
NOT_READ_ONLY)) {
return;
}
if (!hasAllocation(dataVIO)) {
prepareForDedupe(dataVIOAsCompletion(dataVIO));
return;
}
atomicStoreBool(&dataVIO->hasAllocation, true);
dataVIO->newMapped = (ZonedPBN) {
.zone = allocatingVIO->zone,
.pbn = allocatingVIO->allocation,
.state = MAPPING_STATE_UNCOMPRESSED,
};
if (!isAsync(dataVIO)) {
writeBlock(dataVIO);
return;
}
// XXX prepareForDedupe can run from any thread, so this is a place where
// running the callback on the kernel thread would save a thread switch.
setAllocatedZoneCallback(dataVIO, prepareForDedupe, THIS_LOCATION(NULL));
if (vioRequiresFlushAfter(allocatingVIOAsVIO(allocatingVIO))) {
invokeCallback(dataVIOAsCompletion(dataVIO));
return;
}
acknowledgeWrite(dataVIO);
}
/**
* Continue the write path for a VIO now that block map slot resolution is
* complete. This callback is registered in launchWriteDataVIO().
*
* @param completion The DataVIO to write
**/
static void continueWriteWithBlockMapSlot(VDOCompletion *completion)
{
DataVIO *dataVIO = asDataVIO(completion);
// We don't care what thread we're on.
if (abortOnError(completion->result, dataVIO, NOT_READ_ONLY)) {
return;
}
if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) {
int result = ASSERT(isTrimDataVIO(dataVIO),
"dataVIO with no block map page is a trim");
if (abortOnError(result, dataVIO, READ_ONLY)) {
return;
}
// This is a trim for a block on a block map page which has not been
// allocated, so there's nothing more we need to do.
finishDataVIO(dataVIO, VDO_SUCCESS);
return;
}
if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) {
// We don't need to write any data, so skip allocation and just update
// the block map and reference counts (via the journal).
dataVIO->newMapped.pbn = ZERO_BLOCK;
launchJournalCallback(dataVIO, finishBlockWrite,
THIS_LOCATION("$F;cb=finishWrite"));
return;
}
allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO),
getAllocationSelector(dataVIO->logical.zone),
VIO_WRITE_LOCK, continueWriteAfterAllocation);
}
/**********************************************************************/
void launchWriteDataVIO(DataVIO *dataVIO)
{
if (isReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier)) {
finishDataVIO(dataVIO, VDO_READ_ONLY);
return;
}
// Write requests join the current flush generation.
int result = acquireFlushGenerationLock(dataVIO);
if (abortOnError(result, dataVIO, NOT_READ_ONLY)) {
return;
}
// Go find the block map slot for the LBN mapping.
dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT;
findBlockMapSlotAsync(dataVIO, continueWriteWithBlockMapSlot,
getLogicalZoneThreadID(dataVIO->logical.zone));
}
/**********************************************************************/
void cleanupWriteDataVIO(DataVIO *dataVIO)
{
performCleanupStage(dataVIO, VIO_CLEANUP_START);
}