/*
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.c#9 $
 */

/*
 * This file contains almost all of the VDO write path, which begins with
 * writeExtent(). The progression through the callbacks which make up the
 * write path depends upon whether the write policy is synchronous or
 * asynchronous. The paths would proceed as outlined in the pseudo-code here
 * if this were normal, synchronous code without callbacks. Complications
 * involved in waiting on locks are not included.
 *
 * ######################################################################
 * writeExtentSynchronous(extent)
 * {
 *   foreach (vio in extent) {
 *     launchWriteVIO()
 *     # allocateBlockForWrite()
 *     if (!trim and !zero-block) {
 *       allocate block
 *       if (vio is compressed) {
 *         completeCompressedBlockWrite()
 *         finishVIO()
 *         return
 *       }
 *       writeBlock()
 *     }
 *     finishBlockWrite()
 *     addJournalEntry() # Increment
 *     if (vio->newMapped is not ZERO_BLOCK) {
 *       journalIncrementForWrite()
 *     }
 *     acknowledgeWriteCallback()
 *     readOldBlockMapping()
 *     journalUnmappingForWrite()
 *     if (vio->mapped is not ZERO_BLOCK) {
 *       journalDecrementForWrite()
 *     }
 *     updateBlockMapForWrite()
 *     if (trim || zero-block) {
 *       finishVIO()
 *       return
 *     }
 *
 *     prepareForDedupe()
 *     hashData()
 *     resolveHashZone()
 *     acquireHashLock()
 *     attemptDedupe() (query albireo)
 *     if (isDuplicate) {
 *       verifyAdvice() (read verify)
 *       if (isDuplicate and canAddReference) {
 *         shareBlock()
 *         addJournalEntryForDedupe()
 *         incrementForDedupe()
 *         journalUnmappingForDedupe()
 *         if (vio->mapped is not ZERO_BLOCK) {
 *           decrementForDedupe()
 *         }
 *         updateBlockMapForDedupe()
 *         finishVIO()
 *         return
 *       }
 *     }
 *
 *     if (not canAddReference) {
 *       layer->updateAlbireo()
 *     }
 *     # compressData()
 *     if (compressing and not mooted and has no waiters) {
 *       layer->compressVIO()
 *       packCompressedData()
 *       if (compressed) {
 *         journalCompressedBlocks()
 *         incrementForDedupe()
 *         readOldBlockMappingForDedupe()
 *         journalUnmappingForDedupe()
 *         if (vio->mapped is not ZERO_BLOCK) {
 *           decrementForDedupe()
 *         }
 *         updateBlockMapForDedupe()
 *       }
 *     }
 *
 *     finishVIO()
 *   }
 * }
 *
 * ######################################################################
 * writeExtentAsynchronous(extent)
 * {
 *   foreach (vio in extent) {
 *     launchWriteVIO()
 *     # allocateBlockForWrite()
 *     if (trim || zero-block) {
 *       acknowledgeWrite()
 *     } else {
 *       allocateAndLockBlock()
 *       if (vio is compressed) {
 *         writeBlock()
 *         completeCompressedBlockWrite()
 *         finishVIO()
 *         return
 *       }
 *
 *       acknowledgeWrite()
 *       prepareForDedupe()
 *       hashData()
 *       resolveHashZone()
 *       acquireHashLock()
 *       attemptDedupe() (query albireo)
 *       if (isDuplicate) {
 *         verifyAdvice() (read verify)
 *         if (isDuplicate and canAddReference) {
 *           shareBlock()
 *           addJournalEntryForDedupe()
 *           incrementForDedupe()
 *           readOldBlockMappingForDedupe()
 *           journalUnmappingForDedupe()
 *           if (vio->mapped is not ZERO_BLOCK) {
 *             decrementForDedupe()
 *           }
 *           updateBlockMapForDedupe()
 *           finishVIO()
 *           return
 *         }
 *       }
 *
 *       if (not canAddReference) {
 *         layer->updateAlbireo()
 *       }
 *       # compressData()
 *       if (compressing and not mooted and has no waiters) {
 *         layer->compressVIO()
 *         packCompressedData()
 *         if (compressed) {
 *           journalCompressedBlocks()
 *           journalIncrementForDedupe()
 *           readOldBlockMappingForDedupe()
 *           journalUnmappingForDedupe()
 *           if (vio->mapped is not ZERO_BLOCK) {
 *             decrementForDedupe()
 *           }
 *           updateBlockMapForDedupe()
 *           finishVIO()
 *           return
 *         }
 *       }
 *
 *       writeBlock()
 *     }
 *
 *     finishBlockWrite()
 *     addJournalEntry() # Increment
 *     if (vio->newMapped is not ZERO_BLOCK) {
 *       journalIncrementForWrite()
 *     }
 *     readOldBlockMappingForWrite()
 *     journalUnmappingForWrite()
 *     if (vio->mapped is not ZERO_BLOCK) {
 *       journalDecrementForWrite()
 *     }
 *     updateBlockMapForWrite()
 *     finishVIO()
 *   }
 * }
 */
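/*
 * A note on the idiom used throughout this file (an illustrative sketch,
 * not part of the build): each step of the write path runs on a specific
 * zone's thread. A typical step first checks for errors, then registers
 * the next step as a callback on the zone where it must run, records
 * which asynchronous operation is now in flight, and finally launches
 * that operation, e.g.:
 *
 *   if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
 *     return;
 *   }
 *   setJournalCallback(dataVIO, nextStep, THIS_LOCATION(NULL));
 *   dataVIO->lastAsyncOperation = SOME_OPERATION;
 *   launchOperation(dataVIO);
 *
 * Here nextStep, SOME_OPERATION, and launchOperation() are placeholders;
 * the concrete steps appear below.
 */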
#include "vioWrite.h"

#include "logger.h"

#include "allocatingVIO.h"
#include "atomic.h"
#include "blockMap.h"
#include "compressionState.h"
#include "dataVIO.h"
#include "hashLock.h"
#include "recoveryJournal.h"
#include "referenceOperation.h"
#include "slab.h"
#include "slabDepot.h"
#include "slabJournal.h"
#include "vdoInternal.h"
#include "vioRead.h"

/**
 * The steps taken cleaning up a VIO, in the order they are performed.
 **/
typedef enum dataVIOCleanupStage {
  VIO_CLEANUP_START = 0,
  VIO_RELEASE_ALLOCATED = VIO_CLEANUP_START,
  VIO_RELEASE_RECOVERY_LOCKS,
  VIO_RELEASE_HASH_LOCK,
  VIO_RELEASE_LOGICAL,
  VIO_CLEANUP_DONE
} DataVIOCleanupStage;

/**
 * Actions to take on error used by abortOnError().
 **/
typedef enum {
  NOT_READ_ONLY,
  READ_ONLY_IF_ASYNC,
  READ_ONLY,
} ReadOnlyAction;

// Forward declarations required because of circular function references.
static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage);
static void writeBlock(DataVIO *dataVIO);

/**
 * Check whether we are in async mode.
 *
 * @param dataVIO  A DataVIO containing a pointer to the VDO whose write
 *                 policy we want to check
 *
 * @return true if we are in async mode
 **/
static inline bool isAsync(DataVIO *dataVIO)
{
  return (getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC);
}

/**
 * Release the PBN lock and/or the reference on the allocated block at the
 * end of processing a DataVIO.
 *
 * @param completion  The DataVIO
 **/
static void releaseAllocatedLock(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInAllocatedZone(dataVIO);
  releaseAllocationLock(dataVIOAsAllocatingVIO(dataVIO));
  performCleanupStage(dataVIO, VIO_RELEASE_RECOVERY_LOCKS);
}

/**
 * Release the logical block lock and flush generation lock at the end of
 * processing a DataVIO.
 *
 * @param completion  The DataVIO
 **/
static void releaseLogicalLock(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInLogicalZone(dataVIO);
  releaseLogicalBlockLock(dataVIO);
  releaseFlushGenerationLock(dataVIO);
  performCleanupStage(dataVIO, VIO_CLEANUP_DONE);
}

/**
 * Release the hash lock at the end of processing a DataVIO.
 *
 * @param completion  The DataVIO
 **/
static void cleanHashLock(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInHashZone(dataVIO);
  releaseHashLock(dataVIO);
  performCleanupStage(dataVIO, VIO_RELEASE_LOGICAL);
}
/**
 * Make some assertions about a DataVIO which has finished cleaning up
 * and do its final callback.
 *
 * @param dataVIO  The DataVIO which has finished cleaning up
 **/
static void finishCleanup(DataVIO *dataVIO)
{
  ASSERT_LOG_ONLY(dataVIOAsAllocatingVIO(dataVIO)->allocationLock == NULL,
                  "complete DataVIO has no allocation lock");
  ASSERT_LOG_ONLY(dataVIO->hashLock == NULL,
                  "complete DataVIO has no hash lock");
  vioDoneCallback(dataVIOAsCompletion(dataVIO));
}

/**
 * Perform the next step in the process of cleaning up a DataVIO.
 *
 * @param dataVIO  The DataVIO to clean up
 * @param stage    The cleanup stage to perform
 **/
static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage)
{
  switch (stage) {
  case VIO_RELEASE_ALLOCATED:
    if (hasAllocation(dataVIO)) {
      launchAllocatedZoneCallback(dataVIO, releaseAllocatedLock,
                                  THIS_LOCATION("$F;cb=releaseAllocLock"));
      return;
    }
    // fall through

  case VIO_RELEASE_RECOVERY_LOCKS:
    if ((dataVIO->recoverySequenceNumber > 0)
        && !isOrWillBeReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier)
        && (dataVIOAsCompletion(dataVIO)->result != VDO_READ_ONLY)) {
      logWarning("VDO not read-only when cleaning DataVIO with RJ lock");
    }
    // fall through

  case VIO_RELEASE_HASH_LOCK:
    if (dataVIO->hashLock != NULL) {
      launchHashZoneCallback(dataVIO, cleanHashLock,
                             THIS_LOCATION("$F;cb=cleanHashLock"));
      return;
    }
    // fall through

  case VIO_RELEASE_LOGICAL:
    if (!isCompressedWriteDataVIO(dataVIO)) {
      launchLogicalCallback(dataVIO, releaseLogicalLock,
                            THIS_LOCATION("$F;cb=releaseLL"));
      return;
    }
    // fall through

  default:
    finishCleanup(dataVIO);
  }
}

/**
 * Return a DataVIO that encountered an error to its hash lock so it can
 * update the hash lock state accordingly. This continuation is registered
 * in abortOnError(), and must be called in the hash zone of the DataVIO.
 *
 * @param completion  The completion of the DataVIO to return to its hash
 *                    lock
 **/
static void finishWriteDataVIOWithError(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInHashZone(dataVIO);
  continueHashLockOnError(dataVIO);
}

/**
 * Check whether a result is an error, and if so abort the DataVIO
 * associated with the error.
 *
 * @param result          The result to check
 * @param dataVIO         The DataVIO
 * @param readOnlyAction  The conditions under which the VDO should be put
 *                        into read-only mode if the result is an error
 *
 * @return true if the result is an error
 **/
static bool abortOnError(int            result,
                         DataVIO       *dataVIO,
                         ReadOnlyAction readOnlyAction)
{
  if (result == VDO_SUCCESS) {
    return false;
  }

  if ((result == VDO_READ_ONLY)
      || (readOnlyAction == READ_ONLY)
      || ((readOnlyAction == READ_ONLY_IF_ASYNC) && isAsync(dataVIO))) {
    ReadOnlyNotifier *notifier = dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier;
    if (!isReadOnly(notifier)) {
      if (result != VDO_READ_ONLY) {
        logErrorWithStringError(result,
                                "Preparing to enter read-only mode:"
                                " DataVIO for LBN %llu (becoming mapped"
                                " to %llu, previously mapped"
                                " to %llu, allocated %llu) is"
                                " completing with a fatal error after"
                                " operation %s",
                                dataVIO->logical.lbn, dataVIO->newMapped.pbn,
                                dataVIO->mapped.pbn,
                                getDataVIOAllocation(dataVIO),
                                getOperationName(dataVIO));
      }
      enterReadOnlyMode(notifier, result);
    }
  }

  if (dataVIO->hashLock != NULL) {
    launchHashZoneCallback(dataVIO, finishWriteDataVIOWithError,
                           THIS_LOCATION(NULL));
  } else {
    finishDataVIO(dataVIO, result);
  }
  return true;
}
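/*
 * The guard above appears at the top of nearly every callback in this file
 * (illustrative only):
 *
 *   if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
 *     return;
 *   }
 *
 * The ReadOnlyAction argument controls whether an error should force the
 * whole VDO into read-only mode: READ_ONLY always does, READ_ONLY_IF_ASYNC
 * does so only under an asynchronous write policy, and NOT_READ_ONLY never
 * does (a VDO_READ_ONLY result takes that branch regardless of the action).
 */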
/**
 * Return a DataVIO that finished writing, compressing, or deduplicating to
 * its hash lock so it can share the result with any DataVIOs waiting in
 * the hash lock, or update albireo, or simply release its share of the
 * lock. This continuation is registered in updateBlockMapForWrite(),
 * updateBlockMapForDedupe(), and abortDeduplication(), and must be called
 * in the hash zone of the DataVIO.
 *
 * @param completion  The completion of the DataVIO to return to its hash
 *                    lock
 **/
static void finishWriteDataVIO(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInHashZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
    return;
  }
  continueHashLock(dataVIO);
}

/**
 * Abort the data optimization process.
 *
 * @param dataVIO  The DataVIO which does not deduplicate or compress
 **/
static void abortDeduplication(DataVIO *dataVIO)
{
  if (!hasAllocation(dataVIO)) {
    // There was no space to write this block and we failed to deduplicate
    // or compress it.
    finishDataVIO(dataVIO, VDO_NO_SPACE);
    return;
  }

  if (isAsync(dataVIO)) {
    // We failed to deduplicate or compress an async DataVIO, so now we
    // need to actually write the data.
    writeBlock(dataVIO);
    return;
  }

  if (dataVIO->hashLock == NULL) {
    // We failed to compress a synchronous DataVIO that is a hash collision,
    // which means it can't dedupe or be used for dedupe, so it's done now.
    finishDataVIO(dataVIO, VDO_SUCCESS);
    return;
  }

  /*
   * This synchronous DataVIO failed to compress and so is finished, but
   * must now return to its hash lock so other DataVIOs with the same data
   * can deduplicate against the uncompressed block it wrote.
   */
  launchHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL));
}

/**
 * Update the block map now that we've added an entry in the recovery
 * journal for a block we have just shared. This is the callback registered
 * in decrementForDedupe().
 *
 * @param completion  The completion of the write in progress
 **/
static void updateBlockMapForDedupe(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInLogicalZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  if (dataVIO->hashLock != NULL) {
    setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL));
  } else {
    completion->callback = completeDataVIO;
  }
  dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK_FOR_DEDUPE;
  putMappedBlockAsync(dataVIO);
}

/**
 * Make a recovery journal increment.
 *
 * @param dataVIO  The DataVIO
 * @param lock     The PBNLock on the block being incremented
 **/
static void journalIncrement(DataVIO *dataVIO, PBNLock *lock)
{
  setUpReferenceOperationWithLock(DATA_INCREMENT, dataVIO->newMapped.pbn,
                                  dataVIO->newMapped.state, lock,
                                  &dataVIO->operation);
  addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal,
                          dataVIO);
}

/**
 * Make a recovery journal decrement entry.
 *
 * @param dataVIO  The DataVIO
 **/
static void journalDecrement(DataVIO *dataVIO)
{
  setUpReferenceOperationWithZone(DATA_DECREMENT, dataVIO->mapped.pbn,
                                  dataVIO->mapped.state, dataVIO->mapped.zone,
                                  &dataVIO->operation);
  addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal,
                          dataVIO);
}

/**
 * Make a reference count change.
 *
 * @param dataVIO  The DataVIO
 **/
static void updateReferenceCount(DataVIO *dataVIO)
{
  SlabDepot *depot = getVDOFromDataVIO(dataVIO)->depot;
  PhysicalBlockNumber pbn = dataVIO->operation.pbn;
  int result = ASSERT(isPhysicalDataBlock(depot, pbn),
                      "Adding slab journal entry for impossible PBN %" PRIu64
                      " for LBN %llu", pbn, dataVIO->logical.lbn);
  if (abortOnError(result, dataVIO, READ_ONLY)) {
    return;
  }

  addSlabJournalEntry(getSlabJournal(depot, pbn), dataVIO);
}
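/*
 * Reference count changes happen in two phases, as the three functions
 * above show: journalIncrement() and journalDecrement() first record the
 * intended change in the recovery journal, and the callback registered for
 * the journal entry then calls updateReferenceCount() to make the matching
 * slab journal entry. A representative pairing (sketch only, condensed
 * from journalUnmappingForDedupe() and decrementForDedupe() below):
 *
 *   setMappedZoneCallback(dataVIO, decrementForDedupe, ...);
 *   journalDecrement(dataVIO);        // recovery journal entry first
 *   // ... later, in decrementForDedupe():
 *   updateReferenceCount(dataVIO);    // then the slab journal entry
 */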
/**
 * Do the decref after a successful dedupe or compression. This is the
 * callback registered by journalUnmappingForDedupe().
 *
 * @param completion  The completion of the write in progress
 **/
static void decrementForDedupe(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInMappedZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO);
  if (allocatingVIO->allocation == dataVIO->mapped.pbn) {
    /*
     * If we are about to release the reference on the allocated block,
     * we must release the PBN lock on it first so that the allocator will
     * not allocate a write-locked block.
     */
    releaseAllocationLock(allocatingVIO);
  }

  setLogicalCallback(dataVIO, updateBlockMapForDedupe,
                     THIS_LOCATION("$F;js=dec"));
  dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_DEDUPE;
  updateReferenceCount(dataVIO);
}

/**
 * Write the appropriate journal entry for removing the mapping of logical
 * to mapped, for dedupe or compression. This is the callback registered in
 * readOldBlockMappingForDedupe().
 *
 * @param completion  The completion of the write in progress
 **/
static void journalUnmappingForDedupe(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInJournalZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  if (dataVIO->mapped.pbn == ZERO_BLOCK) {
    setLogicalCallback(dataVIO, updateBlockMapForDedupe,
                       THIS_LOCATION("$F;j=dedupe;js=unmap;cb=updateBM"));
  } else {
    setMappedZoneCallback(dataVIO, decrementForDedupe,
                          THIS_LOCATION("$F;j=dedupe;js=unmap;cb=decDedupe"));
  }
  dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_DEDUPE;
  journalDecrement(dataVIO);
}

/**
 * Get the previous PBN mapped to this LBN from the block map, so as to make
 * an appropriate journal entry referencing the removal of this LBN->PBN
 * mapping, for dedupe or compression. This callback is registered in
 * incrementForDedupe().
 *
 * @param completion  The completion of the write in progress
 **/
static void readOldBlockMappingForDedupe(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInLogicalZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_DEDUPE;
  setJournalCallback(dataVIO, journalUnmappingForDedupe,
                     THIS_LOCATION("$F;cb=journalUnmapDedupe"));
  getMappedBlockAsync(dataVIO);
}
/**
 * Do the incref after compression. This is the callback registered by
 * addRecoveryJournalEntryForCompression().
 *
 * @param completion  The completion of the write in progress
 **/
static void incrementForCompression(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInNewMappedZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state),
                  "Impossible attempt to update reference counts for a block "
                  "which was not compressed (logical block %llu)",
                  dataVIO->logical.lbn);

  /*
   * If we are synchronous and allocated a block, we know the one we
   * allocated is the block we need to decrement, so there is no need
   * to look in the block map.
   */
  if (isAsync(dataVIO) || !hasAllocation(dataVIO)) {
    setLogicalCallback(dataVIO, readOldBlockMappingForDedupe,
                       THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe"));
  } else {
    setJournalCallback(dataVIO, journalUnmappingForDedupe,
                       THIS_LOCATION("$F;cb=journalUnmappingForDedupe"));
  }
  dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_COMPRESSION;
  updateReferenceCount(dataVIO);
}

/**
 * Add a recovery journal entry for the increment resulting from
 * compression.
 *
 * @param completion  The DataVIO which has been compressed
 **/
static void addRecoveryJournalEntryForCompression(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInJournalZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
    return;
  }

  if (!isCompressed(dataVIO->newMapped.state)) {
    abortDeduplication(dataVIO);
    return;
  }

  setNewMappedZoneCallback(dataVIO, incrementForCompression,
                           THIS_LOCATION("$F($dup);js=map/$dup;"
                                         "cb=incCompress($dup)"));
  dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_COMPRESSION;
  journalIncrement(dataVIO, getDuplicateLock(dataVIO));
}

/**
 * Attempt to pack the compressed DataVIO into a block. This is the callback
 * registered in compressData().
 *
 * @param completion  The completion of a compressed DataVIO
 **/
static void packCompressedData(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInPackerZone(dataVIO);

  // XXX this is a callback, so there should probably be an error check here
  // even if we think compression can't currently return one.

  if (!mayPackDataVIO(dataVIO)) {
    abortDeduplication(dataVIO);
    return;
  }

  setJournalCallback(dataVIO, addRecoveryJournalEntryForCompression,
                     THIS_LOCATION("$F;cb=update(compress)"));
  dataVIO->lastAsyncOperation = PACK_COMPRESSED_BLOCK;
  attemptPacking(dataVIO);
}

/**********************************************************************/
void compressData(DataVIO *dataVIO)
{
  ASSERT_LOG_ONLY(!dataVIO->isDuplicate,
                  "compressing a non-duplicate block");
  if (!mayCompressDataVIO(dataVIO)) {
    abortDeduplication(dataVIO);
    return;
  }

  dataVIO->lastAsyncOperation = COMPRESS_DATA;
  setPackerCallback(dataVIO, packCompressedData, THIS_LOCATION("$F;cb=pack"));
  dataVIOAsCompletion(dataVIO)->layer->compressDataVIO(dataVIO);
}

/**
 * Do the incref after deduplication. This is the callback registered by
 * addRecoveryJournalEntryForDedupe().
 *
 * @param completion  The completion of the write in progress
 **/
static void incrementForDedupe(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInNewMappedZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  ASSERT_LOG_ONLY(dataVIO->isDuplicate,
                  "Impossible attempt to update reference counts for a block "
                  "which was not a duplicate (logical block %llu)",
                  dataVIO->logical.lbn);

  /*
   * If we are synchronous and allocated a block, we know the one we
   * allocated is the block we need to decrement, so there is no need
   * to look in the block map.
   */
  if (isAsync(dataVIO) || !hasAllocation(dataVIO)) {
    setLogicalCallback(dataVIO, readOldBlockMappingForDedupe,
                       THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe"));
  } else {
    setJournalCallback(dataVIO, journalUnmappingForDedupe,
                       THIS_LOCATION("$F;cb=journalUnmappingForDedupe"));
  }
  dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_DEDUPE;
  updateReferenceCount(dataVIO);
}
/**
 * Add a recovery journal entry for the increment resulting from
 * deduplication. This callback is registered in shareBlock().
 *
 * @param completion  The DataVIO which has been deduplicated
 **/
static void addRecoveryJournalEntryForDedupe(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInJournalZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
    return;
  }

  setNewMappedZoneCallback(dataVIO, incrementForDedupe,
                           THIS_LOCATION("$F($dup);js=map/$dup;"
                                         "cb=incDedupe($dup)"));
  dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_DEDUPE;
  journalIncrement(dataVIO, getDuplicateLock(dataVIO));
}

/**
 * Share a block in the block map if it is a duplicate. This is the lock
 * callback registered in acquirePBNReadLock(). This is only public so
 * test code can compare the function to the current callback in a
 * completion.
 *
 * @param completion  The completion of the write in progress
 **/
void shareBlock(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInDuplicateZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
    return;
  }

  if (!dataVIO->isDuplicate) {
    compressData(dataVIO);
    return;
  }

  dataVIO->newMapped = dataVIO->duplicate;
  launchJournalCallback(dataVIO, addRecoveryJournalEntryForDedupe,
                        THIS_LOCATION("$F;cb=addJournalEntryDup"));
}

/**
 * Route the DataVIO to the HashZone responsible for the chunk name to
 * acquire a hash lock on that name, or join with an existing hash lock
 * managing concurrent dedupe for that name. This is the callback registered
 * in resolveHashZone().
 *
 * @param completion  The DataVIO to lock
 **/
static void lockHashInZone(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInHashZone(dataVIO);
  // Shouldn't have had any errors since all we did was switch threads.
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  int result = acquireHashLock(dataVIO);
  if (abortOnError(result, dataVIO, READ_ONLY)) {
    return;
  }

  if (dataVIO->hashLock == NULL) {
    // It's extremely unlikely, but in the case of a hash collision, the
    // DataVIO will not obtain a reference to the lock and cannot
    // deduplicate.
    compressData(dataVIO);
    return;
  }

  enterHashLock(dataVIO);
}

/**
 * Set the hash zone (and flag the chunk name as set) while still on the
 * thread that just hashed the data to set the chunk name. This is the
 * callback registered by prepareForDedupe().
 *
 * @param completion  The DataVIO whose chunk name was just generated, as a
 *                    completion
 **/
static void resolveHashZone(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  // We don't care what thread we are on.
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, "zero blocks should not be hashed");

  dataVIO->hashZone
    = selectHashZone(getVDOFromDataVIO(dataVIO), &dataVIO->chunkName);
  dataVIO->lastAsyncOperation = ACQUIRE_HASH_LOCK;
  launchHashZoneCallback(dataVIO, lockHashInZone, THIS_LOCATION(NULL));
}
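/*
 * Dedupe setup in brief, as spread across lockHashInZone() and
 * resolveHashZone() above and prepareForDedupe() below (sketch only):
 * hash the block data to produce the chunk name, route by that name to a
 * HashZone, and acquire (or join) the hash lock which manages all
 * concurrent DataVIOs carrying the same data:
 *
 *   completion->layer->hashData(dataVIO);       // compute the chunk name
 *   selectHashZone(vdo, &dataVIO->chunkName);   // route by name
 *   acquireHashLock(dataVIO);                   // join or create the lock
 *
 * Only a hash collision (acquireHashLock() succeeding but leaving
 * dataVIO->hashLock NULL) bypasses the lock and goes straight to
 * compressData().
 */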
/**
 * Prepare for the dedupe path after a synchronous write or an asynchronous
 * allocation. This callback is registered in updateBlockMapForWrite() for
 * sync, and continueWriteAfterAllocation() (via acknowledgeWrite()) for
 * async. It is also called directly from the latter when allocation fails.
 *
 * @param completion  The completion of the write in progress
 **/
static void prepareForDedupe(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  // We don't care what thread we are on.
  dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL));
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  if (!isAsync(dataVIO)) {
    // Remember which block we wrote so we will decrement the reference to
    // it if we deduplicate. This avoids having to look it up in the block
    // map.
    dataVIO->mapped = dataVIO->newMapped;
  }

  ASSERT_LOG_ONLY(!dataVIO->isZeroBlock,
                  "must not prepare to dedupe zero blocks");

  // Before we can dedupe, we need to know the chunk name, so the first
  // step is to hash the block data.
  dataVIO->lastAsyncOperation = HASH_DATA;
  // XXX this is the wrong thread to run this callback, but we don't yet
  // have a mechanism for running it on the CPU thread immediately after
  // hashing.
  setAllocatedZoneCallback(dataVIO, resolveHashZone, THIS_LOCATION(NULL));
  completion->layer->hashData(dataVIO);
}

/**
 * Update the block map after a data write (or directly for a ZERO_BLOCK
 * write or trim). This callback is registered in decrementForWrite() and
 * journalUnmappingForWrite().
 *
 * @param completion  The completion of the write in progress
 **/
static void updateBlockMapForWrite(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInLogicalZone(dataVIO);
  dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL));
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) {
    completion->callback = completeDataVIO;
  } else if (!isAsync(dataVIO)) {
    // Synchronous DataVIOs branch off to the hash/dedupe path after
    // finishing the uncompressed write of their data.
    completion->callback = prepareForDedupe;
  } else if (dataVIO->hashLock != NULL) {
    // Async writes will be finished, but must return to the hash lock to
    // allow other DataVIOs with the same data to dedupe against the write.
    setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL));
  } else {
    // Async writes without a hash lock (hash collisions) will be finished.
    completion->callback = completeDataVIO;
  }
  dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK;
  putMappedBlockAsync(dataVIO);
}
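/*
 * Recap: updateBlockMapForWrite() above is the branch point for four
 * outcomes. Trims and zero-block writes finish once the block map is
 * updated; synchronous writes continue to prepareForDedupe(); asynchronous
 * writes holding a hash lock return to it via finishWriteDataVIO(); and
 * asynchronous hash-collision writes (no hash lock) simply finish.
 */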
/**
 * Do the decref after a successful block write. This is the callback
 * registered by journalUnmappingForWrite() if the old mapping was not the
 * zero block.
 *
 * @param completion  The completion of the write in progress
 **/
static void decrementForWrite(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInMappedZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_WRITE;
  setLogicalCallback(dataVIO, updateBlockMapForWrite, THIS_LOCATION(NULL));
  updateReferenceCount(dataVIO);
}

/**
 * Write the appropriate journal entry for unmapping logical to mapped for
 * a write. This is the callback registered in readOldBlockMappingForWrite().
 *
 * @param completion  The completion of the write in progress
 **/
static void journalUnmappingForWrite(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInJournalZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  if (dataVIO->mapped.pbn == ZERO_BLOCK) {
    setLogicalCallback(dataVIO, updateBlockMapForWrite,
                       THIS_LOCATION("$F;js=unmap;cb=updateBMwrite"));
  } else {
    setMappedZoneCallback(dataVIO, decrementForWrite,
                          THIS_LOCATION("$F;js=unmap;cb=decWrite"));
  }
  dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_WRITE;
  journalDecrement(dataVIO);
}

/**
 * Get the previous PBN mapped to this LBN from the block map for a write,
 * so as to make an appropriate journal entry referencing the removal of
 * this LBN->PBN mapping. This callback is registered in finishBlockWrite()
 * in the async path, and is registered in acknowledgeWrite() in the sync
 * path.
 *
 * @param completion  The completion of the write in progress
 **/
static void readOldBlockMappingForWrite(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInLogicalZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  setJournalCallback(dataVIO, journalUnmappingForWrite,
                     THIS_LOCATION("$F;cb=journalUnmapWrite"));
  dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_WRITE;
  getMappedBlockAsync(dataVIO);
}

/**
 * Acknowledge a write to the requestor.
 *
 * @param dataVIO  The DataVIO being acknowledged
 **/
static void acknowledgeWrite(DataVIO *dataVIO)
{
  ASSERT_LOG_ONLY(dataVIO->hasFlushGenerationLock,
                  "write VIO to be acknowledged has a flush generation lock");
  dataVIO->lastAsyncOperation = ACKNOWLEDGE_WRITE;
  dataVIOAsCompletion(dataVIO)->layer->acknowledgeDataVIO(dataVIO);
}

/**
 * Acknowledge a write now that we have made an entry in the recovery
 * journal. This is the callback registered in finishBlockWrite() in
 * synchronous mode.
 *
 * @param completion  The completion of the write in progress
 **/
static void acknowledgeWriteCallback(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  if (abortOnError(completion->result, dataVIO, READ_ONLY)) {
    return;
  }

  setLogicalCallback(dataVIO, readOldBlockMappingForWrite,
                     THIS_LOCATION(NULL));
  acknowledgeWrite(dataVIO);
}

/**********************************************************************/
static VDOAction *getWriteIncrementCallback(DataVIO *dataVIO)
{
  return (isAsync(dataVIO)
          ? readOldBlockMappingForWrite : acknowledgeWriteCallback);
}

/**
 * Do the incref after a successful block write. This is the callback
 * registered by finishBlockWrite().
 *
 * @param completion  The completion of the write in progress
 **/
static void incrementForWrite(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInAllocatedZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
    return;
  }

  /*
   * Now that the data has been written, it's safe to deduplicate against
   * the block. Downgrade the allocation lock to a read lock so it can be
   * used later by the hash lock (which we don't have yet in sync mode).
   */
  downgradePBNWriteLock(dataVIOAsAllocatingVIO(dataVIO)->allocationLock);

  dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_WRITE;
  setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO),
                     THIS_LOCATION(NULL));
  updateReferenceCount(dataVIO);
}
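/*
 * The sync and async write paths diverge after the journal increment, as
 * getWriteIncrementCallback() above shows (sketch): a synchronous write
 * must acknowledge the requestor before reading the old block mapping,
 * while an asynchronous write was already acknowledged at allocation time
 * and proceeds straight to readOldBlockMappingForWrite(). This mirrors the
 * two pseudo-code outlines at the top of this file.
 */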
/**
 * Add an entry in the recovery journal after a successful block write.
 * This is the callback registered by writeBlock(). It is also registered
 * in allocateBlockForWrite().
 *
 * @param completion  The completion of the write in progress
 **/
static void finishBlockWrite(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  assertInJournalZone(dataVIO);
  if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) {
    return;
  }

  if (dataVIO->newMapped.pbn == ZERO_BLOCK) {
    setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO),
                       THIS_LOCATION("$F;js=writeZero"));
  } else {
    setAllocatedZoneCallback(dataVIO, incrementForWrite,
                             THIS_LOCATION("$F;js=mapWrite"));
  }
  dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_WRITE;
  journalIncrement(dataVIO, dataVIOAsAllocatingVIO(dataVIO)->allocationLock);
}

/**
 * Write data to the underlying storage.
 *
 * @param dataVIO  The DataVIO to write
 **/
static void writeBlock(DataVIO *dataVIO)
{
  dataVIO->lastAsyncOperation = WRITE_DATA;
  setJournalCallback(dataVIO, finishBlockWrite,
                     THIS_LOCATION("$F(data);cb=finishWrite"));
  dataVIOAsCompletion(dataVIO)->layer->writeData(dataVIO);
}

/**
 * Continue the write path for a DataVIO now that block allocation is
 * complete (the DataVIO may or may not have actually received an
 * allocation). This callback is registered in
 * continueWriteWithBlockMapSlot().
 *
 * @param allocatingVIO  The DataVIO which has finished the allocation
 *                       process (as an AllocatingVIO)
 **/
static void continueWriteAfterAllocation(AllocatingVIO *allocatingVIO)
{
  DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO);
  if (abortOnError(dataVIOAsCompletion(dataVIO)->result, dataVIO,
                   NOT_READ_ONLY)) {
    return;
  }

  if (!hasAllocation(dataVIO)) {
    prepareForDedupe(dataVIOAsCompletion(dataVIO));
    return;
  }

  atomicStoreBool(&dataVIO->hasAllocation, true);
  dataVIO->newMapped = (ZonedPBN) {
    .zone  = allocatingVIO->zone,
    .pbn   = allocatingVIO->allocation,
    .state = MAPPING_STATE_UNCOMPRESSED,
  };

  if (!isAsync(dataVIO)) {
    writeBlock(dataVIO);
    return;
  }

  // XXX prepareForDedupe can run from any thread, so this is a place where
  // running the callback on the kernel thread would save a thread switch.
  setAllocatedZoneCallback(dataVIO, prepareForDedupe, THIS_LOCATION(NULL));
  if (vioRequiresFlushAfter(allocatingVIOAsVIO(allocatingVIO))) {
    invokeCallback(dataVIOAsCompletion(dataVIO));
    return;
  }
  acknowledgeWrite(dataVIO);
}
/**
 * Continue the write path for a VIO now that block map slot resolution is
 * complete. This callback is registered in launchWriteDataVIO().
 *
 * @param completion  The DataVIO to write
 **/
static void continueWriteWithBlockMapSlot(VDOCompletion *completion)
{
  DataVIO *dataVIO = asDataVIO(completion);
  // We don't care what thread we're on.
  if (abortOnError(completion->result, dataVIO, NOT_READ_ONLY)) {
    return;
  }

  if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) {
    int result = ASSERT(isTrimDataVIO(dataVIO),
                        "dataVIO with no block map page is a trim");
    if (abortOnError(result, dataVIO, READ_ONLY)) {
      return;
    }

    // This is a trim for a block on a block map page which has not been
    // allocated, so there's nothing more we need to do.
    finishDataVIO(dataVIO, VDO_SUCCESS);
    return;
  }

  if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) {
    // We don't need to write any data, so skip allocation and just update
    // the block map and reference counts (via the journal).
    dataVIO->newMapped.pbn = ZERO_BLOCK;
    launchJournalCallback(dataVIO, finishBlockWrite,
                          THIS_LOCATION("$F;cb=finishWrite"));
    return;
  }

  allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO),
                    getAllocationSelector(dataVIO->logical.zone),
                    VIO_WRITE_LOCK, continueWriteAfterAllocation);
}

/**********************************************************************/
void launchWriteDataVIO(DataVIO *dataVIO)
{
  if (isReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier)) {
    finishDataVIO(dataVIO, VDO_READ_ONLY);
    return;
  }

  // Write requests join the current flush generation.
  int result = acquireFlushGenerationLock(dataVIO);
  if (abortOnError(result, dataVIO, NOT_READ_ONLY)) {
    return;
  }

  // Go find the block map slot for the LBN mapping.
  dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT;
  findBlockMapSlotAsync(dataVIO, continueWriteWithBlockMapSlot,
                        getLogicalZoneThreadID(dataVIO->logical.zone));
}

/**********************************************************************/
void cleanupWriteDataVIO(DataVIO *dataVIO)
{
  performCleanupStage(dataVIO, VIO_CLEANUP_START);
}