/*
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.h#18 $
 */

#ifndef KERNELLAYER_H
#define KERNELLAYER_H

#include <linux/device-mapper.h>

#include "atomic.h"
#include "constants.h"
#include "flush.h"
#include "intMap.h"
#include "physicalLayer.h"
#include "ringNode.h"
#include "volumeGeometry.h"
#include "waitQueue.h"

#include "batchProcessor.h"
#include "bufferPool.h"
#include "deadlockQueue.h"
#include "deviceConfig.h"
#include "histogram.h"
#include "kernelStatistics.h"
#include "kernelTypes.h"
#include "kernelVDO.h"
#include "ktrace.h"
#include "limiter.h"
#include "statistics.h"
#include "workQueue.h"

enum {
  VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT)
};

typedef enum {
  LAYER_SIMPLE_THINGS_INITIALIZED,
  LAYER_BUFFER_POOLS_INITIALIZED,
  LAYER_REQUEST_QUEUE_INITIALIZED,
  LAYER_CPU_QUEUE_INITIALIZED,
  LAYER_BIO_ACK_QUEUE_INITIALIZED,
  LAYER_BIO_DATA_INITIALIZED,
  LAYER_STARTING,
  LAYER_RUNNING,
  LAYER_SUSPENDED,
  LAYER_STOPPING,
  LAYER_STOPPED,
  LAYER_RESUMING,
} KernelLayerState;

/* Keep BIO statistics atomically */
typedef struct atomicBioStats {
  atomic64_t read;    // Number of not REQ_WRITE bios
  atomic64_t write;   // Number of REQ_WRITE bios
  atomic64_t discard; // Number of REQ_DISCARD bios
  atomic64_t flush;   // Number of REQ_FLUSH bios
  atomic64_t fua;     // Number of REQ_FUA bios
} AtomicBioStats;

// Data managing the reporting of Albireo timeouts
typedef struct periodicEventReporter {
  uint64_t     lastReportedValue;
  const char  *format;
  atomic64_t   value;
  Jiffies      reportingInterval; // jiffies
  /*
   * Just an approximation. If nonzero, then either the work item has
   * been queued to run, or some other thread currently has
   * responsibility for enqueueing it, or the reporter function is
   * running but hasn't looked at the current value yet.
   *
   * If this is set, don't set the timer again, because we don't want
   * the work item queued twice. Use an atomic xchg or cmpxchg to
   * test-and-set it, and an atomic store to clear it.
   */
  atomic_t     workItemQueued;
  KvdoWorkItem workItem;
  KernelLayer *layer;
} PeriodicEventReporter;

static inline uint64_t getEventCount(PeriodicEventReporter *reporter)
{
  return atomic64_read(&reporter->value);
}
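
/*
 * Illustrative sketch, not part of the original interface: the
 * test-and-set protocol described for workItemQueued above. The
 * function name addToEventCount() and the queue argument are
 * assumptions for the example; only the thread that wins the
 * cmpxchg race may enqueue the work item.
 */
static inline void addToEventCount(PeriodicEventReporter *reporter,
                                   KvdoWorkQueue         *queue,
                                   unsigned int           count)
{
  atomic64_add(count, &reporter->value);
  if (atomic_cmpxchg(&reporter->workItemQueued, 0, 1) == 0) {
    // We won the race, so we alone are responsible for the enqueue;
    // the work item can never be queued twice.
    enqueueWorkQueue(queue, &reporter->workItem);
  }
  // The reporter function clears the flag with an atomic store once it
  // has read the current value: atomic_set(&reporter->workItemQueued, 0);
}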

/**
 * The VDO representation of the target device
 **/
struct kernelLayer {
  PhysicalLayer          common;
  // Layer specific info
  DeviceConfig          *deviceConfig;
  /** A ring of all DeviceConfigs referencing this layer */
  RingNode               deviceConfigRing;
  char                   threadNamePrefix[MAX_QUEUE_NAME_LEN];
  struct kobject         kobj;
  struct kobject         wqDirectory;
  struct kobject         statsDirectory;
  /**
   * A counter value to attach to thread names and log messages to
   * identify the individual device.
   **/
  unsigned int           instance;
  /** Contains the current KernelLayerState, which rarely changes */
  Atomic32               state;
  bool                   noFlushSuspend;
  bool                   allocationsAllowed;
  AtomicBool             processingMessage;
  /** Limit the number of requests that are being processed. */
  Limiter                requestLimiter;
  Limiter                discardLimiter;
  KVDO                   kvdo;
  /** Incoming bios we've had to buffer to avoid deadlock. */
  DeadlockQueue          deadlockQueue;
  // for REQ_FLUSH processing
  struct bio_list        waitingFlushes;
  KVDOFlush             *spareKVDOFlush;
  spinlock_t             flushLock;
  Jiffies                flushArrivalTime;
  /**
   * Bio submission manager used for sending bios to the storage
   * device.
   **/
  IOSubmitter           *ioSubmitter;
  /**
   * Work queue (possibly with multiple threads) for miscellaneous
   * CPU-intensive, non-blocking work.
   **/
  KvdoWorkQueue         *cpuQueue;
  /** N blobs of context data for LZ4 code, one per CPU thread. */
  char                 **compressionContext;
  Atomic32               compressionContextIndex;
  /** Optional work queue for calling bio_endio. */
  KvdoWorkQueue         *bioAckQueue;
  /** Underlying block device info. */
  uint64_t               startingSectorOffset;
  VolumeGeometry         geometry;
  // Memory allocation
  BufferPool            *dataKVIOPool;
  struct bio_set        *bioset;
  // Albireo specific info
  DedupeIndex           *dedupeIndex;
  // Statistics
  atomic64_t             biosSubmitted;
  atomic64_t             biosCompleted;
  atomic64_t             dedupeContextBusy;
  atomic64_t             flushOut;
  AtomicBioStats         biosIn;
  AtomicBioStats         biosInPartial;
  AtomicBioStats         biosOut;
  AtomicBioStats         biosOutCompleted;
  AtomicBioStats         biosAcknowledged;
  AtomicBioStats         biosAcknowledgedPartial;
  AtomicBioStats         biosMeta;
  AtomicBioStats         biosMetaCompleted;
  AtomicBioStats         biosJournal;
  AtomicBioStats         biosPageCache;
  AtomicBioStats         biosJournalCompleted;
  AtomicBioStats         biosPageCacheCompleted;
  // for reporting Albireo timeouts
  PeriodicEventReporter  albireoTimeoutReporter;
  // Debugging
  /* Whether to dump VDO state on shutdown */
  bool                   dumpOnShutdown;
  /**
   * Whether we should collect tracing info. (Actually, this controls
   * allocations; non-null record pointers cause recording.)
   **/
  bool                   vioTraceRecording;
  SampleCounter          traceSampleCounter;
  /* Should we log tracing info? */
  bool                   traceLogging;
  /* Storage for trace data. */
  BufferPool            *traceBufferPool;
  /* Private storage for procfs. */
  void                  *procfsPrivate;
  /* For returning batches of DataKVIOs to their pool */
  BatchProcessor        *dataKVIOReleaser;
  // Administrative operations
  /* The object used to wait for administrative operations to complete */
  struct completion      callbackSync;
  // Statistics reporting
  /* Protects the *statsStorage structs */
  struct mutex           statsMutex;
  /* Used when shutting down the sysfs statistics */
  struct completion      statsShutdown;
  /* true if sysfs statistics directory is set up */
  bool                   statsAdded;
  /* Used to gather statistics without allocating memory */
  VDOStatistics          vdoStatsStorage;
  KernelStatistics       kernelStatsStorage;
};

typedef enum bioQAction {
  BIO_Q_ACTION_COMPRESSED_DATA,
  BIO_Q_ACTION_DATA,
  BIO_Q_ACTION_FLUSH,
  BIO_Q_ACTION_HIGH,
  BIO_Q_ACTION_METADATA,
  BIO_Q_ACTION_READCACHE,
  BIO_Q_ACTION_VERIFY
} BioQAction;

typedef enum cpuQAction {
  CPU_Q_ACTION_COMPLETE_KVIO,
  CPU_Q_ACTION_COMPRESS_BLOCK,
  CPU_Q_ACTION_EVENT_REPORTER,
  CPU_Q_ACTION_HASH_BLOCK,
} CPUQAction;

typedef enum bioAckQAction {
  BIO_ACK_Q_ACTION_ACK,
} BioAckQAction;

typedef void (*DedupeShutdownCallbackFunction)(KernelLayer *layer);

/*
 * Wrapper for the Enqueueable object, to associate it with a kernel
 * layer work item.
 */
typedef struct kvdoEnqueueable {
  KvdoWorkItem workItem;
  Enqueueable  enqueueable;
} KvdoEnqueueable;
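
/*
 * Illustrative sketch, not part of the original interface: a work-queue
 * callback can recover the KvdoEnqueueable wrapper from its embedded
 * work item with container_of(), the same pattern asKernelLayer() uses
 * below. The helper name is hypothetical.
 */
static inline KvdoEnqueueable *workItemAsKvdoEnqueueable(KvdoWorkItem *item)
{
  return container_of(item, KvdoEnqueueable, workItem);
}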

/**
 * Implements LayerFilter.
 **/
bool layerIsNamed(KernelLayer *layer, void *context)
  __attribute__((warn_unused_result));

/**
 * Creates a kernel specific physical layer to be used by VDO.
 *
 * @param startingSector       The sector offset of our table entry in the
 *                             DM device
 * @param instance             Device instantiation counter
 * @param config               The device configuration
 * @param parentKobject        The parent sysfs node
 * @param threadConfigPointer  Where to store the new threadConfig handle
 * @param reason               The reason for any failure during this call
 * @param layerPtr             A pointer to hold the created layer
 *
 * @return VDO_SUCCESS or an error
 **/
int makeKernelLayer(uint64_t        startingSector,
                    unsigned int    instance,
                    DeviceConfig   *config,
                    struct kobject *parentKobject,
                    ThreadConfig  **threadConfigPointer,
                    char          **reason,
                    KernelLayer   **layerPtr)
  __attribute__((warn_unused_result));

/**
 * Prepare to modify a kernel layer.
 *
 * @param layer     The layer to modify
 * @param config    The new device configuration
 * @param errorPtr  A pointer to store the reason for any failure
 *
 * @return VDO_SUCCESS or an error
 **/
int prepareToModifyKernelLayer(KernelLayer  *layer,
                               DeviceConfig *config,
                               char        **errorPtr)
  __attribute__((warn_unused_result));

/**
 * Modify a kernel physical layer.
 *
 * @param layer   The layer to modify
 * @param config  The new device configuration
 *
 * @return VDO_SUCCESS or an error
 **/
int modifyKernelLayer(KernelLayer *layer, DeviceConfig *config)
  __attribute__((warn_unused_result));

/**
 * Free a kernel physical layer.
 *
 * @param layer  The layer, which must have been created by
 *               makeKernelLayer
 **/
void freeKernelLayer(KernelLayer *layer);

/**
 * Make and configure a kernel layer. This method does not alter the VDO
 * state on disk. It should be run from the VDO constructor for devices
 * which have not been started.
 *
 * @param layer       The kernel layer
 * @param loadConfig  Load-time parameters for the VDO
 * @param reason      The reason for any failure during this call
 *
 * @return VDO_SUCCESS or an error
 *
 * @note redundant starts are silently ignored
 **/
int preloadKernelLayer(KernelLayer         *layer,
                       const VDOLoadConfig *loadConfig,
                       char               **reason);

/**
 * Start the kernel layer. This method finishes bringing a VDO online
 * now that a table is being resumed for the first time.
 *
 * @param layer   The kernel layer
 * @param reason  The reason for any failure during this call
 *
 * @return VDO_SUCCESS or an error
 **/
int startKernelLayer(KernelLayer *layer, char **reason);

/**
 * Stop the kernel layer.
 *
 * @param layer  The kernel layer
 **/
void stopKernelLayer(KernelLayer *layer);

/**
 * Suspend the kernel layer.
 *
 * @param layer  The kernel layer
 *
 * @return VDO_SUCCESS or an error
 **/
int suspendKernelLayer(KernelLayer *layer);

/**
 * Resume the kernel layer.
 *
 * @param layer  The kernel layer
 *
 * @return VDO_SUCCESS or an error
 **/
int resumeKernelLayer(KernelLayer *layer);

/**
 * Get the kernel layer state.
 *
 * @param layer  The kernel layer
 *
 * @return the instantaneously correct kernel layer state
 **/
static inline KernelLayerState getKernelLayerState(const KernelLayer *layer)
{
  return atomicLoad32(&layer->state);
}

/**
 * Function call to begin processing a bio passed in from the block layer.
 *
 * @param layer  The physical layer
 * @param bio    The bio from the block layer
 *
 * @return value to return from the VDO map function. Either an error code
 *         or DM_MAPIO_REMAPPED or DM_MAPIO_SUBMITTED (see vdoMapBio for
 *         details).
 **/
int kvdoMapBio(KernelLayer *layer, BIO *bio);
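
/*
 * Illustrative sketch, not part of the original interface: how a
 * hypothetical device-mapper constructor might chain the creation and
 * preload steps above. The helper name and all parameter values are
 * assumptions for the example; error handling is abbreviated.
 */
static inline int exampleCreateAndPreload(uint64_t             startingSector,
                                          unsigned int         instance,
                                          DeviceConfig        *config,
                                          struct kobject      *parentKobject,
                                          const VDOLoadConfig *loadConfig,
                                          ThreadConfig       **threadConfig,
                                          char               **reason,
                                          KernelLayer        **layerPtr)
{
  int result = makeKernelLayer(startingSector, instance, config,
                               parentKobject, threadConfig, reason, layerPtr);
  if (result != VDO_SUCCESS) {
    return result;
  }
  // Preloading does not alter the VDO state on disk; startKernelLayer()
  // finishes bringing the device online when its table is first resumed.
  return preloadKernelLayer(*layerPtr, loadConfig, reason);
}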

/**
 * Convert a generic PhysicalLayer to a KernelLayer.
 *
 * @param layer  The PhysicalLayer to convert
 *
 * @return The PhysicalLayer as a KernelLayer
 **/
static inline KernelLayer *asKernelLayer(PhysicalLayer *layer)
{
  return container_of(layer, KernelLayer, common);
}

/**
 * Convert a block number (or count) to a (512-byte-)sector number.
 *
 * The argument type is sector_t to force conversion to the type we
 * want, although the actual values passed are of various integral
 * types. It's just too easy to forget and do the multiplication
 * without casting, resulting in 32-bit arithmetic that accidentally
 * produces wrong results in devices over 2TB (2**32 sectors).
 *
 * @param [in] layer        the physical layer
 * @param [in] blockNumber  the block number/count
 *
 * @return the sector number/count
 **/
static inline sector_t blockToSector(KernelLayer *layer, sector_t blockNumber)
{
  return (blockNumber * VDO_SECTORS_PER_BLOCK);
}

/**
 * Convert a sector number (or count) to a block number. Does not
 * check to make sure the sector number is an integral number of
 * blocks.
 *
 * @param [in] layer         the physical layer
 * @param [in] sectorNumber  the sector number/count
 *
 * @return the block number/count
 **/
static inline sector_t sectorToBlock(KernelLayer *layer, sector_t sectorNumber)
{
  return (sectorNumber / VDO_SECTORS_PER_BLOCK);
}

/**
 * Convert a sector number to an offset within a block.
 *
 * @param [in] layer         the physical layer
 * @param [in] sectorNumber  the sector number
 *
 * @return the offset within the block
 **/
static inline BlockSize sectorToBlockOffset(KernelLayer *layer,
                                            sector_t     sectorNumber)
{
  unsigned int sectorsPerBlockMask = VDO_SECTORS_PER_BLOCK - 1;
  return to_bytes(sectorNumber & sectorsPerBlockMask);
}

/**
 * Get the block device object currently underlying a kernel layer.
 *
 * @param layer  The kernel layer in question
 *
 * @return The block device object under the layer
 **/
struct block_device *getKernelLayerBdev(const KernelLayer *layer)
  __attribute__((warn_unused_result));

/**
 * Set the layer's active config.
 *
 * @param layer   The kernel layer in question
 * @param config  The config in question
 **/
static inline void setKernelLayerActiveConfig(KernelLayer  *layer,
                                              DeviceConfig *config)
{
  layer->deviceConfig = config;
}

/**
 * Given an error code, return a value we can return to the OS. The
 * input error code may be a system-generated value (such as -EIO), an
 * errno macro used in our code (such as EIO), or a UDS or VDO status
 * code; the result must be something the rest of the OS can consume
 * (negative errno values such as -EIO, in the case of the kernel).
 *
 * @param error  the error code to convert
 *
 * @return a system error code value
 **/
int mapToSystemError(int error);

/**
 * Record and eventually report that some number of dedupe requests
 * reached their expiration time without getting an answer, so we
 * timed out on them.
 *
 * This is called in a timer context, so it shouldn't do the reporting
 * directly.
 *
 * @param layer         The kernel layer for the device
 * @param expiredCount  The number of expired requests we timed out on
 **/
void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount);

/**
 * Wait until there are no requests in progress.
 *
 * @param layer  The kernel layer for the device
 **/
void waitForNoRequestsActive(KernelLayer *layer);
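
/*
 * Illustrative sketch, not part of the original interface: draining
 * in-flight requests before suspending, then converting the internal
 * status code into the negative errno value the kernel expects. The
 * helper name is hypothetical.
 */
static inline int exampleDrainAndSuspend(KernelLayer *layer)
{
  int result;

  // Block until the request limiter shows nothing in progress.
  waitForNoRequestsActive(layer);
  result = suspendKernelLayer(layer);
  // mapToSystemError() accepts -EIO, EIO, or UDS/VDO status codes and
  // returns something the rest of the OS can consume.
  return mapToSystemError(result);
}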

/**
 * Enqueues an item on our internal "cpu queues". Since there is more than
 * one, we rotate through them in hopes of creating some general balance.
 *
 * @param layer  The kernel layer
 * @param item   The work item to enqueue
 */
static inline void enqueueCPUWorkQueue(KernelLayer *layer, KvdoWorkItem *item)
{
  enqueueWorkQueue(layer->cpuQueue, item);
}

/**
 * Adjust parameters to prepare to use a larger physical space.
 * The size must be larger than the current size.
 *
 * @param layer          the kernel layer
 * @param physicalCount  the new physical size in blocks
 *
 * @return VDO_SUCCESS or an error
 */
int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount);

/**
 * Adjusts parameters to reflect resizing the underlying device.
 * The size must be larger than the current size.
 *
 * @param layer          the kernel layer
 * @param physicalCount  the new physical count in blocks
 *
 * @return VDO_SUCCESS or an error
 */
int resizePhysical(KernelLayer *layer, BlockCount physicalCount);

/**
 * Adjust parameters to prepare to present a larger logical space.
 * The size must be larger than the current size.
 *
 * @param layer         the kernel layer
 * @param logicalCount  the new logical size in blocks
 *
 * @return VDO_SUCCESS or an error
 */
int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount);

/**
 * Adjust parameters to present a larger logical space.
 * The size must be larger than the current size.
 *
 * @param layer         the kernel layer
 * @param logicalCount  the new logical size in blocks
 *
 * @return VDO_SUCCESS or an error
 */
int resizeLogical(KernelLayer *layer, BlockCount logicalCount);

/**
 * Indicate whether the kernel layer is configured to use a separate
 * work queue for acknowledging received and processed bios.
 *
 * Note that this directly controls the handling of write operations,
 * but the compile-time flag USE_BIO_ACK_QUEUE_FOR_READ is also checked
 * for read operations.
 *
 * @param layer  The kernel layer
 *
 * @return Whether a bio-acknowledgement work queue is in use
 **/
static inline bool useBioAckQueue(KernelLayer *layer)
{
  return layer->deviceConfig->threadCounts.bioAckThreads > 0;
}

/**
 * Update bookkeeping for the completion of some number of requests, so
 * that more incoming requests can be accepted.
 *
 * @param layer  The kernel layer
 * @param count  The number of completed requests
 **/
void completeManyRequests(KernelLayer *layer, uint32_t count);

#endif /* KERNELLAYER_H */