/*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*
* $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryLinuxKernel.c#6 $
*/
#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include "compilerDefs.h"
#include "logger.h"
#include "memoryAlloc.h"
#include "permassert.h"
/*
******************************************************************************
 * Production: UDS and VDO keep track of which threads are allowed to allocate
 * memory freely, and which threads must take care not to perform a memory
 * allocation that issues an I/O request. The allocatingThreads ThreadRegistry
 * and its associated methods implement this tracking.
*/
static ThreadRegistry allocatingThreads;
/*****************************************************************************/
static bool allocationsAllowed(void)
{
const bool *pointer = lookupThread(&allocatingThreads);
return pointer != NULL ? *pointer : false;
}
/*****************************************************************************/
void registerAllocatingThread(RegisteredThread *newThread, const bool *flagPtr)
{
if (flagPtr == NULL) {
static const bool allocationAlwaysAllowed = true;
flagPtr = &allocationAlwaysAllowed;
}
registerThread(&allocatingThreads, newThread, flagPtr);
}
/*****************************************************************************/
void unregisterAllocatingThread(void)
{
unregisterThread(&allocatingThreads);
}
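/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * a thread whose allocations must not issue I/O registers itself with a
 * flag of false; allocateMemory() then wraps its allocations in
 * memalloc_noio_save()/memalloc_noio_restore().
 *
 *   static void ioRestrictedThread(void *arg)
 *   {
 *     static const bool mayAllocate = false;  // must stay valid while
 *     RegisteredThread self;                  // the thread is registered
 *     registerAllocatingThread(&self, &mayAllocate);
 *     doThreadWork(arg);                      // doThreadWork is hypothetical
 *     unregisterAllocatingThread();
 *   }
 */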
/*
******************************************************************************
* Production: We track how much memory has been allocated and freed. When we
* unload the UDS module, we log an error if we have not freed all the memory
* that we allocated. Nearly all memory allocation and freeing is done using
* this module.
*
 * We do not use kernel functions like kvasprintf(), which allocate memory
 * indirectly using kmalloc.
*
* These data structures and methods are used to track the amount of memory
* used.
*/
// We allocate very few large objects, and allocation/deallocation is not on
// a performance-critical path for us, so a linked list should be fine.
typedef struct vmallocBlockInfo {
void *ptr;
size_t size;
struct vmallocBlockInfo *next;
} VmallocBlockInfo;
static struct {
spinlock_t lock;
size_t kmallocBlocks;
size_t kmallocBytes;
size_t vmallocBlocks;
size_t vmallocBytes;
size_t peakBytes;
VmallocBlockInfo *vmallocList;
} memoryStats __cacheline_aligned;
/*****************************************************************************/
static void updatePeakUsage(void)
{
size_t totalBytes = memoryStats.kmallocBytes + memoryStats.vmallocBytes;
if (totalBytes > memoryStats.peakBytes) {
memoryStats.peakBytes = totalBytes;
}
}
/*****************************************************************************/
static void addKmallocBlock(size_t size)
{
unsigned long flags;
spin_lock_irqsave(&memoryStats.lock, flags);
memoryStats.kmallocBlocks++;
memoryStats.kmallocBytes += size;
updatePeakUsage();
spin_unlock_irqrestore(&memoryStats.lock, flags);
}
/*****************************************************************************/
static void removeKmallocBlock(size_t size)
{
unsigned long flags;
spin_lock_irqsave(&memoryStats.lock, flags);
memoryStats.kmallocBlocks--;
memoryStats.kmallocBytes -= size;
spin_unlock_irqrestore(&memoryStats.lock, flags);
}
/*****************************************************************************/
static void addVmallocBlock(VmallocBlockInfo *block)
{
unsigned long flags;
spin_lock_irqsave(&memoryStats.lock, flags);
block->next = memoryStats.vmallocList;
memoryStats.vmallocList = block;
memoryStats.vmallocBlocks++;
memoryStats.vmallocBytes += block->size;
updatePeakUsage();
spin_unlock_irqrestore(&memoryStats.lock, flags);
}
/*****************************************************************************/
static void removeVmallocBlock(void *ptr)
{
VmallocBlockInfo *block, **blockPtr;
unsigned long flags;
spin_lock_irqsave(&memoryStats.lock, flags);
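  // Walk the list through a pointer-to-pointer so the matching entry can be
  // unlinked in place without tracking a separate predecessor node.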
for (blockPtr = &memoryStats.vmallocList;
(block = *blockPtr) != NULL;
blockPtr = &block->next) {
if (block->ptr == ptr) {
*blockPtr = block->next;
memoryStats.vmallocBlocks--;
memoryStats.vmallocBytes -= block->size;
break;
}
}
spin_unlock_irqrestore(&memoryStats.lock, flags);
if (block != NULL) {
FREE(block);
} else {
logInfo("attempting to remove ptr %" PRIptr " not found in vmalloc list",
ptr);
}
}
/**
* Determine whether allocating a memory block should use kmalloc or vmalloc.
*
* vmalloc can allocate any integral number of pages.
*
 * kmalloc can allocate any number of bytes up to a configured limit, which
 * defaults to 8 megabytes on some of our systems. kmalloc is especially good
 * when memory is being both allocated and freed, and it does this efficiently
 * in a multi-CPU environment.
 *
 * kmalloc usually rounds the size of the block up to the next power of two,
 * so when the requested block is bigger than PAGE_SIZE / 2 bytes, kmalloc
 * never allocates less space than the corresponding page-aligned vmalloc
 * allocation, and vmalloc sometimes has less overhead.
 *
 * The advantages of kmalloc do not help UDS or VDO, because we allocate all
 * our memory up front and do not free and reallocate it. Sometimes we have
 * problems using kmalloc, because the Linux memory page map can become so
 * fragmented that kmalloc will not give us a 32KB chunk. We have used vmalloc
 * as a backup to kmalloc in the past, and a follow-up vmalloc of 32KB will
 * succeed. But there is no strong case to be made for using kmalloc over
 * vmalloc for chunks of this size.
 *
 * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB
 * requests. There is no strong reason to favor either kmalloc or vmalloc for
 * 4KB requests, except that vmalloc statistics are kept in a linked list.
 * In a simple test, this choice of boundary results in 132 vmalloc calls;
 * using vmalloc for requests of exactly 4KB would add 6374 more, which would
 * require changing the code that tracks vmalloc statistics.
*
* @param size How many bytes to allocate
**/
static INLINE bool useKmalloc(size_t size)
{
return size <= PAGE_SIZE;
}
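/*
 * A worked example of the sizing tradeoff described above, assuming 4KB
 * pages: a 12000-byte request through kmalloc is rounded up to a 16384-byte
 * block, while vmalloc would map three whole pages (12288 bytes). A request
 * of PAGE_SIZE or less stays in kmalloc, which can satisfy it from a slab
 * without creating a new kernel virtual mapping.
 */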
/*****************************************************************************/
int allocateMemory(size_t size, size_t align, const char *what, void *ptr)
{
if (ptr == NULL) {
return UDS_INVALID_ARGUMENT;
}
if (size == 0) {
*((void **) ptr) = NULL;
return UDS_SUCCESS;
}
/*
 * __GFP_RETRY_MAYFAIL means that the VM implementation will retry memory
 * reclaim procedures that have previously failed if there is some indication
 * that progress has been made elsewhere. It can wait for other tasks to
 * attempt high-level approaches to freeing memory such as compaction (which
* removes fragmentation) and page-out. There is still a definite limit to
* the number of retries, but it is a larger limit than with __GFP_NORETRY.
* Allocations with this flag may fail, but only when there is genuinely
* little unused memory. While these allocations do not directly trigger the
* OOM killer, their failure indicates that the system is likely to need to
* use the OOM killer soon. The caller must handle failure, but can
* reasonably do so by failing a higher-level request, or completing it only
* in a much less efficient manner.
*/
const gfp_t gfpFlags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL;
bool allocationsRestricted = !allocationsAllowed();
unsigned int noioFlags;
if (allocationsRestricted) {
noioFlags = memalloc_noio_save();
}
unsigned long startTime = jiffies;
void *p = NULL;
if (useKmalloc(size) && (align < PAGE_SIZE)) {
p = kmalloc(size, gfpFlags | __GFP_NOWARN);
if (p == NULL) {
/*
* If we had just done kmalloc(size, gfpFlags) it is possible that the
* allocation would fail (see VDO-3688). The kernel log would then
* contain a long report about the failure. Although the failure occurs
* because there is no page available to allocate, by the time it logs
* the available space, there is a page available. So hopefully a short
* sleep will allow the page reclaimer to free a single page, which is
* all that we need.
*/
msleep(1);
p = kmalloc(size, gfpFlags);
}
if (p != NULL) {
addKmallocBlock(ksize(p));
}
} else {
VmallocBlockInfo *block;
if (ALLOCATE(1, VmallocBlockInfo, __func__, &block) == UDS_SUCCESS) {
/*
* If we just do __vmalloc(size, gfpFlags, PAGE_KERNEL) it is possible
* that the allocation will fail (see VDO-3661). The kernel log will
* then contain a long report about the failure. Although the failure
* occurs because there are not enough pages available to allocate, by
 * the time it logs the available space, there may be enough pages available
* for smaller allocations. So hopefully a short sleep will allow the
* page reclaimer to free enough pages for us.
*
* For larger allocations, the kernel page_alloc code is racing against
* the page reclaimer. If the page reclaimer can stay ahead of
* page_alloc, the __vmalloc will succeed. But if page_alloc overtakes
* the page reclaimer, the allocation fails. It is possible that more
* retries will succeed.
*/
for (;;) {
p = __vmalloc(size, gfpFlags | __GFP_NOWARN, PAGE_KERNEL);
// Try again unless we succeeded or more than 1 second has elapsed.
if ((p != NULL) || (jiffies_to_msecs(jiffies - startTime) > 1000)) {
break;
}
msleep(1);
}
if (p == NULL) {
// Try one more time, logging a failure for this call.
p = __vmalloc(size, gfpFlags, PAGE_KERNEL);
}
if (p == NULL) {
FREE(block);
} else {
block->ptr = p;
block->size = PAGE_ALIGN(size);
addVmallocBlock(block);
}
}
}
if (allocationsRestricted) {
memalloc_noio_restore(noioFlags);
}
if (p == NULL) {
unsigned int duration = jiffies_to_msecs(jiffies - startTime);
logError("Could not allocate %zu bytes for %s in %u msecs",
size, what, duration);
return ENOMEM;
}
*((void **) ptr) = p;
return UDS_SUCCESS;
}
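/*
 * A minimal sketch of the out-parameter convention used by allocateMemory()
 * (hypothetical caller; the size and name are illustrative):
 *
 *   uint8_t *buffer = NULL;
 *   int result = allocateMemory(16 * 1024, 0, "example buffer", &buffer);
 *   if (result != UDS_SUCCESS) {
 *     return result;      // ENOMEM after about a second of retries
 *   }
 *   ...
 *   freeMemory(buffer);   // updates the kmalloc/vmalloc tracking
 *
 * Callers normally use the ALLOCATE() macro from memoryAlloc.h, which
 * computes the byte count from an element count and type, as in the
 * ALLOCATE(1, VmallocBlockInfo, __func__, &block) call above.
 */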
/*****************************************************************************/
void *allocateMemoryNowait(size_t size,
const char *what __attribute__((unused)))
{
void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO);
if (p != NULL) {
addKmallocBlock(ksize(p));
}
return p;
}
/*****************************************************************************/
void freeMemory(void *ptr)
{
if (ptr != NULL) {
if (is_vmalloc_addr(ptr)) {
removeVmallocBlock(ptr);
vfree(ptr);
} else {
removeKmallocBlock(ksize(ptr));
kfree(ptr);
}
}
}
/*****************************************************************************/
int reallocateMemory(void *ptr,
size_t oldSize,
size_t size,
const char *what,
void *newPtr)
{
  // Handle the special case of a zero-sized result
if (size == 0) {
FREE(ptr);
*(void **)newPtr = NULL;
return UDS_SUCCESS;
}
int result = ALLOCATE(size, char, what, newPtr);
if (result != UDS_SUCCESS) {
return result;
}
if (ptr != NULL) {
if (oldSize < size) {
size = oldSize;
}
memcpy(*((void **) newPtr), ptr, size);
FREE(ptr);
}
return UDS_SUCCESS;
}
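/*
 * A minimal sketch of growing a tracked buffer with reallocateMemory()
 * (hypothetical caller; the Entry type is illustrative). On success the
 * old block has already been freed and its contents copied:
 *
 *   int growTable(Entry **table, size_t oldCount, size_t newCount)
 *   {
 *     Entry *newTable;
 *     int result = reallocateMemory(*table, oldCount * sizeof(Entry),
 *                                   newCount * sizeof(Entry),
 *                                   "entry table", &newTable);
 *     if (result == UDS_SUCCESS) {
 *       *table = newTable;
 *     }
 *     return result;
 *   }
 */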
/*****************************************************************************/
void memoryInit(void)
{
spin_lock_init(&memoryStats.lock);
initializeThreadRegistry(&allocatingThreads);
}
/*****************************************************************************/
void memoryExit(void)
{
  ASSERT_LOG_ONLY(memoryStats.kmallocBytes == 0,
                  "kmalloc memory used (%zu bytes in %zu blocks)"
                  " is returned to the kernel",
                  memoryStats.kmallocBytes, memoryStats.kmallocBlocks);
  ASSERT_LOG_ONLY(memoryStats.vmallocBytes == 0,
                  "vmalloc memory used (%zu bytes in %zu blocks)"
                  " is returned to the kernel",
                  memoryStats.vmallocBytes, memoryStats.vmallocBlocks);
  logDebug("%s peak usage %zu bytes", THIS_MODULE->name,
           memoryStats.peakBytes);
}
/**********************************************************************/
void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed)
{
unsigned long flags;
spin_lock_irqsave(&memoryStats.lock, flags);
*bytesUsed = memoryStats.kmallocBytes + memoryStats.vmallocBytes;
*peakBytesUsed = memoryStats.peakBytes;
spin_unlock_irqrestore(&memoryStats.lock, flags);
}
/**********************************************************************/
void reportMemoryUsage(void)
{
unsigned long flags;
spin_lock_irqsave(&memoryStats.lock, flags);
uint64_t kmallocBlocks = memoryStats.kmallocBlocks;
uint64_t kmallocBytes = memoryStats.kmallocBytes;
uint64_t vmallocBlocks = memoryStats.vmallocBlocks;
uint64_t vmallocBytes = memoryStats.vmallocBytes;
uint64_t peakUsage = memoryStats.peakBytes;
spin_unlock_irqrestore(&memoryStats.lock, flags);
uint64_t totalBytes = kmallocBytes + vmallocBytes;
logInfo("current module memory tracking"
" (actual allocation sizes, not requested):");
logInfo(" %llu bytes in %llu kmalloc blocks",
kmallocBytes, kmallocBlocks);
logInfo(" %llu bytes in %llu vmalloc blocks",
vmallocBytes, vmallocBlocks);
logInfo(" total %llu bytes, peak usage %llu bytes",
totalBytes, peakUsage);
}