/*
 * libpmem: IO engine that uses PMDK libpmem to read and write data
 *
 * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License,
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 */

/*
 * libpmem engine
 *
 * IO engine that uses libpmem to read and write data
 *
 * To use:
 *   ioengine=libpmem
 *
 * Other relevant settings:
 *   iodepth=1
 *   direct=1
 *   directory=/mnt/pmem0/
 *   bs=4k
 *
 *   direct=1 means that pmem_drain() is executed for each write operation.
 *   In contrast, direct=0 means that pmem_drain() is not executed.
 *
 *   The pmem device must have a DAX-capable filesystem and be mounted
 *   with DAX enabled. directory must point to a mount point of DAX FS.
 *
 *   Example:
 *     mkfs.xfs /dev/pmem0
 *     mkdir /mnt/pmem0
 *     mount -o dax /dev/pmem0 /mnt/pmem0
 *
 *
 * See examples/libpmem.fio for more.
 *
 *
 * libpmem.so
 *   By default, the libpmem engine will let the system find the libpmem.so
 *   that it uses. You can use an alternative libpmem by setting the
 *   FIO_PMEM_LIB environment variable to the full path to the desired
 *   libpmem.so.
 */

#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <libgen.h>
#include <libpmem.h>

#include "../fio.h"
#include "../verify.h"

/*
 * Limits us to 1GiB of mapped files in total to model after
 * libpmem engine behavior
 */
#define MMAP_TOTAL_SZ   (1 * 1024 * 1024 * 1024UL)

/*
 * Per-file engine state: describes the window of the file that is
 * currently mmap()ed for I/O.
 */
struct fio_libpmem_data {
	void *libpmem_ptr;	/* base address of current mapping, NULL if none */
	size_t libpmem_sz;	/* length of current mapping in bytes */
	off_t libpmem_off;	/* file offset where the mapping starts */
};

#define MEGABYTE ((uintptr_t)1 << 20)
#define GIGABYTE ((uintptr_t)1 << 30)
#define PROCMAXLEN 2048 /* maximum expected line length in /proc files */
#define roundup(x, y)   ((((x) + ((y) - 1)) / (y)) * (y))

static bool Mmap_no_random;
static void *Mmap_hint;
static unsigned long long Mmap_align;

/*
 * util_map_hint_align -- choose the desired mapping alignment
 *
 * Request 1GB (or 2MB) page alignment only when the mapping is at
 * least twice that page size; otherwise use the default Mmap_align.
 */
static inline size_t util_map_hint_align(size_t len, size_t req_align)
{
	size_t align;

	dprint(FD_IO, "DEBUG util_map_hint_align\n" );

	if (req_align)
		align = req_align;		/* caller forced an alignment */
	else if (len >= 2 * GIGABYTE)
		align = GIGABYTE;
	else if (len >= 4 * MEGABYTE)
		align = 2 * MEGABYTE;
	else
		align = Mmap_align;		/* system page granularity */

	dprint(FD_IO, "align=%d\n", (int)align);
	return align;
}

#ifdef __FreeBSD__
static const char *sscanf_os = "%p %p";
#define MAP_NORESERVE 0
#define OS_MAPFILE "/proc/curproc/map"
#else
static const char *sscanf_os = "%p-%p";
#define OS_MAPFILE "/proc/self/maps"
#endif

/*
 * util_map_hint_unused -- use /proc to determine a hint address for mmap()
 *
 * This is a helper function for util_map_hint().
 * It opens up /proc/self/maps and looks for the first unused address
 * in the process address space that is:
 * - greater or equal 'minaddr' argument,
 * - large enough to hold range of given length,
 * - aligned to the specified unit.
 *
 * Asking for aligned address like this will allow the DAX code to use large
 * mappings.  It is not an error if mmap() ignores the hint and chooses
 * different address.
 */
static char *util_map_hint_unused(void *minaddr, size_t len, size_t align)
{
	char *lo = NULL;        /* beginning of current range in maps file */
	char *hi = NULL;        /* end of current range in maps file */
	char *raddr = minaddr;  /* ignore regions below 'minaddr' */

#ifdef WIN32
	MEMORY_BASIC_INFORMATION mi;
#else
	FILE *fp;
	char line[PROCMAXLEN];  /* for fgets() */
#endif

	dprint(FD_IO, "DEBUG util_map_hint_unused\n");
	assert(align > 0);

	/* never propose address 0 -- start the search at the first page */
	if (raddr == NULL)
		raddr += page_size;

	raddr = (char *)roundup((uintptr_t)raddr, align);

#ifdef WIN32
	while ((uintptr_t)raddr < UINTPTR_MAX - len) {
		size_t ret = VirtualQuery(raddr, &mi, sizeof(mi));
		if (ret == 0) {
			ERR("VirtualQuery %p", raddr);
			return MAP_FAILED;
		}
		dprint(FD_IO, "addr %p len %zu state %d",
				mi.BaseAddress, mi.RegionSize, mi.State);

		/* skip in-use or too-small regions, realigning each time */
		if ((mi.State != MEM_FREE) || (mi.RegionSize < len)) {
			raddr = (char *)mi.BaseAddress + mi.RegionSize;
			raddr = (char *)roundup((uintptr_t)raddr, align);
			dprint(FD_IO, "nearest aligned addr %p", raddr);
		} else {
			dprint(FD_IO, "unused region of size %zu found at %p",
					mi.RegionSize, mi.BaseAddress);
			return mi.BaseAddress;
		}
	}

	dprint(FD_IO, "end of address space reached");
	return MAP_FAILED;
#else
	fp = fopen(OS_MAPFILE, "r");
	if (!fp) {
		log_err("!%s\n", OS_MAPFILE);
		return MAP_FAILED;
	}

	while (fgets(line, PROCMAXLEN, fp) != NULL) {
		/* check for range line */
		if (sscanf(line, sscanf_os, &lo, &hi) == 2) {
			dprint(FD_IO, "%p-%p\n", lo, hi);
			/* gap between the previous mapping and this one? */
			if (lo > raddr) {
				if ((uintptr_t)(lo - raddr) >= len) {
					dprint(FD_IO, "unused region of size "
							"%zu found at %p\n",
							lo - raddr, raddr);
					break;
				} else {
					dprint(FD_IO, "region is too small: "
							"%zu < %zu\n",
							lo - raddr, len);
				}
			}

			/* advance past this mapping, keeping alignment */
			if (hi > raddr) {
				raddr = (char *)roundup((uintptr_t)hi, align);
				dprint(FD_IO, "nearest aligned addr %p\n",
						raddr);
			}

			/* roundup() wrapped to 0 -- address space exhausted */
			if (raddr == 0) {
				dprint(FD_IO, "end of address space reached\n");
				break;
			}
		}
	}

	/*
	 * Check for a case when this is the last unused range in the address
	 * space, but is not large enough. (very unlikely)
	 */
	if ((raddr != NULL) && (UINTPTR_MAX - (uintptr_t)raddr < len)) {
		dprint(FD_IO, "end of address space reached");
		raddr = MAP_FAILED;
	}

	fclose(fp);

	dprint(FD_IO, "returning %p", raddr);
	return raddr;
#endif
}

/*
 * util_map_hint -- determine hint address for mmap()
 *
 * If PMEM_MMAP_HINT environment variable is not set, we let the system to pick
 * the randomized mapping address.  Otherwise, a user-defined hint address
 * is used.
 *
 * Windows Environment:
 *   XXX - Windows doesn't support large DAX pages yet, so there is
 *   no point in aligning for the same.
 *
 * Except for Windows Environment:
 *   ASLR in 64-bit Linux kernel uses 28-bit of randomness for mmap
 *   (bit positions 12-39), which means the base mapping address is randomized
 *   within [0..1024GB] range, with 4KB granularity.  Assuming additional
 *   1GB alignment, it results in 1024 possible locations.
 *
 *   Configuring the hint address via PMEM_MMAP_HINT environment variable
 *   disables address randomization.  In such case, the function will search for
 *   the first unused, properly aligned region of given size, above the
 *   specified address.
 */
static char *util_map_hint(size_t len, size_t req_align)
{
	char *addr;
	size_t align = 0;
	char *e = NULL;

	dprint(FD_IO, "DEBUG util_map_hint\n");
	dprint(FD_IO, "len %zu req_align %zu\n", len, req_align);

	/* choose the desired alignment based on the requested length */
	align = util_map_hint_align(len, req_align);

	/* a valid PMEM_MMAP_HINT (hex address) disables randomization */
	e = getenv("PMEM_MMAP_HINT");
	if (e) {
		char *endp;
		unsigned long long val = 0;

		errno = 0;

		val = strtoull(e, &endp, 16);
		if (errno || endp == e) {
			dprint(FD_IO, "Invalid PMEM_MMAP_HINT\n");
		} else {
			Mmap_hint = (void *)val;
			Mmap_no_random = true;
			dprint(FD_IO, "PMEM_MMAP_HINT set to %p\n", Mmap_hint);
		}
	}

	if (Mmap_no_random) {
		/* search /proc maps for an unused region above the hint */
		dprint(FD_IO, "user-defined hint %p\n", (void *)Mmap_hint);
		addr = util_map_hint_unused((void *)Mmap_hint, len, align);
	} else {
		/*
		 * Create dummy mapping to find an unused region of given size.
		 * Request an increased size for later address alignment.
		 *
		 * Windows Environment:
		 *   Use MAP_NORESERVE flag to only reserve the range of pages
		 *   rather than commit.  We don't want the pages to be actually
		 *   backed by the operating system paging file, as the swap
		 *   file is usually too small to handle terabyte pools.
		 *
		 * Except for Windows Environment:
		 *   Use MAP_PRIVATE with read-only access to simulate
		 *   zero cost for overcommit accounting.  Note: MAP_NORESERVE
		 *   flag is ignored if overcommit is disabled (mode 2).
		 */
#ifndef WIN32
		addr = mmap(NULL, len + align, PROT_READ,
				MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
#else
		addr = mmap(NULL, len + align, PROT_READ,
				MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0);
#endif
		if (addr != MAP_FAILED) {
			/* release immediately; we only wanted the address */
			dprint(FD_IO, "system choice %p\n", addr);
			munmap(addr, len + align);
			addr = (char *)roundup((uintptr_t)addr, align);
		}
	}

	dprint(FD_IO, "hint %p\n", addr);

	return addr;
}

/*
 * This is the mmap execution function: map 'length' bytes of file 'f'
 * starting at file offset 'off' into fdd->libpmem_ptr, with protection
 * flags derived from the job's data direction.
 *
 * Returns td->error (0 on success); on mmap failure the error is
 * recorded via td_verror() and fdd->libpmem_ptr is left NULL.
 */
static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
			    size_t length, off_t off)
{
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	int flags = 0;
	void *addr = NULL;

	dprint(FD_IO, "DEBUG fio_libpmem_file\n");

	/* choose mmap protection to match the data direction */
	if (td_rw(td))
		flags = PROT_READ | PROT_WRITE;
	else if (td_write(td)) {
		flags = PROT_WRITE;

		/* verification needs to read back what was written */
		if (td->o.verify != VERIFY_NONE)
			flags |= PROT_READ;
	} else
		flags = PROT_READ;

	dprint(FD_IO, "f->file_name = %s  td->o.verify = %d \n", f->file_name,
			td->o.verify);
	/* %zu/%lld: length is size_t and off_t may be wider than long */
	dprint(FD_IO, "length = %zu  flags = %d  f->fd = %d off = %lld \n",
			length, flags, f->fd, (long long) off);

	addr = util_map_hint(length, 0);
	/* no usable hint found -- let the kernel pick the address */
	if (addr == MAP_FAILED)
		addr = NULL;

	fdd->libpmem_ptr = mmap(addr, length, flags, MAP_SHARED, f->fd, off);
	if (fdd->libpmem_ptr == MAP_FAILED) {
		fdd->libpmem_ptr = NULL;
		td_verror(td, errno, "mmap");
	}

	/* drop the mapping again if an error was recorded on td */
	if (td->error && fdd->libpmem_ptr)
		munmap(fdd->libpmem_ptr, length);

	return td->error;
}

/*
 * XXX Just mmap an appropriate portion, we cannot mmap the full extent
 */
static int fio_libpmem_prep_limited(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	size_t map_sz;

	dprint(FD_IO, "DEBUG fio_libpmem_prep_limited\n" );

	/* a single I/O must fit inside whatever window we can map */
	if (io_u->buflen > f->real_file_size) {
		log_err("libpmem: bs too big for libpmem engine\n");
		return EIO;
	}

	/* window size: at most MMAP_TOTAL_SZ, capped by the I/O region */
	map_sz = min(MMAP_TOTAL_SZ, f->real_file_size);
	if (map_sz > f->io_size)
		map_sz = f->io_size;

	fdd->libpmem_sz = map_sz;
	fdd->libpmem_off = io_u->offset;

	return fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
}

/*
 * Attempt to mmap the entire file in one go.
 */
static int fio_libpmem_prep_full(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	int ret;

	dprint(FD_IO, "DEBUG fio_libpmem_prep_full\n" );

	/* a previous attempt already showed a full map is impossible */
	if (fio_file_partial_mmap(f))
		return EINVAL;

	dprint(FD_IO," f->io_size %ld : io_u->offset %lld \n",
			f->io_size, io_u->offset);

	/* refuse if offset or size does not round-trip through size_t */
	if (io_u->offset != (size_t) io_u->offset ||
	    f->io_size != (size_t) f->io_size) {
		fio_file_set_partial_mmap(f);
		return EINVAL;
	}

	fdd->libpmem_sz = f->io_size;
	fdd->libpmem_off = 0;

	ret = fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
	if (ret)
		fio_file_set_partial_mmap(f);	/* fall back to partial maps */

	return ret;
}

static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	int ret;

	dprint(FD_IO, "DEBUG fio_libpmem_prep\n" );
	dprint(FD_IO," io_u->offset %llu : fdd->libpmem_off %llu : "
			"io_u->buflen %llu : fdd->libpmem_sz %llu\n",
			io_u->offset, (unsigned long long) fdd->libpmem_off,
			io_u->buflen, (unsigned long long) fdd->libpmem_sz);

	/*
	 * Remap only when the I/O does not fit inside the existing window.
	 */
	if (io_u->offset < fdd->libpmem_off ||
	    io_u->offset + io_u->buflen >
			fdd->libpmem_off + fdd->libpmem_sz) {
		/* drop any stale mapping first */
		if (fdd->libpmem_ptr) {
			dprint(FD_IO,"munmap \n");
			if (munmap(fdd->libpmem_ptr, fdd->libpmem_sz) < 0)
				return errno;
			fdd->libpmem_ptr = NULL;
		}

		/* try the whole file; fall back to a limited window */
		if (fio_libpmem_prep_full(td, io_u)) {
			td_clear_error(td);
			ret = fio_libpmem_prep_limited(td, io_u);
			if (ret)
				return ret;
		}
	}

	io_u->mmap_data = fdd->libpmem_ptr + io_u->offset - fdd->libpmem_off
				- f->file_offset;
	return 0;
}

static enum fio_q_status fio_libpmem_queue(struct thread_data *td,
					   struct io_u *io_u)
{
	fio_ro_check(td, io_u);
	io_u->error = 0;

	dprint(FD_IO, "DEBUG fio_libpmem_queue\n");

	switch (io_u->ddir) {
	case DDIR_READ:
		/* reads are a plain copy out of the mapping */
		memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
		break;
	case DDIR_WRITE:
		dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n",
				io_u->mmap_data, io_u->xfer_buf );
		dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
		/*
		 * direct=1 drains stores to persistence on every write;
		 * direct=0 defers the drain until the file is closed.
		 */
		if (td->o.odirect)
			pmem_memcpy_persist(io_u->mmap_data, io_u->xfer_buf,
					io_u->xfer_buflen);
		else
			pmem_memcpy_nodrain(io_u->mmap_data, io_u->xfer_buf,
					io_u->xfer_buflen);
		break;
	case DDIR_SYNC:
	case DDIR_DATASYNC:
	case DDIR_SYNC_FILE_RANGE:
		/* nothing to do: writes are drained above or at close */
		break;
	default:
		io_u->error = EINVAL;
		break;
	}

	return FIO_Q_COMPLETED;
}

static int fio_libpmem_init(struct thread_data *td)
{
	struct thread_options *o = &td->o;

	dprint(FD_IO,"o->rw_min_bs %llu \n o->fsync_blocks %d \n o->fdatasync_blocks %d \n",
			o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks);
	dprint(FD_IO, "DEBUG fio_libpmem_init\n");

	/* fsync/fdatasync with mmap I/O needs page-aligned block sizes */
	if ((o->fsync_blocks || o->fdatasync_blocks) &&
	    (o->rw_min_bs & page_mask)) {
		log_err("libpmem: mmap options dictate a minimum block size of "
				"%llu bytes\n",	(unsigned long long) page_size);
		return 1;
	}

	return 0;
}

static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f)
{
	struct fio_libpmem_data *fdd;
	int ret;

	dprint(FD_IO,"DEBUG fio_libpmem_open_file\n");
	dprint(FD_IO,"f->io_size=%ld \n",f->io_size);
	dprint(FD_IO,"td->o.size=%lld \n",td->o.size);
	dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth);
	dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch);

	ret = generic_open_file(td, f);
	if (ret)
		return ret;

	/* attach zero-initialized per-file mapping state */
	fdd = calloc(1, sizeof(*fdd));
	if (!fdd) {
		/* undo the open; its result is irrelevant at this point */
		int fio_unused __ret = generic_close_file(td, f);
		return 1;
	}

	FILE_SET_ENG_DATA(f, fdd);
	return 0;
}

static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f)
{
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);

	dprint(FD_IO,"DEBUG fio_libpmem_close_file\n");
	dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);

	/*
	 * direct=0 writes used pmem_memcpy_nodrain(); issue one final
	 * pmem_drain() before the file goes away.
	 */
	if (!td->o.odirect) {
		dprint(FD_IO,"pmem_drain\n");
		pmem_drain();
	}

	/* release per-file engine state and partial-mmap marker */
	free(fdd);
	FILE_SET_ENG_DATA(f, NULL);
	fio_file_clear_partial_mmap(f);

	return generic_close_file(td, f);
}

/*
 * Synchronous engine: queue() completes each io_u inline, and
 * FIO_NOEXTEND is set because mmap-based I/O cannot grow the file.
 */
static struct ioengine_ops ioengine = {
	.name		= "libpmem",
	.version	= FIO_IOOPS_VERSION,
	.init		= fio_libpmem_init,
	.prep		= fio_libpmem_prep,
	.queue		= fio_libpmem_queue,
	.open_file	= fio_libpmem_open_file,
	.close_file	= fio_libpmem_close_file,
	.get_file_size	= generic_get_file_size,
	.flags		= FIO_SYNCIO |FIO_NOEXTEND,
};

/* fio constructor: pick the default mmap alignment, register the engine */
static void fio_init fio_libpmem_register(void)
{
#ifndef WIN32
	Mmap_align = page_size;
#else
	/* Windows mappings must honor the system allocation granularity */
	if (Mmap_align == 0) {
		SYSTEM_INFO si;

		GetSystemInfo(&si);
		Mmap_align = si.dwAllocationGranularity;
	}
#endif

	register_ioengine(&ioengine);
}

/* fio destructor: remove the engine from fio's ioengine list */
static void fio_exit fio_libpmem_unregister(void)
{
	unregister_ioengine(&ioengine);
}