Blob Blame History Raw
/*
 * Copyright 2017-2018, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *
 *     * Neither the name of the copyright holder nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * os_deep_linux.c -- Linux abstraction layer
 */

#define _GNU_SOURCE

#include <inttypes.h>
#include <fcntl.h>
#include <sys/stat.h>
#include "out.h"
#include "os.h"
#include "mmap.h"
#include "file.h"
#include "libpmem.h"
#include "os_deep.h"

/*
 * os_deep_flush_write -- (internal) perform write to deep_flush file
 * on given region_id
 */
static int
os_deep_flush_write(int region_id)
{
	LOG(3, "region_id %d", region_id);

	char deep_flush_path[PATH_MAX];
	int deep_flush_fd;

	snprintf(deep_flush_path, PATH_MAX,
		"/sys/bus/nd/devices/region%d/deep_flush", region_id);

	if ((deep_flush_fd = os_open(deep_flush_path, O_WRONLY)) < 0) {
		LOG(1, "!os_open(\"%s\", O_WRONLY)", deep_flush_path);
		return -1;
	}

	if (write(deep_flush_fd, "1", 1) != 1) {
		LOG(1, "!write(%d, \"1\")", deep_flush_fd);
		int oerrno = errno;
		os_close(deep_flush_fd);
		errno = oerrno;
		return -1;
	}

	os_close(deep_flush_fd);
	return 0;
}

/*
 * os_deep_type -- (internal) perform deep operation based on a pmem
 * mapping type
 */
static int
os_deep_type(const struct map_tracker *mt, void *addr, size_t len)
{
	LOG(15, "mt %p addr %p len %zu", mt, addr, len);

	switch (mt->type) {
	case PMEM_DEV_DAX:
		pmem_drain();

		if (os_deep_flush_write(mt->region_id) < 0) {
			if (errno == ENOENT) {
				errno = ENOTSUP;
				LOG(1, "!deep_flush not supported");
			} else {
				LOG(2, "cannot write to deep_flush"
					"in region %d", mt->region_id);
			}
			return -1;
		}
		return 0;
	case PMEM_MAP_SYNC:
		return pmem_msync(addr, len);
	default:
		ASSERT(0);
		return -1;
	}
}

/*
 * os_range_deep_common -- perform deep action of given address range
 */
int
os_range_deep_common(uintptr_t addr, size_t len)
{
	LOG(3, "addr 0x%016" PRIxPTR " len %zu", addr, len);

	while (len != 0) {
		const struct map_tracker *mt = util_range_find(addr, len);

		/* no more overlapping track regions or NOT a device DAX */
		if (mt == NULL) {
			LOG(15, "pmem_msync addr %p, len %lu",
				(void *)addr, len);
			return pmem_msync((void *)addr, len);
		}
		/*
		 * For range that intersects with the found mapping
		 * write to (Device DAX) deep_flush file.
		 * Call msync for the non-intersecting part.
		 */
		if (mt->base_addr > addr) {
			size_t curr_len = mt->base_addr - addr;
			if (curr_len > len)
				curr_len = len;
			if (pmem_msync((void *)addr, curr_len) != 0)
				return -1;
			len -= curr_len;
			if (len == 0)
				return 0;
			addr = mt->base_addr;
		}
		size_t mt_in_len = mt->end_addr - addr;
		size_t persist_len = MIN(len, mt_in_len);

		if (os_deep_type(mt, (void *)addr, persist_len))
			return -1;

		if (mt->end_addr >= addr + len)
			return 0;

		len -= mt_in_len;
		addr = mt->end_addr;
	}
	return 0;
}

/*
 * os_part_deep_common -- common function to handle both
 * deep_persist and deep_drain part flush cases.
 */
int
os_part_deep_common(struct pool_replica *rep, unsigned partidx, void *addr,
			size_t len, int flush)
{
	LOG(3, "part %p part %d addr %p len %lu flush %d",
		rep, partidx, addr, len, flush);

	if (!rep->is_pmem) {
		/*
		 * In case of part on non-pmem call msync on the range
		 * to deep flush the data. Deep drain is empty as all
		 * data is msynced to persistence.
		 */

		if (!flush)
			return 0;

		if (pmem_msync(addr, len)) {
			LOG(1, "pmem_msync(%p, %lu)", addr, len);
			return -1;
		}
		return 0;
	}
	struct pool_set_part part = rep->part[partidx];
	/* Call deep flush if it was requested */
	if (flush) {
		LOG(15, "pmem_deep_flush addr %p, len %lu", addr, len);
		pmem_deep_flush(addr, len);
	}
	/*
	 * Before deep drain call normal drain to ensure that data
	 * is at least in WPQ.
	 */
	pmem_drain();

	if (part.is_dev_dax) {
		/*
		 * During deep_drain for part on device DAX search for
		 * device region id, and perform WPQ flush on found
		 * device DAX region.
		 */
		int region_id = util_ddax_region_find(part.path);

		if (region_id < 0) {
			if (errno == ENOENT) {
				errno = ENOTSUP;
				LOG(1, "!deep_flush not supported");
			} else {
				LOG(1, "invalid dax_region id %d", region_id);
			}
			return -1;
		}

		if (os_deep_flush_write(region_id)) {
			LOG(1, "ddax_deep_flush_write(%d)",
				region_id);
			return -1;
		}
	} else {
		/*
		 * For deep_drain on normal pmem it is enough to
		 * call msync on one page.
		 */
		if (pmem_msync(addr, MIN(Pagesize, len))) {
			LOG(1, "pmem_msync(%p, %lu)", addr, len);
			return -1;
		}
	}
	return 0;
}