Blob Blame History Raw
/*
 * Copyright 2014-2018, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *
 *     * Neither the name of the copyright holder nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * file.c -- file utilities
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/file.h>
#include <sys/mman.h>

#if !defined(_WIN32) && !defined(__FreeBSD__)
#include <sys/sysmacros.h>
#endif

#include "file.h"
#include "os.h"
#include "out.h"
#include "mmap.h"

#define DEVICE_DAX_PREFIX "/sys/class/dax"
#define MAX_SIZE_LENGTH 64

#define DEVICE_DAX_ZERO_LEN (2 * MEGABYTE)

#ifndef _WIN32
/*
 * device_dax_size -- (internal) checks the size of a given dax device
 */
static ssize_t
device_dax_size(const char *path)
{
	LOG(3, "path \"%s\"", path);

	os_stat_t st;
	int olderrno;

	if (os_stat(path, &st) < 0) {
		ERR("!stat \"%s\"", path);
		return -1;
	}

	char spath[PATH_MAX];
	snprintf(spath, PATH_MAX, "/sys/dev/char/%u:%u/size",
		os_major(st.st_rdev), os_minor(st.st_rdev));

	LOG(4, "device size path \"%s\"", spath);

	int fd = os_open(spath, O_RDONLY);
	if (fd < 0) {
		ERR("!open \"%s\"", spath);
		return -1;
	}

	ssize_t size = -1;

	char sizebuf[MAX_SIZE_LENGTH + 1];
	ssize_t nread;
	if ((nread = read(fd, sizebuf, MAX_SIZE_LENGTH)) < 0) {
		ERR("!read");
		goto out;
	}

	sizebuf[nread] = 0; /* null termination */

	char *endptr;

	olderrno = errno;
	errno = 0;

	size = strtoll(sizebuf, &endptr, 0);
	if (endptr == sizebuf || *endptr != '\n' ||
	    ((size == LLONG_MAX || size == LLONG_MIN) && errno == ERANGE)) {
		ERR("invalid device size %s", sizebuf);
		size = -1;
		goto out;
	}

	errno = olderrno;

out:
	olderrno = errno;
	(void) os_close(fd);
	errno = olderrno;

	LOG(4, "device size %zu", size);
	return size;
}
#endif

/*
 * util_file_exists -- checks whether file exists
 */
int
util_file_exists(const char *path)
{
	LOG(3, "path \"%s\"", path);

	if (os_access(path, F_OK) == 0)
		return 1;

	if (errno != ENOENT) {
		ERR("!os_access \"%s\"", path);
		return -1;
	}

	/*
	 * ENOENT means that some component of a pathname does not exists.
	 *
	 * XXX - we should also call os_access on parent directory and
	 * if this also results in ENOENT -1 should be returned.
	 *
	 * The problem is that we would need to use realpath, which fails
	 * if file does not exist.
	 */

	return 0;
}

/*
 * get_file_type_internal -- (internal) checks whether stat structure describes
 *			 device dax or a normal file
 */
static enum file_type
get_file_type_internal(os_stat_t *st)
{
#ifdef _WIN32
	return TYPE_NORMAL;
#else
	if (!S_ISCHR(st->st_mode)) {
		LOG(4, "not a character device");
		return TYPE_NORMAL;
	}

	char spath[PATH_MAX];
	snprintf(spath, PATH_MAX, "/sys/dev/char/%u:%u/subsystem",
		os_major(st->st_rdev), os_minor(st->st_rdev));

	LOG(4, "device subsystem path \"%s\"", spath);

	char npath[PATH_MAX];
	char *rpath = realpath(spath, npath);
	if (rpath == NULL) {
		ERR("!realpath \"%s\"", spath);
		return OTHER_ERROR;
	}

	if (strcmp(DEVICE_DAX_PREFIX, rpath) != 0) {
		LOG(3, "%s path does not match device dax prefix path", rpath);
		errno = EINVAL;
		return OTHER_ERROR;
	}

	return TYPE_DEVDAX;
#endif
}

/*
 * util_fd_get_type -- checks whether a file descriptor is associated
 *		       with a device dax or a normal file
 */
enum file_type
util_fd_get_type(int fd)
{
	LOG(3, "fd %d", fd);

#ifdef _WIN32
	return TYPE_NORMAL;
#else
	os_stat_t st;

	if (os_fstat(fd, &st) < 0) {
		ERR("!fstat");
		return OTHER_ERROR;
	}

	return get_file_type_internal(&st);
#endif
}

/*
 * util_file_get_type -- checks whether the path points to a device dax,
 *			 normal file or non-existent file
 */
enum file_type
util_file_get_type(const char *path)
{
	LOG(3, "path \"%s\"", path);

	if (path == NULL) {
		ERR("invalid (NULL) path");
		errno = EINVAL;
		return OTHER_ERROR;
	}

	int exists = util_file_exists(path);
	if (exists < 0)
		return OTHER_ERROR;

	if (!exists)
		return NOT_EXISTS;

#ifdef _WIN32
	return TYPE_NORMAL;
#else
	os_stat_t st;

	if (os_stat(path, &st) < 0) {
		ERR("!stat");
		return OTHER_ERROR;
	}

	return get_file_type_internal(&st);
#endif
}

/*
 * util_file_get_size -- returns size of a file
 */
ssize_t
util_file_get_size(const char *path)
{
	LOG(3, "path \"%s\"", path);

	int file_type = util_file_get_type(path);
	if (file_type < 0)
		return -1;

#ifndef _WIN32
	if (file_type == TYPE_DEVDAX) {
		return device_dax_size(path);
	}
#endif

	os_stat_t stbuf;
	if (os_stat(path, &stbuf) < 0) {
		ERR("!stat \"%s\"", path);
		return -1;
	}

	LOG(4, "file length %zu", stbuf.st_size);
	return stbuf.st_size;
}

/*
 * util_file_map_whole -- maps the entire file into memory
 */
void *
util_file_map_whole(const char *path)
{
	LOG(3, "path \"%s\"", path);

	int fd;
	int olderrno;
	void *addr = NULL;

	if ((fd = os_open(path, O_RDWR)) < 0) {
		ERR("!open \"%s\"", path);
		return NULL;
	}

	ssize_t size = util_file_get_size(path);
	if (size < 0) {
		LOG(2, "cannot determine file length \"%s\"", path);
		goto out;
	}

	addr = util_map(fd, (size_t)size, MAP_SHARED, 0, 0, NULL);
	if (addr == NULL) {
		LOG(2, "failed to map entire file \"%s\"", path);
		goto out;
	}

out:
	olderrno = errno;
	(void) os_close(fd);
	errno = olderrno;

	return addr;
}

/*
 * util_file_zero -- zeroes the specified region of the file
 */
int
util_file_zero(const char *path, os_off_t off, size_t len)
{
	LOG(3, "path \"%s\" off %ju len %zu", path, off, len);

	int fd;
	int olderrno;
	int ret = 0;

	if ((fd = os_open(path, O_RDWR)) < 0) {
		ERR("!open \"%s\"", path);
		return -1;
	}

	ssize_t size = util_file_get_size(path);
	if (size < 0) {
		LOG(2, "cannot determine file length \"%s\"", path);
		ret = -1;
		goto out;
	}

	if (off > size) {
		LOG(2, "offset beyond file length, %ju > %ju", off, size);
		ret = -1;
		goto out;
	}

	if ((size_t)off + len > (size_t)size) {
		LOG(2, "requested size of write goes beyond the file length, "
					"%zu > %zu", (size_t)off + len, size);
		LOG(4, "adjusting len to %zu", size - off);
		len = (size_t)(size - off);
	}

	void *addr = util_map(fd, (size_t)size, MAP_SHARED, 0, 0, NULL);
	if (addr == NULL) {
		LOG(2, "failed to map entire file \"%s\"", path);
		ret = -1;
		goto out;
	}

	/* zero initialize the specified region */
	memset((char *)addr + off, 0, len);

	util_unmap(addr, (size_t)size);

out:
	olderrno = errno;
	(void) os_close(fd);
	errno = olderrno;

	return ret;
}

/*
 * util_file_pwrite -- writes to a file with an offset
 */
ssize_t
util_file_pwrite(const char *path, const void *buffer, size_t size,
	os_off_t offset)
{
	LOG(3, "path \"%s\" buffer %p size %zu offset %ju",
			path, buffer, size, offset);

	enum file_type type = util_file_get_type(path);
	if (type < 0)
		return -1;

	if (type == TYPE_NORMAL) {
		int fd = util_file_open(path, NULL, 0, O_RDWR);
		if (fd < 0) {
			LOG(2, "failed to open file \"%s\"", path);
			return -1;
		}

		ssize_t write_len = pwrite(fd, buffer, size, offset);
		int olderrno = errno;
		(void) os_close(fd);
		errno = olderrno;
		return write_len;
	}

	ssize_t file_size = util_file_get_size(path);
	if (file_size < 0) {
		LOG(2, "cannot determine file length \"%s\"", path);
		return -1;
	}

	size_t max_size = (size_t)(file_size - offset);
	if (size > max_size) {
		LOG(2, "requested size of write goes beyond the file length, "
			"%zu > %zu", size, max_size);
		LOG(4, "adjusting size to %zu", max_size);
		size = max_size;
	}

	void *addr = util_file_map_whole(path);
	if (addr == NULL) {
		LOG(2, "failed to map entire file \"%s\"", path);
		return -1;
	}

	memcpy(ADDR_SUM(addr, offset), buffer, size);
	util_unmap(addr, (size_t)file_size);
	return (ssize_t)size;
}

/*
 * util_file_pread -- reads from a file with an offset
 */
ssize_t
util_file_pread(const char *path, void *buffer, size_t size,
	os_off_t offset)
{
	LOG(3, "path \"%s\" buffer %p size %zu offset %ju",
			path, buffer, size, offset);

	enum file_type type = util_file_get_type(path);
	if (type < 0)
		return -1;

	if (type == TYPE_NORMAL) {
		int fd = util_file_open(path, NULL, 0, O_RDONLY);
		if (fd < 0) {
			LOG(2, "failed to open file \"%s\"", path);
			return -1;
		}

		ssize_t read_len = pread(fd, buffer, size, offset);
		int olderrno = errno;
		(void) os_close(fd);
		errno = olderrno;
		return read_len;
	}

	ssize_t file_size = util_file_get_size(path);
	if (file_size < 0) {
		LOG(2, "cannot determine file length \"%s\"", path);
		return -1;
	}

	size_t max_size = (size_t)(file_size - offset);
	if (size > max_size) {
		LOG(2, "requested size of read goes beyond the file length, "
			"%zu > %zu", size, max_size);
		LOG(4, "adjusting size to %zu", max_size);
		size = max_size;
	}

	void *addr = util_file_map_whole(path);
	if (addr == NULL) {
		LOG(2, "failed to map entire file \"%s\"", path);
		return -1;
	}

	memcpy(buffer, ADDR_SUM(addr, offset), size);
	util_unmap(addr, (size_t)file_size);
	return (ssize_t)size;
}

/*
 * util_file_create -- create a new memory pool file
 */
int
util_file_create(const char *path, size_t size, size_t minsize)
{
	LOG(3, "path \"%s\" size %zu minsize %zu", path, size, minsize);

	ASSERTne(size, 0);

	if (size < minsize) {
		ERR("size %zu smaller than %zu", size, minsize);
		errno = EINVAL;
		return -1;
	}

	if (((os_off_t)size) < 0) {
		ERR("invalid size (%zu) for os_off_t", size);
		errno = EFBIG;
		return -1;
	}

	int fd;
	int mode;
	int flags = O_RDWR | O_CREAT | O_EXCL;
#ifndef _WIN32
	mode = 0;
#else
	mode = S_IWRITE | S_IREAD;
	flags |= O_BINARY;
#endif

	/*
	 * Create file without any permission. It will be granted once
	 * initialization completes.
	 */
	if ((fd = os_open(path, flags, mode)) < 0) {
		ERR("!open \"%s\"", path);
		return -1;
	}

	if ((errno = os_posix_fallocate(fd, 0, (os_off_t)size)) != 0) {
		ERR("!posix_fallocate \"%s\", %zu", path, size);
		goto err;
	}

	/* for windows we can't flock until after we fallocate */
	if (os_flock(fd, OS_LOCK_EX | OS_LOCK_NB) < 0) {
		ERR("!flock \"%s\"", path);
		goto err;
	}

	return fd;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	if (fd != -1)
		(void) os_close(fd);
	os_unlink(path);
	errno = oerrno;
	return -1;
}

/*
 * util_file_open -- open a memory pool file
 */
int
util_file_open(const char *path, size_t *size, size_t minsize, int flags)
{
	LOG(3, "path \"%s\" size %p minsize %zu flags %d", path, size, minsize,
			flags);

	int oerrno;
	int fd;

#ifdef _WIN32
	flags |= O_BINARY;
#endif

	if ((fd = os_open(path, flags)) < 0) {
		ERR("!open \"%s\"", path);
		return -1;
	}

	if (os_flock(fd, OS_LOCK_EX | OS_LOCK_NB) < 0) {
		ERR("!flock \"%s\"", path);
		(void) os_close(fd);
		return -1;
	}

	if (size || minsize) {
		if (size)
			ASSERTeq(*size, 0);

		ssize_t actual_size = util_file_get_size(path);
		if (actual_size < 0) {
			ERR("stat \"%s\": negative size", path);
			errno = EINVAL;
			goto err;
		}

		if ((size_t)actual_size < minsize) {
			ERR("size %zu smaller than %zu",
					(size_t)actual_size, minsize);
			errno = EINVAL;
			goto err;
		}

		if (size) {
			*size = (size_t)actual_size;
			LOG(4, "actual file size %zu", *size);
		}
	}

	return fd;
err:
	oerrno = errno;
	if (os_flock(fd, OS_LOCK_UN))
		ERR("!flock unlock");
	(void) os_close(fd);
	errno = oerrno;
	return -1;
}

/*
 * util_unlink -- unlinks a file or zeroes a device dax
 */
int
util_unlink(const char *path)
{
	LOG(3, "path \"%s\"", path);

	enum file_type type = util_file_get_type(path);
	if (type < 0)
		return -1;

	if (type == TYPE_DEVDAX) {
		return util_file_zero(path, 0, DEVICE_DAX_ZERO_LEN);
	} else {
#ifdef _WIN32
		/* on Windows we can not unlink Read-Only files */
		if (os_chmod(path, S_IREAD | S_IWRITE) == -1) {
			ERR("!chmod \"%s\"", path);
			return -1;
		}
#endif
		return os_unlink(path);
	}
}

/*
 * util_unlink_flock -- flocks the file and unlinks it
 *
 * The unlink(2) call on a file which is opened and locked using flock(2)
 * by different process works on linux. Thus in order to forbid removing a
 * pool when in use by different process we need to flock(2) the pool files
 * first before unlinking.
 */
int
util_unlink_flock(const char *path)
{
	LOG(3, "path \"%s\"", path);

#ifdef WIN32
	/*
	 * On Windows it is not possible to unlink the
	 * file if it is flocked.
	 */
	return util_unlink(path);
#else
	int fd = util_file_open(path, NULL, 0, O_RDONLY);
	if (fd < 0) {
		LOG(2, "failed to open file \"%s\"", path);
		return -1;
	}

	int ret = util_unlink(path);

	(void) os_close(fd);

	return ret;
#endif
}

/*
 * util_write_all -- a wrapper for util_write
 *
 * writes exactly count bytes from buf to file referred to by fd
 * returns -1 on error, 0 otherwise
 */
int
util_write_all(int fd, const char *buf, size_t count)
{
	ssize_t n_wrote = 0;
	size_t total = 0;

	while (count > total) {
		n_wrote = util_write(fd, buf, count - total);
		if (n_wrote <= 0)
			return -1;

		buf += (size_t)n_wrote;
		total += (size_t)n_wrote;
	}

	return 0;
}