Blob Blame History Raw
/*
 * libhugetlbfs - Easy use of Linux hugepages
 * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define _LARGEFILE64_SOURCE /* Need this for statfs64 */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <features.h>

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <signal.h>
#include <dirent.h>

#include <unistd.h>
#include <fcntl.h>
#include <sys/vfs.h>
#include <sys/statfs.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/syscall.h>
#include <linux/types.h>
#include <linux/unistd.h>
#include <dirent.h>

#include "libhugetlbfs_internal.h"
#include "hugetlbfs.h"

/* Library-wide option block; filled in by hugetlbfs_setup_env(). */
struct libhugeopts_t __hugetlb_opts;

/* Set to ENOSYS by gethugepagesize() when no default size is available. */
static int hugepagesize_errno; /* = 0 */

/* Upper bound on the number of distinct huge page sizes tracked below. */
#define MAX_HPAGE_SIZES 10
/* Known huge page sizes together with the mount point used for each. */
static struct hpage_size hpage_sizes[MAX_HPAGE_SIZES];
/* Number of valid entries in hpage_sizes[]. */
static int nr_hpage_sizes;
/* Index into hpage_sizes[] of the default size, or -1 when there is none. */
static int hpage_sizes_default_idx = -1;

/* Cache for kernel_default_hugepage_size(); 0 means "not looked up yet". */
static long default_size;

/********************************************************************/
/* Internal functions                                               */
/********************************************************************/

/*
 * Lookup the kernel default huge page size.
 *
 * The value comes from the "Hugepagesize:" entry of MEMINFO, which is
 * converted from kB to bytes and cached in default_size.  The file is only
 * read while the cache still holds 0.
 *
 * Returns the size in bytes, or -1 if the entry could not be read.  A
 * failure is cached too, until kernel_default_hugepage_size_reset() is
 * called.
 */
long kernel_default_hugepage_size()
{
	long kb;

	if (default_size != 0)
		return default_size;

	kb = file_read_ulong(MEMINFO, "Hugepagesize:");
	if (kb < 0) {
		/*
		 * Report the failure directly instead of feeding the error
		 * code through the unit conversion and relying on unsigned
		 * wraparound to turn it back into -1.
		 */
		default_size = -1;
		return default_size;
	}

	default_size = size_to_smaller_unit(kb); /* kB to B */
	return default_size;
}
/*
 * Forget the cached default huge page size so that the next call to
 * kernel_default_hugepage_size() reads it again.
 */
void kernel_default_hugepage_size_reset(void)
{
	default_size = 0;
}

/* Generic buffer size; no user is visible in this part of the file. */
#define BUF_SZ 256
/* Size of the buffer file_read_ulong() reads a status file into. */
#define MEMINFO_SIZE	2048

/*
 * Convert a quantity in a given unit to the next smallest unit by
 * multiplying the quantity by 1024 (eg. convert 1MB to 1024kB).
 * If the conversion would overflow the variable, return ULLONG_MAX to
 * signify the error.
 */
unsigned long long size_to_smaller_unit(unsigned long long size)
{
	/*
	 * Compare against the largest multiplicand that still fits.  Testing
	 * whether the wrapped product is smaller than the input is not
	 * enough: some overflowing products wrap to a value that is larger
	 * than the input and would go unnoticed.
	 */
	if (size > ULLONG_MAX / 1024)
		return ULLONG_MAX;

	return size * 1024;
}

/*
 * Convert a page size string with an optional unit suffix into a page size
 * in bytes.
 *
 * The number is parsed with strtol() base 0, so decimal, octal and hex
 * notations are accepted.  The character following the number selects the
 * unit: 'K'/'k', 'M'/'m' or 'G'/'g'.  Any other character (including the
 * end of the string) means the value is already in bytes.
 *
 * On error, -1 is returned and errno is set appropriately:
 * 	EINVAL		- str could not be parsed or was not greater than zero
 *	EOVERFLOW	- Overflow when converting from the specified units
 * On success errno is left at 0.
 */
long parse_page_size(const char *str)
{
	char *pos;
	long size;
	int steps;	/* number of x1024 conversions still to apply */

	errno = 0;
	size = strtol(str, &pos, 0);
	/* Catch strtol errors and sizes that overflow the native word size */
	if (errno || str == pos || size <= 0) {
		if (errno == ERANGE)
			errno = EOVERFLOW;
		else
			errno = EINVAL;
		return -1;
	}

	switch (*pos) {
	case 'G':
	case 'g':
		steps = 3;
		break;
	case 'M':
	case 'm':
		steps = 2;
		break;
	case 'K':
	case 'k':
		steps = 1;
		break;
	default:
		steps = 0;
		break;
	}

	/*
	 * Scale in the return type and check before every multiplication so
	 * that an overflow can neither wrap back to a plausible positive
	 * size nor leak out as a negative value other than -1.
	 */
	while (steps-- > 0) {
		if (size > LONG_MAX / 1024) {
			errno = EOVERFLOW;
			return -1;
		}
		size *= 1024;
	}

	return size;
}

/* Where one huge page pool counter can be found. */
struct hugetlb_pool_counter_info_t {
	char *meminfo_key;	/* tag to search for in MEMINFO, or NULL */
	char *sysfs_file;	/* file name appended to the sysfs/proc directory */
};

/*
 * Lookup table indexed by the HUGEPAGES_* counter identifiers; consulted by
 * select_pool_counter().
 */
static struct hugetlb_pool_counter_info_t hugetlb_counter_info[] = {
	[HUGEPAGES_TOTAL] = {
		.meminfo_key	= "HugePages_Total:",
		.sysfs_file	= "nr_hugepages",
	},
	/* Shares the meminfo tag of HUGEPAGES_TOTAL but uses its own file. */
	[HUGEPAGES_TOTAL_MEMPOL] = {
		.meminfo_key	= "HugePages_Total:",
		.sysfs_file	= "nr_hugepages_mempolicy",
	},
	[HUGEPAGES_FREE] = {
		.meminfo_key	= "HugePages_Free:",
		.sysfs_file	= "free_hugepages",
	},
	[HUGEPAGES_RSVD] = {
		.meminfo_key	= "HugePages_Rsvd:",
		.sysfs_file	= "resv_hugepages",
	},
	[HUGEPAGES_SURP] = {
		.meminfo_key	= "HugePages_Surp:",
		.sysfs_file	= "surplus_hugepages",
	},
	/* No meminfo tag: this counter is always read from its own file. */
	[HUGEPAGES_OC] = {
		.meminfo_key	= NULL,
		.sysfs_file	= "nr_overcommit_hugepages"
	},
};

/*
 * Read numeric data from raw and tagged kernel status files.  Used to read
 * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag).
 *
 * file: path of the file to read; it must fit in MEMINFO_SIZE - 1 bytes.
 * tag:  if non-NULL, the number is parsed right after the first occurrence
 *       of this string; otherwise parsing starts at the top of the file.
 *
 * Returns the parsed value, or -1 if the file cannot be opened or read, is
 * too large, lacks the tag, or does not hold a number followed by
 * whitespace at the expected position.  The caller's errno is preserved
 * across the number parsing.
 */
long file_read_ulong(char *file, const char *tag)
{
	int fd;
	char buf[MEMINFO_SIZE];
	int len, readerr, saved_errno, out_of_range;
	char *p, *q;
	long val;

	fd = open(file, O_RDONLY);
	if (fd < 0) {
		ERROR("Couldn't open %s: %s\n", file, strerror(errno));
		return -1;
	}

	len = read(fd, buf, sizeof(buf));
	readerr = errno;	/* close() may overwrite errno */
	close(fd);
	if (len < 0) {
		ERROR("Error reading %s: %s\n", file, strerror(readerr));
		return -1;
	}
	if ((size_t)len == sizeof(buf)) {
		ERROR("%s is too large\n", file);
		return -1;
	}
	buf[len] = '\0';

	/* Search for a tag if provided */
	if (tag) {
		p = strstr(buf, tag);
		if (!p)
			return -1; /* looks like the line we want isn't there */
		p += strlen(tag);
	} else
		p = buf;

	/* Detect range errors without disturbing the caller's errno. */
	saved_errno = errno;
	errno = 0;
	val = strtol(p, &q, 0);
	out_of_range = (errno == ERANGE);
	errno = saved_errno;

	/*
	 * q == p means no digits were consumed at all.  The cast keeps a
	 * negative char value from reaching isspace().
	 */
	if (q == p || out_of_range || !isspace((unsigned char)*q)) {
		ERROR("Couldn't parse %s value\n", file);
		return -1;
	}

	return val;
}

/*
 * Write val, formatted as an unsigned decimal number, to an existing file.
 *
 * Returns 0 when the whole number was written, -1 if the file could not be
 * opened or the write failed or was short.
 */
int file_write_ulong(char *file, unsigned long val)
{
	int fd, buflen;
	ssize_t ret;
	/*
	 * Roughly one decimal digit per three bits, plus the terminator.
	 * A fixed 20 byte buffer is one byte short for a 20 digit 64-bit
	 * value.
	 */
	char buf[sizeof(unsigned long) * CHAR_BIT / 3 + 2];

	fd = open(file, O_WRONLY);
	if (fd < 0) {
		ERROR("Couldn't open %s: %s\n", file, strerror(errno));
		return -1;
	}

	buflen = snprintf(buf, sizeof(buf), "%lu", val);
	ret = write(fd, buf, buflen);
	close(fd);

	/* A partially written number is not a success. */
	return ret == buflen ? 0 : -1;
}


/*
 * Return the name of this executable, using buf as temporary space.
 *
 * buf:  scratch buffer that also holds the returned string
 * size: size of buf in bytes
 *
 * The returned pointer refers into buf.  It is the empty string when
 * /proc/self/cmdline cannot be opened or read.
 */
#define MAX_EXE 4096
static char *get_exe_name(char *buf, int size)
{
	char *p;
	int fd;
	int readerr;
	ssize_t nread;

	buf[0] = 0;
	fd = open("/proc/self/cmdline", O_RDONLY);
	if (fd < 0) {
		WARNING("Unable to open cmdline, no exe name\n");
		return buf;
	}
	nread = read(fd, buf, size-1);
	readerr = errno;	/* close() may overwrite errno */
	close(fd);

	if (nread < 0) {
		WARNING("Error %d reading cmdline, no exe name\n", readerr);
		return buf;
	}
	if (nread == 0) {
		WARNING("Read zero bytes from cmdline, no exe name\n");
		return buf;
	}

	buf[nread] = 0; /* make sure we're null terminated */
	/*
	 * Take advantage of cmdline being a series of null-terminated
	 * strings.  The first string is the path to the executable in
	 * the form:
	 *
	 *      /path/to/exe
	 *
	 * The exe name starts one character after the last '/'.
	 */
	p = strrchr(buf, '/');
	if (!p)
		return buf;
	return p + 1;           /* skip over "/" */
}


/*
 * Reads the contents of hugetlb environment variables and save their
 * values for later use.
 *
 * Results are stored in __hugetlb_opts and in the __hugetlbfs_verbose,
 * __hugetlbfs_debug and __hugetlbfs_prefault globals.
 *
 * If HUGETLB_RESTRICT_EXE is set it is taken as a ':' separated list of
 * executable names.  When the current executable is not on that list the
 * function returns right away and none of the later variables are read.
 */
void hugetlbfs_setup_env()
{
	char *env;

	__hugetlb_opts.min_copy = true;

	env = getenv("HUGETLB_VERBOSE");
	if (env)
		__hugetlbfs_verbose = atoi(env);

	env = getenv("HUGETLB_DEBUG");
	if (env) {
		__hugetlbfs_debug = true;
		__hugetlbfs_verbose = VERBOSE_DEBUG;
	}

	env = getenv("HUGETLB_RESTRICT_EXE");
	if (env) {
		char *p, *tok, *exe, buf[MAX_EXE+1], restriction[MAX_EXE];
		char *save = NULL;
		int found = 0;

		exe = get_exe_name(buf, sizeof buf);
		DEBUG("Found HUGETLB_RESTRICT_EXE, this exe is \"%s\"\n", exe);
		/* Tokenise a private copy; the environment stays untouched */
		strncpy(restriction, env, sizeof restriction);
		restriction[sizeof(restriction)-1] = 0;
		/*
		 * strtok_r keeps its position in 'save', so the host
		 * program's own strtok() state is not disturbed.
		 */
		for (p = restriction;
		     (tok = strtok_r(p, ":", &save)) != NULL; p = NULL) {
			DEBUG("  ...check exe match for \"%s\"\n",  tok);
			if (strcmp(tok, exe) == 0) {
				found = 1;
				DEBUG("exe match - libhugetlbfs is active for this exe\n");
				break;
			}
		}
		if (!found) {
			DEBUG("No exe match - libhugetlbfs is inactive for this exe\n");
			return;
		}
	}

	env = getenv("HUGETLB_NO_PREFAULT");
	if (env)
		__hugetlbfs_prefault = false;

	/* These options are kept as raw strings for later interpretation */
	__hugetlb_opts.share_path = getenv("HUGETLB_SHARE_PATH");
	__hugetlb_opts.elfmap = getenv("HUGETLB_ELFMAP");
	__hugetlb_opts.ld_preload = getenv("LD_PRELOAD");
	__hugetlb_opts.def_page_size = getenv("HUGETLB_DEFAULT_PAGE_SIZE");
	__hugetlb_opts.path = getenv("HUGETLB_PATH");
	__hugetlb_opts.features = getenv("HUGETLB_FEATURES");
	__hugetlb_opts.morecore = getenv("HUGETLB_MORECORE");
	__hugetlb_opts.heapbase = getenv("HUGETLB_MORECORE_HEAPBASE");

	if (__hugetlb_opts.morecore)
		__hugetlb_opts.thp_morecore =
			(strcasecmp(__hugetlb_opts.morecore, "thp") == 0);

	if (__hugetlb_opts.thp_morecore && __hugetlb_opts.heapbase) {
		DEBUG("Heapbase specified with THP for morecore, ignoring heapbase\n");
		__hugetlb_opts.heapbase = NULL;
	}

	env = getenv("HUGETLB_FORCE_ELFMAP");
	if (env && (strcasecmp(env, "yes") == 0))
		__hugetlb_opts.force_elfmap = 1;

	env = getenv("HUGETLB_MINIMAL_COPY");
	if (__hugetlb_opts.min_copy && env && (strcasecmp(env, "no") == 0)) {
		INFO("HUGETLB_MINIMAL_COPY=%s, disabling filesz copy "
			"optimization\n", env);
		__hugetlb_opts.min_copy = false;
	}

	env = getenv("HUGETLB_SHARE");
	if (env)
		__hugetlb_opts.sharing = atoi(env);

	/*
	 * We have been seeing some unexpected behavior from malloc when
	 * heap shrinking is enabled, so heap shrinking is disabled by
	 * default.
	 *
	 * If malloc has been called successfully before setup_morecore,
	 * glibc will notice a gap between the previous top-of-heap and
	 * the new top-of-heap when it calls hugetlbfs_morecore.  It treats
	 * this as a "foreign sbrk."  Unfortunately, the "foreign sbrk"
	 * handling code will then immediately try to free the memory
	 * allocated by hugetlbfs_morecore!
	 *
	 * This behavior has been reported to the ptmalloc2 maintainer,
	 * along with a patch to correct the behavior.
	 */
	env = getenv("HUGETLB_MORECORE_SHRINK");
	if (env && strcasecmp(env, "yes") == 0)
		__hugetlb_opts.shrink_ok = true;

	/* Determine if shmget() calls should be overridden */
	env = getenv("HUGETLB_SHM");
	if (env && !strcasecmp(env, "yes"))
		__hugetlb_opts.shm_enabled = true;

	/* Determine if all reservations should be avoided */
	env = getenv("HUGETLB_NO_RESERVE");
	if (env && !strcasecmp(env, "yes"))
		__hugetlb_opts.no_reserve = true;
}

void hugetlbfs_setup_kernel_page_size()
{
	long page_size = kernel_default_hugepage_size();

	if (page_size <= 0) {
		WARNING("Unable to find default kernel huge page size\n");
		return;
	}

	INFO("Found pagesize %ld kB\n", page_size / 1024);
	hpage_sizes[0].pagesize = page_size;

	nr_hpage_sizes = 1;
}

/*
 * Turn heap prefaulting off when the kernel reports the
 * HUGETLB_FEATURE_PRIVATE_RESV feature.
 */
void hugetlbfs_check_priv_resv()
{
	/* Feature absent or unknown: keep the current prefault setting */
	if (hugetlbfs_test_feature(HUGETLB_FEATURE_PRIVATE_RESV) <= 0)
		return;

	/*
	 * With MAP_PRIVATE reservations the kernel guarantees the huge
	 * pages we allocate, so there is no need to prefault them.  This
	 * can help NUMA performance quite a bit.
	 */
	INFO("Kernel has MAP_PRIVATE reservations.  Disabling "
		"heap prefaulting.\n");
	__hugetlbfs_prefault = false;
}

/*
 * Withdraw a requested no_reserve option when the kernel does not report
 * the HUGETLB_FEATURE_SAFE_NORESERVE feature.
 */
void hugetlbfs_check_safe_noreserve()
{
	/* Nothing to do unless the user asked to avoid reservations */
	if (!__hugetlb_opts.no_reserve)
		return;

	if (hugetlbfs_test_feature(HUGETLB_FEATURE_SAFE_NORESERVE) > 0)
		return;

	/*
	 * Some kernels will trigger an OOM if MAP_NORESERVE is used and a
	 * huge page allocation fails, so fall back to using reservations
	 * on those.
	 */
	INFO("Kernel is not safe for MAP_NORESERVE. Forcing "
		"use of reservations.\n");
	__hugetlb_opts.no_reserve = false;
}

/*
 * Record in __hugetlb_opts.map_hugetlb whether MAP_HUGETLB mappings may be
 * used.  The check is compiled in only when the headers define MAP_HUGETLB;
 * otherwise this function does nothing.
 */
void hugetlbfs_check_map_hugetlb()
{
/*
 * FIXME: MAP_HUGETLB has not been picked up by glibc so even though the
 * kernel may support it, without the userspace mmap flag it cannot be
 * used.  This ifdef should be removed when the MAP_HUGETLB flag makes it
 * into glibc.
 */
#ifdef MAP_HUGETLB
	if (hugetlbfs_test_feature(HUGETLB_FEATURE_MAP_HUGETLB) <= 0)
		return;

	/*
	 * Kernels after 2.6.32 can mmap pseudo-anonymous regions backed by
	 * huge pages; use that for huge pages we don't intend to share.
	 */
	INFO("Kernel supports MAP_HUGETLB\n");
	__hugetlb_opts.map_hugetlb = true;
#endif
}

/*
 * Pool counters are typically exposed in sysfs in modern kernels, the
 * counters for the default page size are exposed in procfs in all kernels
 * supporting hugepages.  Given a specific counter (e.g. HUGEPAGES_RSVD)
 * and a page size return both a filename and an optional tag to locate
 * and extract this counter.
 */
static int select_pool_counter(unsigned int counter, unsigned long pagesize,
				char *filename, char **key)
{
	long default_size;
	char *meminfo_key;
	char *sysfs_file;

	if (counter >= HUGEPAGES_MAX_COUNTERS) {
		ERROR("Invalid counter specified\n");
		return -1;
	}

	meminfo_key = hugetlb_counter_info[counter].meminfo_key;
	sysfs_file = hugetlb_counter_info[counter].sysfs_file;
	if (key)
		*key = NULL;

	/*
	 * Get the meminfo page size.
	 * This could be made more efficient if utility functions were shared
	 * between libhugetlbfs and the test suite.  For now we will just
	 * read /proc/meminfo.
	 */
	default_size = kernel_default_hugepage_size();
	if (default_size < 0) {
		ERROR("Cannot determine the default page size\n");
		return -1;
	}

	/* If the user is dealing in the default page size, we can use /proc */
	if (pagesize == default_size) {
		if (meminfo_key && key) {
			strcpy(filename, MEMINFO);
			*key = meminfo_key;
		} else
			sprintf(filename, PROC_HUGEPAGES_DIR "%s", sysfs_file);
	} else /* Use the sysfs interface */
		sprintf(filename, SYSFS_HUGEPAGES_DIR "hugepages-%lukB/%s",
			pagesize / 1024, sysfs_file);
	return 0;
}

/*
 * Find the hpage_sizes[] slot holding the given page size.
 * Returns the slot index, or -1 if the size is not in the table.
 */
static int hpage_size_to_index(unsigned long size)
{
	int idx = 0;

	while (idx < nr_hpage_sizes) {
		if (hpage_sizes[idx].pagesize == size)
			return idx;
		idx++;
	}

	return -1;
}

/*
 * Choose which entry of hpage_sizes[] acts as the default huge page size
 * and record its index in hpage_sizes_default_idx (-1 when none can be
 * chosen).
 *
 * A non-empty __hugetlb_opts.def_page_size takes precedence over the
 * kernel default.  If the wanted size has no entry in the table, the first
 * entry is used instead.
 */
void probe_default_hpage_size(void)
{
	static const char fallback_msg[] =
		"No mount point found for default huge "
		"page size. Using first available mount "
		"point.\n";
	long wanted;
	int slot;
	int user_override;

	if (nr_hpage_sizes == 0) {
		INFO("No configured huge page sizes\n");
		hpage_sizes_default_idx = -1;
		return;
	}

	/*
	 * Check if the user specified a default size, otherwise use the
	 * system default size as reported by /proc/meminfo.
	 */
	user_override = (__hugetlb_opts.def_page_size &&
				__hugetlb_opts.def_page_size[0] != '\0');
	wanted = user_override
		? parse_page_size(__hugetlb_opts.def_page_size)
		: kernel_default_hugepage_size();

	if (wanted < 0) {
		ERROR("Unable to determine default huge page size\n");
		hpage_sizes_default_idx = -1;
		return;
	}

	slot = hpage_size_to_index(wanted);
	if (slot >= 0) {
		hpage_sizes_default_idx = slot;
		return;
	}

	/*
	 * If the user specified HUGETLB_DEFAULT_PAGE_SIZE, then this
	 * situation will alter semantics and they should receive a
	 * WARNING.  Otherwise, this detail is purely informational.
	 */
	if (user_override)
		WARNING("%s", fallback_msg);
	else
		INFO("%s", fallback_msg);
	hpage_sizes_default_idx = 0;
}

/*
 * Register path as the mount point for its huge page size.
 *
 * path:       candidate hugetlbfs mount point
 * user_mount: non-zero when the path came from the user (HUGETLB_PATH);
 *             only then is a duplicate mount for a size warned about
 *
 * The path is ignored if it is longer than PATH_MAX, is not a hugetlbfs
 * mount, its page size cannot be read, the size table is full, or a mount
 * point is already recorded for that size.
 */
static void add_hugetlbfs_mount(char *path, int user_mount)
{
	int idx;
	long size;

	if (strlen(path) > PATH_MAX)
		return;

	/*
	 * hugetlbfs_test_path() returns -1 on error, 0 for another
	 * filesystem and 1 for hugetlbfs, so test for 1 explicitly.  A
	 * plain !result would let the error case through.
	 */
	if (hugetlbfs_test_path(path) != 1) {
		WARNING("%s is not a hugetlbfs mount point, ignoring\n", path);
		return;
	}

	size = hugetlbfs_test_pagesize(path);
	if (size < 0) {
		WARNING("Unable to detect page size for path %s\n", path);
		return;
	}

	idx = hpage_size_to_index(size);
	if (idx < 0) {
		if (nr_hpage_sizes >= MAX_HPAGE_SIZES) {
			/* size is in bytes; report it in kB as labelled */
			WARNING("Maximum number of huge page sizes exceeded, "
				"ignoring %ldkB page size\n", size / 1024);
			return;
		}

		idx = nr_hpage_sizes;
		hpage_sizes[nr_hpage_sizes++].pagesize = size;
	}

	/* Keep the first mount point recorded for a given size */
	if (strlen(hpage_sizes[idx].mount)) {
		if (user_mount)
			WARNING("Mount point already defined for size %li, "
				"ignoring %s\n", size, path);
		return;
	}

	/*
	 * NOTE(review): presumably the mount field holds PATH_MAX + 1
	 * bytes, matching the length check above -- confirm against the
	 * struct hpage_size definition.
	 */
	strcpy(hpage_sizes[idx].mount, path);
}

/*
 * Log every known huge page size with its mount point at INFO level,
 * marking the entry currently selected as the default.
 */
void debug_show_page_sizes(void)
{
	int i = 0;

	INFO("Detected page sizes:\n");
	while (i < nr_hpage_sizes) {
		const char *marker =
			(i == hpage_sizes_default_idx) ? "(default)" : "";

		INFO("   Size: %li kB %s  Mount: %s\n",
			hpage_sizes[i].pagesize / 1024,
			marker,
			hpage_sizes[i].mount);
		i++;
	}
}

/* Longest mount table line find_mounts() can handle. */
#define LINE_MAXLEN	2048
/*
 * Scan /proc/mounts (or /etc/mtab if that cannot be opened) for hugetlbfs
 * filesystems and register each mount point that passes
 * hugetlbfs_test_path() and is readable, writable and searchable by this
 * process.
 *
 * The file is read in chunks of up to LINE_MAXLEN bytes.  Each chunk is
 * cut at its first newline and the file offset is moved back so the next
 * read starts at the following line.
 */
static void find_mounts(void)
{
	int fd;
	char path[PATH_MAX+1];
	char line[LINE_MAXLEN + 1];
	char *eol;
	char *match;
	char *end;
	int bytes;
	off_t offset;

	fd = open("/proc/mounts", O_RDONLY);
	if (fd < 0) {
		fd = open("/etc/mtab", O_RDONLY);
		if (fd < 0) {
			ERROR("Couldn't open /proc/mounts or /etc/mtab (%s)\n",
				strerror(errno));
			return;
		}
	}

	while ((bytes = read(fd, line, LINE_MAXLEN)) > 0) {
		/*
		 * Terminate right after the data just read.  Terminating at
		 * the end of the buffer would expose stale bytes from an
		 * earlier, longer read (or uninitialised memory) to the
		 * newline search below.
		 */
		line[bytes] = '\0';
		eol = strchr(line, '\n');
		if (!eol) {
			ERROR("Line too long when parsing mounts\n");
			break;
		}

		/*
		 * Truncate the string to just one line and reset the file
		 * to begin reading at the start of the next line.
		 */
		*eol = '\0';
		offset = bytes - (eol + 1 - line);
		lseek(fd, -offset, SEEK_CUR);

		/* Match only hugetlbfs filesystems. */
		match = strstr(line, " hugetlbfs ");
		if (match) {
			/* The mount point starts at the first '/' ... */
			match = strchr(line, '/');
			if (!match)
				continue;
			/* ... and runs up to the next space */
			end = strchr(match, ' ');
			if (!end)
				continue;

			strncpy(path, match, end - match);
			path[end - match] = '\0';
			if ((hugetlbfs_test_path(path) == 1) &&
			    !(access(path, R_OK | W_OK | X_OK)))
				add_hugetlbfs_mount(path, 0);
		}
	}
	close(fd);
}

/*
 * Populate the mount table.
 *
 * When __hugetlb_opts.path is set it is consumed as a ':' separated list
 * of mount points and only those are added; the pointer is advanced
 * element by element and ends up NULL once the list is exhausted.  An
 * over-long element stops processing.  Only when no path list was given
 * are the system's mounted filesystems scanned.
 */
void setup_mounts(void)
{
	int env_list_seen = 0;

	/* If HUGETLB_PATH is set, only add mounts specified there */
	while (__hugetlb_opts.path) {
		char mount[PATH_MAX + 1];
		char *sep = strchrnul(__hugetlb_opts.path, ':');
		size_t len = sep - __hugetlb_opts.path;

		env_list_seen = 1;
		if (len > PATH_MAX) {
			ERROR("Path too long in HUGETLB_PATH -- "
				"ignoring environment\n");
			break;
		}

		/* Copy out just this element of the list */
		memcpy(mount, __hugetlb_opts.path, len);
		mount[len] = '\0';
		add_hugetlbfs_mount(mount, 1);

		/* Step past the ':' separator, or finish at end of string */
		__hugetlb_opts.path = (*sep != '\0') ? sep + 1 : NULL;
	}

	/* Otherwise probe all mounted filesystems */
	if (!env_list_seen)
		find_mounts();
}

/*
 * Fill *pool with the current state of the huge page pool for one size.
 *
 * size: huge page size in bytes
 * pool: receives pagesize, minimum, maximum, size and is_default (0)
 *
 * Returns 1 if *pool was filled in, 0 if the counters did not yield a
 * non-negative static pool size (in which case *pool is untouched).
 */
int get_pool_size(long size, struct hpage_pool *pool)
{
	long overcommit;
	long total, surplus, reserved;		/* previous sample */
	long cur_total = -1;			/* most recent sample */
	long cur_surplus = -1;
	long cur_reserved = -1;
	long nr_static;

	/*
	 * Pick up those values which are basically stable with respect to
	 * the admin; ie. only changed by them.
	 *
	 * The overcommit counter may be negative if this kernel does not
	 * support overcommit; treat that as 0 so max tracks min.
	 */
	overcommit = get_huge_page_counter(size, HUGEPAGES_OC);
	if (overcommit < 0)
		overcommit = 0;

	/* Sample the volatile values until two rounds in a row agree. */
	do {
		total = cur_total;
		surplus = cur_surplus;
		reserved = cur_reserved;

		cur_total = get_huge_page_counter(size, HUGEPAGES_TOTAL);
		cur_surplus = get_huge_page_counter(size, HUGEPAGES_SURP);
		cur_reserved = get_huge_page_counter(size, HUGEPAGES_RSVD);
	} while (total != cur_total || surplus != cur_surplus ||
		 reserved != cur_reserved);

	if (surplus < 0)
		surplus = 0;
	if (reserved < 0)
		reserved = 0;

	nr_static = total - surplus;
	if (nr_static < 0)
		return 0;

	DEBUG("pagesize<%ld> min<%ld> max<%ld> "
		"in-use<%ld>\n",
		size, nr_static, nr_static + overcommit,
		total);
	pool->pagesize = size;
	pool->minimum = nr_static;
	pool->maximum = nr_static + overcommit;
	pool->size = total;
	pool->is_default = 0;

	return 1;
}

/*
 * Collect pool information for every huge page size on the system.
 *
 * pools: array to fill
 * pcnt:  number of elements available in pools
 *
 * The default page size, if known, comes first and is flagged with
 * is_default.  Further sizes are taken from the "hugepages-<N>kB" entries
 * below SYSFS_HUGEPAGES_DIR.
 *
 * Returns the number of entries filled in, or -1 when the array ended up
 * completely full.
 */
int hpool_sizes(struct hpage_pool *pools, int pcnt)
{
	long default_bytes;
	int which = 0;
	DIR *dir;
	struct dirent *entry;

	default_bytes = kernel_default_hugepage_size();
	if (default_bytes >= 0 && which < pcnt)
		if (get_pool_size(default_bytes, &pools[which])) {
			pools[which].is_default = 1;
			which++;
		}

	dir = opendir(SYSFS_HUGEPAGES_DIR);
	if (dir) {
		/*
		 * Stop as soon as the caller's array is full so that
		 * pools[which] is never written past its end.
		 */
		while (which < pcnt && (entry = readdir(dir))) {
			char *name = entry->d_name;
			long size;

			DEBUG("parsing<%s>\n", name);
			if (strncmp(name, "hugepages-", 10) != 0)
				continue;
			name += 10;

			/* Directory names carry the size in kB */
			size = size_to_smaller_unit(atol(name));
			if (size < 0 || size == default_bytes)
				continue;

			if (get_pool_size(size, &pools[which]))
				which++;
		}
		closedir(dir);
	}

	return (which < pcnt) ? which : -1;
}

/*
 * Report whether this architecture uses address space slices.
 *
 * On 64-bit PowerPC the MMU type is extracted from /proc/cpuinfo and the
 * answer is 1 only for "Hash"; the process aborts if the type cannot be
 * determined.  32-bit PowerPC built without PPC_NO_SEGMENTS always
 * returns 1, every other build returns 0.
 */
int arch_has_slice_support(void)
{
#ifdef __powerpc64__
	char mmu_type[16];
	FILE *fp;

	fp = popen("cat /proc/cpuinfo | grep MMU | awk '{ print $3}'", "r");
	/* The field width keeps the token inside mmu_type[] */
	if (!fp || fscanf(fp, "%15s", mmu_type) != 1) {
		ERROR("Failed to determine MMU type\n");
		abort();
	}

	pclose(fp);
	return strcmp(mmu_type, "Hash") == 0;
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return 1;
#else
	return 0;
#endif
}

/*
 * The kernel supports huge pages exactly when a default huge page size
 * can be determined.  Returns 1 in that case, 0 otherwise.
 */
int kernel_has_hugepages(void)
{
	return kernel_default_hugepage_size() >= 0 ? 1 : 0;
}

/*
 * The kernel supports overcommit when the default huge page size is known
 * and an overcommit counter can be read for it.  Returns 1 in that case,
 * 0 otherwise.
 */
int kernel_has_overcommit(void)
{
	long hpage_bytes = kernel_default_hugepage_size();

	return hpage_bytes >= 0 &&
		get_huge_page_counter(hpage_bytes, HUGEPAGES_OC) >= 0;
}

/********************************************************************/
/* Library user visible functions                                   */
/********************************************************************/

/*
 * NOTE: This function uses data that is initialized by
 * setup_mounts() which is called during libhugetlbfs initialization.
 *
 * returns:
 *   on success, size of a huge page in number of bytes
 *   on failure, -1
 *	errno set to ENOSYS if huge pages are not supported
 *	errno set to EOVERFLOW if huge page size would overflow return type
 */
long gethugepagesize(void)
{
	long hpage_size;

	/* Are huge pages available and have they been initialized? */
	if (hpage_sizes_default_idx == -1) {
		errno = hugepagesize_errno = ENOSYS;
		return -1;
	}

	errno = 0;
	hpage_size = hpage_sizes[hpage_sizes_default_idx].pagesize;
	return hpage_size;
}

/*
 * List the huge page sizes supported by the system.
 *
 * pagesizes: array receiving the sizes in bytes, or NULL to only count
 * n_elem:    capacity of pagesizes; must be 0 when pagesizes is NULL
 *
 * The kernel default size is reported first, followed by the sizes found
 * as "hugepages-<N>kB" entries below SYSFS_HUGEPAGES_DIR.  With an array
 * the result is capped at n_elem entries.
 *
 * Returns the number of sizes stored (or counted), 0 if the default size
 * is unknown, or -1 on error: errno is EINVAL for bad arguments, otherwise
 * whatever opendir() set.
 */
int gethugepagesizes(long pagesizes[], int n_elem)
{
	long default_bytes;
	DIR *sysfs;
	struct dirent *ent;
	int nr_sizes = 0;

	if (n_elem < 0) {
		errno = EINVAL;
		return -1;
	}

	if (n_elem > 0 && pagesizes == NULL) {
		errno = EINVAL;
		return -1;
	}

	errno = 0;

	/* Get the system default size. */
	default_bytes = kernel_default_hugepage_size();
	if (default_bytes < 0)
		return 0;

	if (pagesizes && (nr_sizes == n_elem))
		return nr_sizes;
	if (pagesizes)
		pagesizes[nr_sizes] = default_bytes;
	nr_sizes++;

	/*
	 * Scan sysfs to look for other sizes.
	 * Non-existing dir is not an error, we got one size from /proc/meminfo.
	 */
	sysfs = opendir(SYSFS_HUGEPAGES_DIR);
	if (!sysfs) {
		if (errno == ENOENT) {
			errno = 0;
			return nr_sizes;
		} else
			return -1;
	}
	while ((ent = readdir(sysfs))) {
		long size;

		if (strncmp(ent->d_name, "hugepages-", 10))
			continue;

		size = strtol(ent->d_name + 10, NULL, 10);
		if (size == LONG_MIN || size == LONG_MAX)
			continue;
		size = size_to_smaller_unit(size);

		if (size < 0 || size == default_bytes)
			continue;
		/*
		 * The caller's array is full: leave the loop instead of
		 * returning so the directory handle is still closed below.
		 */
		if (pagesizes && (nr_sizes == n_elem))
			break;
		if (pagesizes)
			pagesizes[nr_sizes] = size;
		nr_sizes++;
	}
	closedir(sysfs);

	return nr_sizes;
}

/*
 * List all page sizes: the base page size followed by the huge page sizes
 * reported by gethugepagesizes().
 *
 * pagesizes: array receiving the sizes in bytes, or NULL to only count
 * n_elem:    capacity of pagesizes; must be 0 when pagesizes is NULL
 *
 * Returns the number of sizes stored (or counted), 0 when an array with no
 * room was passed, or a negative value on error (errno is EINVAL for bad
 * arguments).
 */
int getpagesizes(long pagesizes[], int n_elem)
{
	int nr_huge;

	if (n_elem < 0 || (pagesizes == NULL && n_elem > 0)) {
		errno = EINVAL;
		return -1;
	}

	if (pagesizes == NULL) {
		/*
		 * Sizing request (n_elem is 0 here): count the huge page
		 * sizes; the base page size is added on below.
		 */
		nr_huge = gethugepagesizes(pagesizes, n_elem);
	} else {
		if (n_elem == 0)
			return 0;

		/* The base page size takes the first slot. */
		pagesizes[0] = sysconf(_SC_PAGESIZE);
		nr_huge = gethugepagesizes(pagesizes + 1, n_elem - 1);
	}

	return nr_huge < 0 ? nr_huge : nr_huge + 1;
}

int hugetlbfs_test_path(const char *mount)
{
	struct statfs64 sb;
	int err;

	/* Bugs in the 32<->64 translation code in pre-2.6.15 kernels
	 * mean that plain statfs() returns bogus errors on hugetlbfs
	 * filesystems.  Use statfs64() to work around. */
	err = statfs64(mount, &sb);
	if (err)
		return -1;

	return (sb.f_type == HUGETLBFS_MAGIC);
}

/* Return the page size for the given mount point in bytes */
long hugetlbfs_test_pagesize(const char *mount)
{
	struct statfs64 sb;
	int err;

	err = statfs64(mount, &sb);
	if (err)
		return -1;

	if ((sb.f_bsize <= 0) || (sb.f_bsize > LONG_MAX))
		return -1;

	return sb.f_bsize;
}

/*
 * Look up the mount point recorded for a huge page size.
 * Returns the path, or NULL if the size is unknown or has no mount point.
 */
const char *hugetlbfs_find_path_for_size(long page_size)
{
	int slot = hpage_size_to_index(page_size);

	if (slot < 0)
		return NULL;

	/* An empty string means no mount point was recorded */
	if (hpage_sizes[slot].mount[0] == '\0')
		return NULL;

	return hpage_sizes[slot].mount;
}

/*
 * Look up the mount point for the default huge page size.
 * Returns NULL when there is no usable default size or no mount for it.
 */
const char *hugetlbfs_find_path(void)
{
	long default_bytes = gethugepagesize();

	return default_bytes > 0
		? hugetlbfs_find_path_for_size(default_bytes)
		: NULL;
}

int hugetlbfs_unlinked_fd_for_size(long page_size)
{
	const char *path;
	char name[PATH_MAX+1];
	int fd;

	path = hugetlbfs_find_path_for_size(page_size);
	if (!path)
		return -1;

	name[sizeof(name)-1] = '\0';

	strcpy(name, path);
	strncat(name, "/libhugetlbfs.tmp.XXXXXX", sizeof(name)-1);
	/* FIXME: deal with overflows */

	fd = mkstemp64(name);

	if (fd < 0) {
		ERROR("mkstemp() failed: %s\n", strerror(errno));
		return -1;
	}

	unlink(name);

	return fd;
}

/*
 * Create an anonymous file on the mount for the default huge page size.
 * Returns the descriptor, or -1 if there is no usable default size or the
 * file cannot be created.
 */
int hugetlbfs_unlinked_fd(void)
{
	long default_bytes = gethugepagesize();

	if (default_bytes <= 0)
		return -1;

	return hugetlbfs_unlinked_fd_for_size(default_bytes);
}

/* Number of huge pages touched per readv() call in hugetlbfs_prefault(). */
#define IOV_LEN 64
/*
 * Fault in every huge page of a mapping.
 *
 * addr:   start of the mapping
 * length: size of the mapping in bytes
 *
 * One byte is read from /dev/zero into the start of each huge page, IOV_LEN
 * pages per readv() call.  Nothing is done when __hugetlbfs_prefault is
 * false.
 *
 * Returns 0 on success (or when prefaulting is disabled) and -ENOMEM if
 * /dev/zero cannot be opened or not all pages could be touched.
 */
int hugetlbfs_prefault(void *addr, size_t length)
{
	size_t offset;
	struct iovec iov[IOV_LEN];
	char *base = addr;	/* byte pointer for portable arithmetic */
	long hpage_size;
	int ret;
	int i;
	int fd;

	if (!__hugetlbfs_prefault)
		return 0;

	/*
	 * The NUMA users of libhugetlbfs' malloc feature are
	 * expected to use the numactl program to specify an
	 * appropriate policy for hugepage allocation
	 *
	 * Use readv(2) to instantiate the hugepages unless HUGETLB_NO_PREFAULT
	 * is set. If we instead returned a hugepage mapping with insufficient
	 * hugepages, the VM system would kill the process when the
	 * process tried to access the missing memory.
	 *
	 * The value of this environment variable is read during library
	 * initialisation and sets __hugetlbfs_prefault accordingly. If
	 * prefaulting is enabled and we can't get all that were requested,
	 * -ENOMEM is returned. The caller is expected to release the entire
	 * mapping and optionally it may recover by mapping base pages instead.
	 */

	fd = open("/dev/zero", O_RDONLY);
	if (fd < 0) {
		ERROR("Failed to open /dev/zero for reading\n");
		return -ENOMEM;
	}

	/* The page size does not change inside the loop; fetch it once. */
	hpage_size = gethugepagesize();

	for (offset = 0; offset < length; ) {
		for (i = 0; i < IOV_LEN && offset < length; i++) {
			iov[i].iov_base = base + offset;
			iov[i].iov_len = 1;
			offset += hpage_size;
		}
		ret = readv(fd, iov, i);
		if (ret != i) {
			DEBUG("Got %d of %d requested; err=%d\n", ret,
					i, ret < 0 ? errno : 0);
			WARNING("Failed to reserve %lu huge pages "
					"for new region\n",
					(unsigned long)(length / hpage_size));
			close(fd);
			return -ENOMEM;
		}
	}

	close(fd);
	return 0;
}

/*
 * Read one pool counter for a huge page size.
 *
 * pagesize: huge page size in bytes
 * counter:  one of the HUGEPAGES_* identifiers
 *
 * Returns the counter value, or -1 if the counter cannot be located, its
 * file is not readable, or the value cannot be parsed.
 */
long get_huge_page_counter(long pagesize, unsigned int counter)
{
	char file[PATH_MAX+1];
	char *key;

	if (select_pool_counter(counter, pagesize, file, &key))
		return -1;

	/*
	 * access() takes R_OK/W_OK/X_OK/F_OK, not open() flags.  Probe for
	 * read permission so an absent or unreadable counter yields -1
	 * without an error being logged by file_read_ulong().
	 */
	if (access(file, R_OK))
		return -1;

	return file_read_ulong(file, key);
}

/*
 * Write a new value to one pool counter for a huge page size.
 *
 * pagesize: huge page size in bytes
 * counter:  one of the HUGEPAGES_* identifiers
 * val:      value to write
 *
 * Returns 0 on success, -1 if the counter file cannot be located or
 * written.
 */
int set_huge_page_counter(long pagesize, unsigned int counter,
			unsigned long val)
{
	char target[PATH_MAX+1];
	int rc;

	/* No key is passed, so a plain counter file is always selected */
	rc = select_pool_counter(counter, pagesize, target, NULL);

	return rc ? -1 : file_write_ulong(target, val);
}

/*
 * Set the HUGEPAGES_TOTAL counter for the given huge page size to val.
 * Returns the result of set_huge_page_counter().
 */
int set_nr_hugepages(long pagesize, unsigned long val)
{
	return set_huge_page_counter(pagesize, HUGEPAGES_TOTAL, val);
}

/*
 * Set the HUGEPAGES_OC (overcommit) counter for the given huge page size
 * to val.  Returns the result of set_huge_page_counter().
 */
int set_nr_overcommit_hugepages(long pagesize, unsigned long val)
{
	/* val is unsigned long, so it is printed with %lu */
	DEBUG("setting HUGEPAGES_OC to %lu\n", val);
	return set_huge_page_counter(pagesize, HUGEPAGES_OC, val);
}

/*
 * Read the overcommit counter for a huge page size.
 * Returns -1 if the kernel has no overcommit support or the counter
 * cannot be read.
 */
long read_nr_overcommit(long page_size)
{
	return kernel_has_overcommit()
		? get_huge_page_counter(page_size, HUGEPAGES_OC)
		: -1;
}

/*
 * Write oc_pool to the overcommit counter for page_size.  Does nothing on
 * kernels without overcommit support; the result of the write is not
 * reported.
 */
void restore_overcommit_pages(long page_size, long oc_pool)
{
	if (kernel_has_overcommit())
		set_nr_overcommit_hugepages(page_size, oc_pool);
}

/********************************************************************/
/* Library user visible DIAGNOSES/DEBUGGING ONLY functions          */
/********************************************************************/

/* Chunk size used when copying /proc/self/maps. */
#define MAPS_BUF_SZ 4096
/*
 * Copy the contents of /proc/self/maps to stderr.
 *
 * Returns 0 on success, -1 if the file cannot be opened, a read fails or
 * the data cannot be written out completely.
 */
long dump_proc_pid_maps()
{
	FILE *f;
	char line[MAPS_BUF_SZ];
	size_t nread;
	long rc = 0;

	f = fopen("/proc/self/maps", "r");
	if (!f) {
		ERROR("Failed to open /proc/self/maps\n");
		return -1;
	}

	/*
	 * fread()/fwrite() return an unsigned count and never a negative
	 * value.  Errors show up as a short count, with ferror() telling a
	 * read error apart from end of file.
	 */
	while ((nread = fread(line, sizeof(char), MAPS_BUF_SZ, f)) > 0) {
		if (fwrite(line, sizeof(char), nread, stderr) != nread) {
			ERROR("Failed to write /proc/self/maps to stderr\n");
			rc = -1;
			break;
		}
	}
	if (rc == 0 && ferror(f)) {
		ERROR("Failed to read /proc/self/maps\n");
		rc = -1;
	}

	/* Single exit path so the stream is closed on errors as well */
	fclose(f);
	return rc;
}

/*
 * Read the number that follows tag in MEMINFO.
 * Returns the result of file_read_ulong().
 */
long read_meminfo(const char *tag)
{
	return file_read_ulong(MEMINFO, tag);
}