Blob Blame History Raw
/***************************************************************************
 *   User front end for using huge pages Copyright (C) 2008, IBM           *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the Lesser GNU General Public License as        *
 *   published by the Free Software Foundation; either version 2.1 of the  *
 *   License, or at your option) any later version.                        *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU Lesser General Public License for more details.                   *
 *                                                                         *
 *   You should have received a copy of the Lesser GNU General Public      *
 *   License along with this program; if not, write to the                 *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/

/*
 * hugeadm is designed to make an administrators life simpler, to automate
 * and simplify basic system configuration as it relates to hugepages.  It
 * is designed to help with pool and mount configuration.
 */

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <limits.h>
#include <mntent.h>
#include <unistd.h>
#include <grp.h>
#include <pwd.h>
#include <fcntl.h>

#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mount.h>
#include <sys/swap.h>
#include <sys/wait.h>

#define _GNU_SOURCE /* for getopt_long */
#include <unistd.h>
#include <getopt.h>

#define KB (1024)
#define MB (1024*KB)
#define GB (1024*MB)

#define REPORT_UTIL "hugeadm"
#define REPORT(level, prefix, format, ...)                                    \
	do {                                                                  \
		if (verbose_level >= level)                                   \
			fprintf(stderr, "hugeadm:" prefix ": " format,       \
				##__VA_ARGS__);                               \
	} while (0);

#include "libhugetlbfs_internal.h"
#include "hugetlbfs.h"

extern int optind;
extern char *optarg;

#define OPTION(opts, text)	fprintf(stderr, " %-25s  %s\n", opts, text)
#define CONT(text) 		fprintf(stderr, " %-25s  %s\n", "", text)

#define MOUNT_DIR "/var/lib/hugetlbfs"
#define OPT_MAX 4096

#define PROCMOUNTS "/proc/mounts"
#define PROCHUGEPAGES_MOVABLE "/proc/sys/vm/hugepages_treat_as_movable"
#define PROCMINFREEKBYTES "/proc/sys/vm/min_free_kbytes"
#define PROCSHMMAX "/proc/sys/kernel/shmmax"
#define PROCHUGETLBGROUP "/proc/sys/vm/hugetlb_shm_group"
#define PROCZONEINFO "/proc/zoneinfo"
#define FS_NAME "hugetlbfs"
#define MIN_COL 20
#define MAX_SIZE_MNTENT (64 + PATH_MAX + 32 + 128 + 2 * sizeof(int))
#define FORMAT_LEN 20

#define MEM_TOTAL "MemTotal:"
#define SWAP_FREE "SwapFree:"
#define SWAP_TOTAL "SwapTotal:"

#define ALWAYS		  "always"
#define MADVISE		  "madvise"
#define NEVER		  "never"
#define TRANS_ENABLE	  "/sys/kernel/mm/transparent_hugepage/enabled"
#define KHUGE_SCAN_PAGES  "/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan"
#define KHUGE_SCAN_SLEEP  "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs"
#define KHUGE_ALLOC_SLEEP "/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs"

void print_usage()
{
	fprintf(stderr, "hugeadm [options]\n");
	fprintf(stderr, "options:\n");

	OPTION("--list-all-mounts", "List all current hugetlbfs mount points");
	OPTION("--pool-list", "List all pools");
	OPTION("--hard", "specified with --pool-pages-min to make");
	CONT("multiple attempts at adjusting the pool size to the");
	CONT("specified count on failure");
	OPTION("--pool-pages-min <size|DEFAULT>:[+|-]<pagecount|memsize<G|M|K>>", "");
	CONT("Adjust pool 'size' lower bound");
	OPTION("--obey-mempolicy", "Obey the NUMA memory policy when");
	CONT("adjusting the pool 'size' lower bound");
	OPTION("--thp-always", "Enable transparent huge pages always");
	OPTION("--thp-madvise", "Enable transparent huge pages with madvise");
	OPTION("--thp-never", "Disable transparent huge pages");
	OPTION("--thp-khugepaged-pages <pages to scan>", "Number of pages that khugepaged");
	CONT("should scan on each pass");
	OPTION("--thp-khugepaged-scan-sleep <milliseconds>", "Time in ms to sleep between");
	CONT("khugepaged passes");
	OPTION("--thp-khugepaged-alloc-sleep <milliseconds>", "Time in ms for khugepaged");
	CONT("to wait if there was a huge page allocation failure");
	OPTION("--pool-pages-max <size|DEFAULT>:[+|-]<pagecount|memsize<G|M|K>>", "");
	CONT("Adjust pool 'size' upper bound");
	OPTION("--set-recommended-min_free_kbytes", "");
	CONT("Sets min_free_kbytes to a recommended value to improve availability of");
	CONT("huge pages at runtime");
	OPTION("--set-recommended-shmmax", "Sets shmmax to a recommended value to");
	CONT("maximise the size possible for shared memory pools");
	OPTION("--set-shm-group <gid|groupname>", "Sets hugetlb_shm_group to the");
	CONT("specified group, which has permission to use hugetlb shared memory pools");
	OPTION("--add-temp-swap[=count]", "Specified with --pool-pages-min to create");
	CONT("temporary swap space for the duration of the pool resize. Default swap");
	CONT("size is 5 huge pages. Optional arg sets size to 'count' huge pages");
	OPTION("--add-ramdisk-swap", "Specified with --pool-pages-min to create");
	CONT("swap space on ramdisks. By default, swap is removed after the resize.");
	OPTION("--persist", "Specified with --add-temp-swap or --add-ramdisk-swap");
	CONT("options to make swap space persist after the resize.");
	OPTION("--enable-zone-movable", "Use ZONE_MOVABLE for huge pages");
	OPTION("--disable-zone-movable", "Do not use ZONE_MOVABLE for huge pages");
	OPTION("--create-mounts", "Creates a mount point for each available");
	CONT("huge page size on this system under /var/lib/hugetlbfs");
	OPTION("--create-user-mounts <user>", "");
	CONT("Creates a mount point for each available huge");
	CONT("page size under /var/lib/hugetlbfs/<user>");
	CONT("usable by user <user>");
	OPTION("--create-group-mounts <group>", "");
	CONT("Creates a mount point for each available huge");
	CONT("page size under /var/lib/hugetlbfs/<group>");
	CONT("usable by group <group>");
	OPTION("--create-global-mounts", "");
	CONT("Creates a mount point for each available huge");
	CONT("page size under /var/lib/hugetlbfs/global");
	CONT("usable by anyone");

	OPTION("--max-size <size<G|M|K>>", "Limit the filesystem size of a new mount point");
	OPTION("--max-inodes <number>", "Limit the number of inodes on a new mount point");

	OPTION("--page-sizes", "Display page sizes that a configured pool");
	OPTION("--page-sizes-all",
			"Display page sizes support by the hardware");
	OPTION("--dry-run", "Print the equivalent shell commands for what");
	CONT("the specified options would have done without");
	CONT("taking any action");

	OPTION("--explain", "Gives a overview of the status of the system");
	CONT("with respect to huge page availability");

	OPTION("--verbose <level>, -v", "Increases/sets tracing levels");
	OPTION("--help, -h", "Prints this message");
}

int opt_dry_run = 0;
int opt_hard = 0;
int opt_movable = -1;
int opt_set_recommended_minfreekbytes = 0;
int opt_set_recommended_shmmax = 0;
int opt_set_hugetlb_shm_group = 0;
int opt_temp_swap = 0;
int opt_ramdisk_swap = 0;
int opt_swap_persist = 0;
int opt_obey_mempolicy = 0;
unsigned long opt_limit_mount_size = 0;
int opt_limit_mount_inodes = 0;
int verbose_level = VERBOSITY_DEFAULT;
char ramdisk_list[PATH_MAX] = "";

void setup_environment(char *var, char *val)
{
	if (opt_dry_run) {
		printf("%s='%s'\n", var, val);
		return;
	}

	setenv(var, val, 1);
	DEBUG("%s='%s'\n", var, val);
}

/* Enable/disable allocation of hugepages from ZONE_MOVABLE */
void setup_zone_movable(int able)
{
	if (opt_dry_run) {
		printf("echo %d > %s\n", able, PROCHUGEPAGES_MOVABLE);
		return;
	}

	DEBUG("Setting %s to %d\n", PROCHUGEPAGES_MOVABLE, able);

	/* libhugetlbfs reports any error that occurs */
	file_write_ulong(PROCHUGEPAGES_MOVABLE, (unsigned long)able);
}

void verbose_init(void)
{
	char *env;

	env = getenv("HUGETLB_VERBOSE");
	if (env)
		verbose_level = atoi(env);
	env = getenv("HUGETLB_DEBUG");
	if (env)
		verbose_level = VERBOSITY_MAX;
}

void verbose(char *which)
{
	int new_level;

	if (which) {
		new_level = atoi(which);
		if (new_level < 0 || new_level > 99) {
			ERROR("%d: verbosity out of range 0-99\n",
				new_level);
			exit(EXIT_FAILURE);
		}
	} else {
		new_level = verbose_level + 1;
		if (new_level == 100) {
			WARNING("verbosity limited to 99\n");
			new_level--;
		}
	}
	verbose_level = new_level;
}

void verbose_expose(void)
{
	char level[3];

	if (verbose_level == 99) {
		setup_environment("HUGETLB_DEBUG", "yes");
	}
	snprintf(level, sizeof(level), "%d", verbose_level);
	setup_environment("HUGETLB_VERBOSE", level);
}

/*
 * getopts return values for options which are long only.
 */
#define LONG_POOL		('p' << 8)
#define LONG_POOL_LIST		(LONG_POOL|'l')
#define LONG_POOL_MIN_ADJ	(LONG_POOL|'m')
#define LONG_POOL_MAX_ADJ	(LONG_POOL|'M')
#define LONG_POOL_MEMPOL	(LONG_POOL|'p')

#define LONG_SET_RECOMMENDED_MINFREEKBYTES	('k' << 8)
#define LONG_SET_RECOMMENDED_SHMMAX		('x' << 8)
#define LONG_SET_HUGETLB_SHM_GROUP		('R' << 8)

#define LONG_MOVABLE		('z' << 8)
#define LONG_MOVABLE_ENABLE	(LONG_MOVABLE|'e')
#define LONG_MOVABLE_DISABLE	(LONG_MOVABLE|'d')

#define LONG_HARD		('h' << 8)
#define LONG_SWAP		('s' << 8)
#define LONG_SWAP_DISK		(LONG_SWAP|'d')
#define LONG_SWAP_RAMDISK	(LONG_SWAP|'r')
#define LONG_SWAP_PERSIST	(LONG_SWAP|'p')

#define LONG_PAGE	('P' << 8)
#define LONG_PAGE_SIZES	(LONG_PAGE|'s')
#define LONG_PAGE_AVAIL	(LONG_PAGE|'a')

#define LONG_MOUNTS			('m' << 8)
#define LONG_CREATE_MOUNTS		(LONG_MOUNTS|'C')
#define LONG_CREATE_USER_MOUNTS		(LONG_MOUNTS|'U')
#define LONG_CREATE_GROUP_MOUNTS	(LONG_MOUNTS|'g')
#define LONG_CREATE_GLOBAL_MOUNTS	(LONG_MOUNTS|'G')
#define LONG_LIST_ALL_MOUNTS		(LONG_MOUNTS|'A')

#define LONG_LIMITS			('l' << 8)
#define LONG_LIMIT_SIZE			(LONG_LIMITS|'S')
#define LONG_LIMIT_INODES		(LONG_LIMITS|'I')

#define LONG_EXPLAIN	('e' << 8)

#define LONG_TRANS			('t' << 8)
#define LONG_TRANS_ALWAYS		(LONG_TRANS|'a')
#define LONG_TRANS_MADVISE		(LONG_TRANS|'m')
#define LONG_TRANS_NEVER		(LONG_TRANS|'n')

#define LONG_KHUGE			('K' << 8)
#define LONG_KHUGE_PAGES		(LONG_KHUGE|'p')
#define LONG_KHUGE_SCAN			(LONG_KHUGE|'s')
#define LONG_KHUGE_ALLOC		(LONG_KHUGE|'a')

#define MAX_POOLS	32

static int cmpsizes(const void *p1, const void *p2)
{
	return ((struct hpage_pool *)p1)->pagesize >
			((struct hpage_pool *)p2)->pagesize;
}

void pool_list(void)
{
	struct hpage_pool pools[MAX_POOLS];
	int pos;
	int cnt;

	cnt = hpool_sizes(pools, MAX_POOLS);
	if (cnt < 0) {
		ERROR("unable to obtain pools list");
		exit(EXIT_FAILURE);
	}
	qsort(pools, cnt, sizeof(pools[0]), cmpsizes);

	printf("%10s %8s %8s %8s %8s\n",
		"Size", "Minimum", "Current", "Maximum", "Default");
	for (pos = 0; cnt--; pos++) {
		printf("%10ld %8ld %8ld %8ld %8s\n", pools[pos].pagesize,
			pools[pos].minimum, pools[pos].size,
			pools[pos].maximum, (pools[pos].is_default) ? "*" : "");
	}
}

struct mount_list
{
	struct mntent entry;
	char data[MAX_SIZE_MNTENT];
	struct mount_list *next;
};

void print_mounts(struct mount_list *current, int longest)
{
	char format_str[FORMAT_LEN];

	snprintf(format_str, FORMAT_LEN, "%%-%ds %%s\n", longest);
	printf(format_str, "Mount Point", "Options");
	while (current) {
		printf(format_str, current->entry.mnt_dir,
				   current->entry.mnt_opts);
		current = current->next;
	}
}

/*
 * collect_active_mounts returns a list of active hugetlbfs
 * mount points, and, if longest is not NULL, the number of
 * characters in the longest mount point to ease output
 * formatting.  Caller is expected to free the list of mounts.
 */
struct mount_list *collect_active_mounts(int *longest)
{
	FILE *mounts;
	struct mount_list *list, *current, *previous = NULL;
	int length;

	/* First try /proc/mounts, then /etc/mtab */
	mounts = setmntent(PROCMOUNTS, "r");
	if (!mounts) {
		mounts = setmntent(MOUNTED, "r");
		if (!mounts) {
			ERROR("unable to open %s or %s for reading",
				PROCMOUNTS, MOUNTED);
			exit(EXIT_FAILURE);
		}
	}

	list = malloc(sizeof(struct mount_list));
	if (!list) {
		ERROR("out of memory");
		exit(EXIT_FAILURE);
	}

	list->next = NULL;
	current = list;
	while (getmntent_r(mounts, &(current->entry), current->data, MAX_SIZE_MNTENT)) {
		if (strcasecmp(current->entry.mnt_type, FS_NAME) == 0) {
			length = strlen(current->entry.mnt_dir);
			if (longest && length > *longest)
				*longest = length;

			current->next = malloc(sizeof(struct mount_list));
			if (!current->next) {
				ERROR("out of memory");
				exit(EXIT_FAILURE);
			}
			previous = current;
			current = current->next;
			current->next = NULL;
		}
	}

	endmntent(mounts);

	if (previous) {
		free(previous->next);
		previous->next = NULL;
		return list;
	}
	return NULL;
}

void mounts_list_all(void)
{
	struct mount_list *list, *previous;
	int longest = MIN_COL;

	list = collect_active_mounts(&longest);

	if (!list) {
		ERROR("No hugetlbfs mount points found\n");
		return;
	}

	print_mounts(list, longest);

	while (list) {
		previous = list;
		list = list->next;
		free(previous);
	}
}

int make_dir(char *path, mode_t mode, uid_t uid, gid_t gid)
{
	struct passwd *pwd;
	struct group *grp;

	if (opt_dry_run) {
		pwd = getpwuid(uid);
		grp = getgrgid(gid);
		printf("if [ ! -e %s ]\n", path);
		printf("then\n");
		printf(" mkdir %s\n", path);
		printf(" chown %s:%s %s\n", pwd->pw_name, grp->gr_name, path);
		printf(" chmod %o %s\n", mode, path);
		printf("fi\n");
		return 0;
	}

	if (mkdir(path, mode)) {
		if (errno != EEXIST) {
			ERROR("Unable to create dir %s, error: %s\n",
				path, strerror(errno));
			return 1;
		}
	} else {
		if (chown(path, uid, gid)) {
			ERROR("Unable to change ownership of %s, error: %s\n",
				path, strerror(errno));
			return 1;
		}

		if (chmod(path, mode)) {
			ERROR("Unable to change permission on %s, error: %s\n",
				path, strerror(errno));
			return 1;
		}
	}

	return 0;
}

/**
 * ensure_dir will build the entire directory structure up to and
 * including path, all directories built will be owned by
 * user:group and permissions will be set to mode.
 */
int ensure_dir(char *path, mode_t mode, uid_t uid, gid_t gid)
{
	char *idx;

	if (!path || strlen(path) == 0)
		return 0;

	idx = strchr(path + 1, '/');

	do {
		if (idx)
			*idx = '\0';

		if (make_dir(path, mode, uid, gid))
			return 1;

		if (idx) {
			*idx = '/';
			idx++;
		}
	} while ((idx = strchr(idx, '/')) != NULL);

	if (make_dir(path, mode, uid, gid))
		return 1;

	return 0;
}

int check_if_already_mounted(struct mount_list *list, char *path)
{
	while (list) {
		if (!strcmp(list->entry.mnt_dir, path))
			return 1;
		list = list->next;
	}
	return 0;
}

int mount_dir(char *path, char *options, mode_t mode, uid_t uid, gid_t gid)
{
	struct passwd *pwd;
	struct group *grp;
	struct mntent entry;
	FILE *mounts;
        char dummy;
        int useMtab;

	struct mount_list *list, *previous;

	list = collect_active_mounts(NULL);

	if (list && check_if_already_mounted(list, path)) {
		WARNING("Directory %s is already mounted.\n", path);

		while (list) {
			previous = list;
			list = list->next;
			free(previous);
		}
		return 0;
	}

	while (list) {
		previous = list;
		list = list->next;
		free(previous);
	}

	if (opt_dry_run) {
		pwd = getpwuid(uid);
		grp = getgrgid(gid);
		printf("mount -t %s none %s -o %s\n", FS_NAME,
			path, options);
		printf("chown %s:%s %s\n", pwd->pw_name, grp->gr_name,
			path);
		printf("chmod %o %s\n", mode, path);
	} else {
		if (mount("none", path, FS_NAME, 0, options)) {
			ERROR("Unable to mount %s, error: %s\n",
				path, strerror(errno));
			return 1;
		}

          /* Check if mtab is a symlink */
          useMtab = (readlink(MOUNTED, &dummy, 1) < 0);
          if (useMtab) {
			mounts = setmntent(MOUNTED, "a+");
			if (mounts) {
				entry.mnt_fsname = FS_NAME;
				entry.mnt_dir = path;
				entry.mnt_type = FS_NAME;
				entry.mnt_opts = options;
				entry.mnt_freq = 0;
				entry.mnt_passno = 0;
				if (addmntent(mounts, &entry))
					WARNING("Unable to add entry %s to %s, error: %s\n",
						path, MOUNTED, strerror(errno));
				endmntent(mounts);
			} else {
				WARNING("Unable to open %s, error: %s\n",
					MOUNTED, strerror(errno));
			}
		}

		if (chown(path, uid, gid)) {
			ERROR("Unable to change ownership of %s, error: %s\n",
				path, strerror(errno));
			return 1;
		}

		if (chmod(path, mode)) {
			ERROR("Unable to set permissions on %s, error: %s\n",
				path, strerror(errno));
			return 1;
		}
	}
	return 0;
}

void scale_size(char *buf, unsigned long pagesize)
{
	if(pagesize >= GB)
		snprintf(buf, OPT_MAX, "%luGB", pagesize / GB);
	else if(pagesize >= MB)
		snprintf(buf, OPT_MAX, "%luMB", pagesize / MB);
	else
		snprintf(buf, OPT_MAX, "%luKB", pagesize / KB);
}

void create_mounts(char *user, char *group, char *base, mode_t mode)
{
	struct hpage_pool pools[MAX_POOLS];
	char path[PATH_MAX];
	char options[OPT_MAX];
	char limits[OPT_MAX];
	char scaled[OPT_MAX];
	int cnt, pos;
	struct passwd *pwd;
	struct group *grp;
	uid_t uid = 0;
	gid_t gid = 0;

	if (geteuid() != 0) {
		ERROR("Mounts can only be created by root\n");
		exit(EXIT_FAILURE);
	}

	if (user) {
		pwd = getpwnam(user);
		if (!pwd) {
			ERROR("Could not find specified user %s\n", user);
			exit(EXIT_FAILURE);
		}
		uid = pwd->pw_uid;
	} else if (group) {
		grp = getgrnam(group);
		if (!grp) {
			ERROR("Could not find specified group %s\n", group);
			exit(EXIT_FAILURE);
		}
		gid = grp->gr_gid;
	}

	if (ensure_dir(base,
		S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH, 0, 0))
		exit(EXIT_FAILURE);

	cnt = hpool_sizes(pools, MAX_POOLS);
	if (cnt < 0) {
		ERROR("Unable to obtain pools list\n");
		exit(EXIT_FAILURE);
	}

	for (pos=0; cnt--; pos++) {
		scaled[0] = 0;
		scale_size(scaled, pools[pos].pagesize);
		if (user)
			snprintf(path, PATH_MAX, "%s/%s/pagesize-%s",
				base, user, scaled);
		else if (group)
			snprintf(path, PATH_MAX, "%s/%s/pagesize-%s",
				base, group, scaled);
		else
			snprintf(path, PATH_MAX, "%s/pagesize-%s",
				base, scaled);

		snprintf(options, OPT_MAX, "pagesize=%ld",
				pools[pos].pagesize);

		/* Yes, this could be cleverer */
		if (opt_limit_mount_size && opt_limit_mount_inodes)
			snprintf(limits, OPT_MAX, ",size=%lu,nr_inodes=%d",
				opt_limit_mount_size, opt_limit_mount_inodes);
		else {
			if (opt_limit_mount_size)
				snprintf(limits, OPT_MAX, ",size=%lu",
					opt_limit_mount_size);
			if (opt_limit_mount_inodes)
				snprintf(limits, OPT_MAX, ",nr_inodes=%d",
					opt_limit_mount_inodes);
		}

		/* Append limits if specified */
		if (limits[0] != 0) {
			size_t maxlen = OPT_MAX - strlen(options);
			if (maxlen > strlen(limits))
				strcat(options, limits);
			else
				WARNING("String limitations met, cannot append limitations onto mount options string. Increase OPT_MAX");
		}

		if (ensure_dir(path, mode, uid, gid))
			exit(EXIT_FAILURE);

		if (mount_dir(path, options, mode, uid, gid))
			exit(EXIT_FAILURE);
	}
}

/**
 * show_mem shouldn't change the behavior of any of its
 * callers, it only prints a message to the user showing the
 * total amount of memory in the system (in megabytes).
 */
void show_mem()
{
	long mem_total;

	mem_total = read_meminfo(MEM_TOTAL);
	printf("Total System Memory: %ld MB\n\n", mem_total / 1024);
}

/**
 * check_swap shouldn't change the behavior of any of its
 * callers, it only prints a message to the user if something
 * is being done that might fail without swap available.  i.e.
 * resizing a huge page pool
 */
void check_swap()
{
	long swap_sz;
	long swap_total;

	swap_total = read_meminfo(SWAP_TOTAL);
	if (swap_total <= 0) {
		WARNING("There is no swap space configured, resizing hugepage pool may fail\n");
		WARNING("Use --add-temp-swap option to temporarily add swap during the resize\n");
		return;
	}

	swap_sz = read_meminfo(SWAP_FREE);
	/* meminfo keeps values in kb, but we use bytes for hpage sizes */
	swap_sz *= 1024;
	if (swap_sz <= gethugepagesize()) {
		WARNING("There is very little swap space free, resizing hugepage pool may fail\n");
		WARNING("Use --add-temp-swap option to temporarily add swap during the resize\n");
	}
}

#define ZONEINFO_LINEBUF 1024
long recommended_minfreekbytes(void)
{
	FILE *f;
	char buf[ZONEINFO_LINEBUF];
	int nr_zones = 0;
	long recommended_min;
	long pageblock_kbytes = kernel_default_hugepage_size() / 1024;

	/* Detect the number of zones in the system */
	f = fopen(PROCZONEINFO, "r");
	if (f == NULL) {
		WARNING("Unable to open " PROCZONEINFO);
		return 0;
	}
	while (fgets(buf, ZONEINFO_LINEBUF, f) != NULL) {
		if (strncmp(buf, "Node ", 5) == 0)
			nr_zones++;
	}
	fclose(f);

	/* Make sure at least 2 pageblocks are free for MIGRATE_RESERVE */
	recommended_min = pageblock_kbytes * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types There are 3
	 * MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_kbytes * nr_zones * 3 * 3;
	return recommended_min;
}

void set_recommended_minfreekbytes(void)
{
	long recommended_min = recommended_minfreekbytes();

	if (opt_dry_run) {
		printf("echo \"%ld\" > %s\n", recommended_min,
			PROCMINFREEKBYTES);
		return;
	}

	DEBUG("Setting min_free_kbytes to %ld\n", recommended_min);
	file_write_ulong(PROCMINFREEKBYTES, (unsigned long)recommended_min);
}

/*
 * check_minfreekbytes does not alter the value of min_free_kbytes. It just
 * reports what the current value is and what it should be
 */
void check_minfreekbytes(void)
{
	long min_free_kbytes = file_read_ulong(PROCMINFREEKBYTES, NULL);
	long recommended_min = recommended_minfreekbytes();

	/* There should be at least one pageblock free per zone in the system */
	if (recommended_min > min_free_kbytes) {
		printf("\n");
		printf("The " PROCMINFREEKBYTES " of %ld is too small. To maximiuse efficiency\n", min_free_kbytes);
		printf("of fragmentation avoidance, there should be at least one huge page free per zone\n");
		printf("in the system which minimally requires a min_free_kbytes value of %ld\n", recommended_min);
	}
}

unsigned long long recommended_shmmax(void)
{
	struct hpage_pool pools[MAX_POOLS];
	unsigned long long recommended_shmmax = 0;
	int pos, cnt;

	cnt = hpool_sizes(pools, MAX_POOLS);
	if (cnt < 0) {
		ERROR("unable to obtain pools list");
		exit(EXIT_FAILURE);
	}

	for (pos = 0; cnt--; pos++)
		recommended_shmmax += ((unsigned long long)pools[pos].maximum *
							pools[pos].pagesize);

	return recommended_shmmax;
}

void set_recommended_shmmax(void)
{
	int ret;
	unsigned long max_recommended = -1UL;
	unsigned long long recommended = recommended_shmmax();

	if (recommended == 0) {
		printf("\n");
		WARNING("We can only set a recommended shmmax when huge pages are configured!\n");
		return;
	}

	if (recommended > max_recommended)
		recommended = max_recommended;

	DEBUG("Setting shmmax to %llu\n", recommended);
	ret = file_write_ulong(PROCSHMMAX, (unsigned long)recommended);

	if (!ret) {
		INFO("To make shmmax settings persistent, add the following line to /etc/sysctl.conf:\n");
		INFO("  kernel.shmmax = %llu\n", recommended);
	}
}

void check_shmmax(void)
{
	long current_shmmax = file_read_ulong(PROCSHMMAX, NULL);
	long recommended = recommended_shmmax();

	if (current_shmmax != recommended) {
		printf("\n");
		printf("A " PROCSHMMAX " value of %ld bytes may be sub-optimal. To maximise\n", current_shmmax);
		printf("shared memory usage, this should be set to the size of the largest shared memory\n");
		printf("segment size you want to be able to use. Alternatively, set it to a size matching\n");
		printf("the maximum possible allocation size of all huge pages. This can be done\n");
		printf("automatically, using the --set-recommended-shmmax option.\n");
	}

	if (recommended == 0) {
		printf("\n");
		WARNING("We can't make a shmmax recommendation until huge pages are configured!\n");
		return;
	}

	printf("\n");
	printf("The recommended shmmax for your currently allocated huge pages is %ld bytes.\n", recommended);
	printf("To make shmmax settings persistent, add the following line to /etc/sysctl.conf:\n");
	printf("  kernel.shmmax = %ld\n", recommended);
}

void set_hugetlb_shm_group(gid_t gid, char *group)
{
	int ret;

	DEBUG("Setting hugetlb_shm_group to %d (%s)\n", gid, group);
	ret = file_write_ulong(PROCHUGETLBGROUP, (unsigned long)gid);

	if (!ret) {
		INFO("To make hugetlb_shm_group settings persistent, add the following line to /etc/sysctl.conf:\n");
		INFO("  vm.hugetlb_shm_group = %d\n", gid);
	}
}

/* heisted from shadow-utils/libmisc/list.c::is_on_list() */
static int user_in_group(char *const *list, const char *member)
{
	while (*list != NULL) {
		if (strcmp(*list, member) == 0) {
			return 1;
		}
		list++;
	}

	return 0;
}

void check_user(void)
{
	uid_t uid;
	gid_t gid;
	struct passwd *pwd;
	struct group *grp;

	gid = (gid_t)file_read_ulong(PROCHUGETLBGROUP, NULL);
	grp = getgrgid(gid);
	if (!grp) {
		printf("\n");
		WARNING("Group ID %d in hugetlb_shm_group doesn't appear to be a valid group!\n", gid);
		return;
	}

	uid = getuid();
	pwd = getpwuid(uid);

	/* Don't segfault if user does not have a passwd entry. */
	if (!pwd) {
		printf("\n");
		WARNING("User uid %d is not in the password file!\n", uid);
		return;
	}

	if (gid != pwd->pw_gid && !user_in_group(grp->gr_mem, pwd->pw_name) && uid != 0) {
		printf("\n");
		WARNING("User %s (uid: %d) is not a member of the hugetlb_shm_group %s (gid: %d)!\n", pwd->pw_name, uid, grp->gr_name, gid);
	} else {
		printf("\n");
		printf("To make your hugetlb_shm_group settings persistent, add the following line to /etc/sysctl.conf:\n");
		printf("  vm.hugetlb_shm_group = %d\n", gid);
	}
}

void add_temp_swap(long page_size)
{
	char path[PATH_MAX];
	char file[PATH_MAX];
	char mkswap_cmd[PATH_MAX];
	FILE *f;
	char *buf;
	long swap_size;
	long pid;
	int ret;
	int num_pages;

	if (geteuid() != 0) {
		ERROR("Swap can only be manipulated by root\n");
		exit(EXIT_FAILURE);
	}

	pid = getpid();
	snprintf(path, PATH_MAX, "%s/swap/temp", MOUNT_DIR);
	snprintf(file, PATH_MAX, "%s/swapfile-%ld", path, pid);

	/* swapsize is 5 hugepages */
	if (opt_temp_swap == -1)
		num_pages = 5;
	else
		num_pages = opt_temp_swap;
	swap_size = num_pages * page_size;

	if (ensure_dir(path, S_IRWXU | S_IRGRP | S_IXGRP, 0, 0))
		exit(EXIT_FAILURE);

	if (opt_dry_run) {
		printf("dd bs=1024 count=%ld if=/dev/zero of=%s\n",
			swap_size / 1024, file);
		printf("mkswap %s\nswapon %s\n", file, file);
		return;
	}

	f = fopen(file, "wx");
	if (!f) {
		WARNING("Couldn't open %s: %s\n", file, strerror(errno));
		opt_temp_swap = 0;
		return;
	}

	buf = malloc(swap_size);
	memset(buf, 0, swap_size);
	fwrite(buf, sizeof(char), swap_size, f);
	free(buf);
	fclose(f);

	snprintf(mkswap_cmd, PATH_MAX, "mkswap %s", file);
	ret = system(mkswap_cmd);
	if (WIFSIGNALED(ret)) {
		WARNING("Call to mkswap failed\n");
		opt_temp_swap = 0;
		return;
	} else if (WIFEXITED(ret)) {
		ret = WEXITSTATUS(ret);
		if (ret) {
			WARNING("Call to mkswap failed\n");
			opt_temp_swap = 0;
			return;
		}
	}

	DEBUG("swapon %s\n", file);
	if (swapon(file, 0)) {
		WARNING("swapon on %s failed: %s\n", file, strerror(errno));
		opt_temp_swap = 0;
	}
}

void rem_temp_swap() {
	char file[PATH_MAX];
	long pid;

	pid = getpid();
	snprintf(file, PATH_MAX, "%s/swap/temp/swapfile-%ld", MOUNT_DIR, pid);

	if (opt_dry_run) {
		printf("swapoff %s\nrm -f %s\n", file, file);
		return;
	}

	if (swapoff(file))
		WARNING("swapoff on %s failed: %s\n", file, strerror(errno));
	remove(file);
	DEBUG("swapoff %s\n", file);
}

void add_ramdisk_swap(long page_size) {
	char ramdisk[PATH_MAX];
	char mkswap_cmd[PATH_MAX];
	int disk_num=0;
	int count = 0;
	long ramdisk_size;
	int ret;
	int fd;

	snprintf(ramdisk, PATH_MAX, "/dev/ram%i", disk_num);
	fd = open(ramdisk, O_RDONLY);
	ioctl(fd, BLKGETSIZE, &ramdisk_size);
	close(fd);

	ramdisk_size = ramdisk_size * 512;
	count = (page_size/ramdisk_size) + 1;

	if (count > 1) {
		INFO("Swap will be initialized on multiple ramdisks because\n\
		ramdisk size is less than huge page size. To avoid\n\
		this in the future, use kernel command line parameter\n\
		ramdisk_size=N, to set ramdisk size to N blocks.\n");
	}

	while (count > 0) {
		snprintf(ramdisk, PATH_MAX, "/dev/ram%i", disk_num);
		if (access(ramdisk, F_OK) != 0){
			break;
		}
		disk_num++;

		if (opt_dry_run) {
			printf("mkswap %s\nswapon %s\n", ramdisk, ramdisk);
		} else {
			snprintf(mkswap_cmd, PATH_MAX, "mkswap %s", ramdisk);
			ret = system(mkswap_cmd);
			if (WIFSIGNALED(ret)) {
				WARNING("Call to mkswap failed\n");
				continue;
			} else if (WIFEXITED(ret)) {
				ret = WEXITSTATUS(ret);
				if (ret) {
					WARNING("Call to mkswap failed\n");
					continue;
				}
			}
			DEBUG("swapon %s\n", ramdisk);
			if (swapon(ramdisk, 0)) {
				WARNING("swapon on %s failed: %s\n", ramdisk, strerror(errno));
				opt_temp_swap = 0;
				continue;
			}
		}
		count--;
		strcat(ramdisk_list, " ");
		strcat(ramdisk_list, ramdisk);
	}
}

void rem_ramdisk_swap(){
	char *ramdisk;
	char *iter = NULL;

	ramdisk = strtok_r(ramdisk_list, " ", &iter);
	while (ramdisk != NULL) {
		if (opt_dry_run) {
			printf("swapoff %s\n", ramdisk);
		} else {
			DEBUG("swapoff %s\n", ramdisk);
			if (swapoff(ramdisk)) {
				WARNING("swapoff on %s failed: %s\n", ramdisk, strerror(errno));
				continue;
			}
		}
		ramdisk = strtok_r(NULL, " ", &iter);
	}
}

void set_trans_opt(const char *file, const char *value)
{
	FILE *f;

	if (geteuid() != 0) {
                ERROR("Transparent huge page options can only be set by root\n");
                exit(EXIT_FAILURE);
        }

	if (opt_dry_run) {
		printf("echo '%s' > %s\n", value, file);
		return;
	}

	f = fopen(file, "w");
	if (!f) {
		ERROR("Couldn't open %s: %s\n", file, strerror(errno));
		return;
	}

	fprintf(f, "%s", value);
	fclose(f);
}

enum {
	POOL_MIN,
	POOL_MAX,
	POOL_BOTH,
};

static long value_adjust(char *adjust_str, long base, long page_size)
{
	long long adjust;
	char *iter;

	/* Convert and validate the adjust. */
	errno = 0;
	adjust = strtol(adjust_str, &iter, 0);
	/* Catch strtol errors and sizes that overflow the native word size */
	if (errno || adjust_str == iter) {
		if (errno == ERANGE)
			errno = EOVERFLOW;
		else
			errno = EINVAL;
		ERROR("%s: invalid adjustment\n", adjust_str);
		exit(EXIT_FAILURE);
	}

	/* size_to_smaller_unit() only works with positive values */
	if (adjust_str[0] == '-')
		adjust = -adjust;

	switch (*iter) {
	case 'G':
	case 'g':
		adjust = size_to_smaller_unit(adjust);
	case 'M':
	case 'm':
		adjust = size_to_smaller_unit(adjust);
	case 'K':
	case 'k':
		adjust = size_to_smaller_unit(adjust);
		adjust = adjust / page_size;
	}

	/* if previously negative, make negative again */
	if (adjust_str[0] == '-')
		adjust = -adjust;

	if (adjust_str[0] != '+' && adjust_str[0] != '-')
		base = 0;

	/* Ensure we neither go negative nor exceed LONG_MAX. */
	if (adjust < 0 && -adjust > base) {
		adjust = -base;
	}
	if (adjust > 0 && (base + adjust) < base) {
		adjust = LONG_MAX - base;
	}
	base += adjust;

	DEBUG("Returning page count of %ld\n", base);

	return base;
}


void pool_adjust(char *cmd, unsigned int counter)
{
	struct hpage_pool pools[MAX_POOLS];
	int pos;
	int cnt;

	char *iter = NULL;
	char *page_size_str = NULL;
	char *adjust_str = NULL;
	long page_size;

	unsigned long min;
	unsigned long min_orig;
	unsigned long max;
	unsigned long last_pool_value;

	/* Extract the pagesize and adjustment. */
	page_size_str = strtok_r(cmd, ":", &iter);
	if (page_size_str)
		adjust_str = strtok_r(NULL, ":", &iter);

	if (!page_size_str || !adjust_str) {
		ERROR("%s: invalid resize specification\n", cmd);
		exit(EXIT_FAILURE);
	}
	INFO("page_size<%s> adjust<%s> counter<%d>\n",
					page_size_str, adjust_str, counter);

	/* Convert and validate the page_size. */
	if (strcmp(page_size_str, "DEFAULT") == 0)
		page_size = kernel_default_hugepage_size();
	else
		page_size = parse_page_size(page_size_str);

	DEBUG("Working with page_size of %ld\n", page_size);

	cnt = hpool_sizes(pools, MAX_POOLS);
	if (cnt < 0) {
		ERROR("unable to obtain pools list");
		exit(EXIT_FAILURE);
	}
	for (pos = 0; cnt--; pos++) {
		if (pools[pos].pagesize == page_size)
			break;
	}
	if (cnt < 0) {
		ERROR("%s: unknown page size\n", page_size_str);
		exit(EXIT_FAILURE);
	}

	min_orig = min = pools[pos].minimum;
	max = pools[pos].maximum;

	if (counter == POOL_BOTH) {
		min = value_adjust(adjust_str, min, page_size);
		max = min;
	} else if (counter == POOL_MIN) {
		min = value_adjust(adjust_str, min, page_size);
		if (min > max)
			max = min;
	} else {
		max = value_adjust(adjust_str, max, page_size);
		if (max < min)
			min = max;
	}

	INFO("%ld, %ld -> %ld, %ld\n", pools[pos].minimum, pools[pos].maximum,
		min, max);

	if ((pools[pos].maximum - pools[pos].minimum) < (max - min)) {
		INFO("setting HUGEPAGES_OC to %ld\n", (max - min));
		set_huge_page_counter(page_size, HUGEPAGES_OC, (max - min));
	}

	if (opt_hard)
		cnt = 5;
	else
		cnt = -1;

	if (min > min_orig) {
		if (opt_temp_swap)
			add_temp_swap(page_size);
		if (opt_ramdisk_swap)
			add_ramdisk_swap(page_size);
		check_swap();
	}

	if (opt_obey_mempolicy && get_huge_page_counter(page_size,
				HUGEPAGES_TOTAL_MEMPOL) < 0) {
		opt_obey_mempolicy = 0;
		WARNING("Counter for NUMA huge page allocations is not found, continuing with normal pool adjustment\n");
	}

	INFO("setting HUGEPAGES_TOTAL%s to %ld\n",
		opt_obey_mempolicy ? "_MEMPOL" : "", min);
	set_huge_page_counter(page_size,
		opt_obey_mempolicy ? HUGEPAGES_TOTAL_MEMPOL : HUGEPAGES_TOTAL,
		min);
	get_pool_size(page_size, &pools[pos]);

	/* If we fail to make an allocation, retry if user requests */
	last_pool_value = pools[pos].minimum;
	while ((pools[pos].minimum != min) && (cnt > 0)) {
		/* Make note if progress is being made and sleep for IO */
		if (last_pool_value == pools[pos].minimum)
			cnt--;
		else
			cnt = 5;
		sleep(6);

		last_pool_value = pools[pos].minimum;
		INFO("Retrying allocation HUGEPAGES_TOTAL%s to %ld current %ld\n", opt_obey_mempolicy ? "_MEMPOL" : "", min, pools[pos].minimum);
		set_huge_page_counter(page_size,
			opt_obey_mempolicy ?
				HUGEPAGES_TOTAL_MEMPOL :
				HUGEPAGES_TOTAL,
			min);
		get_pool_size(page_size, &pools[pos]);
	}

	if (min > min_orig && !opt_swap_persist) {
		if (opt_temp_swap)
			rem_temp_swap();
		else if (opt_ramdisk_swap)
			rem_ramdisk_swap();
	}

	/*
	 * HUGEPAGES_TOTAL is not guarenteed to check to exactly the figure
	 * requested should there be insufficient pages.  Check the new
	 * value and adjust HUGEPAGES_OC accordingly.
	 */
	if (pools[pos].minimum != min) {
		WARNING("failed to set pool minimum to %ld became %ld\n",
			min, pools[pos].minimum);
		min = pools[pos].minimum;
	}
	if (pools[pos].maximum != max) {
		INFO("setting HUGEPAGES_OC to %ld\n", (max - min));
		set_huge_page_counter(page_size, HUGEPAGES_OC, (max - min));
	}
}

void page_sizes(int all)
{
	struct hpage_pool pools[MAX_POOLS];
	int pos;
	int cnt;

	cnt = hpool_sizes(pools, MAX_POOLS);
	if (cnt < 0) {
		ERROR("unable to obtain pools list");
		exit(EXIT_FAILURE);
	}
	qsort(pools, cnt, sizeof(pools[0]), cmpsizes);

	for (pos = 0; cnt--; pos++) {
		if (all || (pools[pos].maximum &&
		    hugetlbfs_find_path_for_size(pools[pos].pagesize)))
			printf("%ld\n", pools[pos].pagesize);
	}
}

void explain()
{
	show_mem();
	mounts_list_all();
	printf("\nHuge page pools:\n");
	pool_list();
	printf("\nHuge page sizes with configured pools:\n");
	page_sizes(0);
	check_minfreekbytes();
	check_shmmax();
	check_swap();
	check_user();
	printf("\nNote: Permanent swap space should be preferred when dynamic "
		"huge page pools are used.\n");
}

int main(int argc, char** argv)
{
	int ops;
	int has_hugepages = kernel_has_hugepages();

	char opts[] = "+hdv";
	char base[PATH_MAX];
	char *opt_min_adj[MAX_POOLS], *opt_max_adj[MAX_POOLS];
	char *opt_user_mounts = NULL, *opt_group_mounts = NULL;
	int opt_list_mounts = 0, opt_pool_list = 0, opt_create_mounts = 0;
	int opt_global_mounts = 0, opt_pgsizes = 0, opt_pgsizes_all = 0;
	int opt_explain = 0, minadj_count = 0, maxadj_count = 0;
	int opt_trans_always = 0, opt_trans_never = 0, opt_trans_madvise = 0;
	int opt_khuge_pages = 0, opt_khuge_scan = 0, opt_khuge_alloc = 0;
	int ret = 0, index = 0;
	char *khuge_pages = NULL, *khuge_alloc = NULL, *khuge_scan = NULL;
	gid_t opt_gid = 0;
	struct group *opt_grp = NULL;
	int group_invalid = 0;
	struct option long_opts[] = {
		{"help",       no_argument, NULL, 'h'},
		{"verbose",    required_argument, NULL, 'v' },

		{"list-all-mounts", no_argument, NULL, LONG_LIST_ALL_MOUNTS},
		{"pool-list", no_argument, NULL, LONG_POOL_LIST},
		{"pool-pages-min", required_argument, NULL, LONG_POOL_MIN_ADJ},
		{"pool-pages-max", required_argument, NULL, LONG_POOL_MAX_ADJ},
		{"obey-mempolicy", no_argument, NULL, LONG_POOL_MEMPOL},
		{"thp-always", no_argument, NULL, LONG_TRANS_ALWAYS},
		{"thp-madvise", no_argument, NULL, LONG_TRANS_MADVISE},
		{"thp-never", no_argument, NULL, LONG_TRANS_NEVER},
		{"thp-khugepaged-pages", required_argument, NULL, LONG_KHUGE_PAGES},
		{"thp-khugepaged-scan-sleep", required_argument, NULL, LONG_KHUGE_SCAN},
		{"thp-khugepaged-alloc-sleep", required_argument, NULL, LONG_KHUGE_ALLOC},
		{"set-recommended-min_free_kbytes", no_argument, NULL, LONG_SET_RECOMMENDED_MINFREEKBYTES},
		{"set-recommended-shmmax", no_argument, NULL, LONG_SET_RECOMMENDED_SHMMAX},
		{"set-shm-group", required_argument, NULL, LONG_SET_HUGETLB_SHM_GROUP},
		{"enable-zone-movable", no_argument, NULL, LONG_MOVABLE_ENABLE},
		{"disable-zone-movable", no_argument, NULL, LONG_MOVABLE_DISABLE},
		{"hard", no_argument, NULL, LONG_HARD},
		{"add-temp-swap", optional_argument, NULL, LONG_SWAP_DISK},
		{"add-ramdisk-swap", no_argument, NULL, LONG_SWAP_RAMDISK},
		{"persist", no_argument, NULL, LONG_SWAP_PERSIST},
		{"create-mounts", no_argument, NULL, LONG_CREATE_MOUNTS},
		{"create-user-mounts", required_argument, NULL, LONG_CREATE_USER_MOUNTS},
		{"create-group-mounts", required_argument, NULL, LONG_CREATE_GROUP_MOUNTS},
		{"create-global-mounts", no_argument, NULL, LONG_CREATE_GLOBAL_MOUNTS},

		{"max-size", required_argument, NULL, LONG_LIMIT_SIZE},
		{"max-inodes", required_argument, NULL, LONG_LIMIT_INODES},

		{"page-sizes", no_argument, NULL, LONG_PAGE_SIZES},
		{"page-sizes-all", no_argument, NULL, LONG_PAGE_AVAIL},
		{"dry-run", no_argument, NULL, 'd'},
		{"explain", no_argument, NULL, LONG_EXPLAIN},

		{0},
	};

	hugetlbfs_setup_debug();
	setup_mounts();
	verbose_init();

	ops = 0;
	while (ret != -1) {
		ret = getopt_long(argc, argv, opts, long_opts, &index);
		switch (ret) {
		case -1:
			break;

		case '?':
			print_usage();
			exit(EXIT_FAILURE);

		case 'h':
			print_usage();
			exit(EXIT_SUCCESS);

		case 'v':
			verbose(optarg);
			continue;

		case 'd':
			opt_dry_run = 1;
			continue;

		default:
			/* All other commands require hugepage support. */
			if (! has_hugepages) {
				ERROR("kernel does not support huge pages\n");
				exit(EXIT_FAILURE);
			}
		}
		switch (ret) {
		case -1:
			break;

		case LONG_HARD:
			opt_hard = 1;
			continue;

		case LONG_SWAP_DISK:
			if (optarg)
				opt_temp_swap = atoi(optarg);
			else
				opt_temp_swap = -1;
			break;

		case LONG_SWAP_RAMDISK:
			opt_ramdisk_swap = 1;
			break;

		case LONG_SWAP_PERSIST:
			opt_swap_persist = 1;

		case LONG_LIST_ALL_MOUNTS:
			opt_list_mounts = 1;
			break;

		case LONG_POOL_LIST:
			opt_pool_list = 1;
			break;

		case LONG_POOL_MIN_ADJ:
			if (minadj_count == MAX_POOLS) {
				WARNING("Attempting to adjust an invalid "
					"pool or a pool multiple times, "
					"ignoring request: '%s'\n", optarg);
			} else {
				opt_min_adj[minadj_count++] = optarg;
			}
			break;

		case LONG_POOL_MEMPOL:
			opt_obey_mempolicy = 1;
			break;

		case LONG_TRANS_ALWAYS:
			opt_trans_always = 1;
			break;

		case LONG_TRANS_MADVISE:
			opt_trans_madvise = 1;
			break;

		case LONG_TRANS_NEVER:
			opt_trans_never = 1;
			break;

		case LONG_KHUGE_PAGES:
			opt_khuge_pages = 1;
			khuge_pages = optarg;
			break;

		case LONG_KHUGE_SCAN:
			opt_khuge_scan = 1;
			khuge_scan = optarg;
			break;

		case LONG_KHUGE_ALLOC:
			opt_khuge_alloc = 1;
			khuge_alloc = optarg;
			break;

		case LONG_POOL_MAX_ADJ:
			if (! kernel_has_overcommit()) {
				ERROR("kernel does not support overcommit, "
					"max cannot be adjusted\n");
				exit(EXIT_FAILURE);
			}

			if (maxadj_count == MAX_POOLS) {
				WARNING("Attempting to adjust an invalid "
					"pool or a pool multiple times, "
					"ignoring request: '%s'\n", optarg);
			} else {
				opt_max_adj[maxadj_count++] = optarg;
			}
                        break;

		case LONG_MOVABLE_ENABLE:
			opt_movable = 1;
			break;

		case LONG_SET_RECOMMENDED_MINFREEKBYTES:
			opt_set_recommended_minfreekbytes = 1;
			break;

		case LONG_SET_RECOMMENDED_SHMMAX:
			opt_set_recommended_shmmax = 1;
			break;

		case LONG_SET_HUGETLB_SHM_GROUP:
			opt_grp = getgrnam(optarg);
			if (!opt_grp) {
				opt_gid = atoi(optarg);
				if (opt_gid == 0 && strcmp(optarg, "0"))
					group_invalid = 1;
				opt_grp = getgrgid(opt_gid);
				if (!opt_grp)
					group_invalid = 1;
			} else {
				opt_gid = opt_grp->gr_gid;
			}
			if (group_invalid) {
				ERROR("Invalid group specification (%s)\n", optarg);
				exit(EXIT_FAILURE);
			}
			opt_set_hugetlb_shm_group = 1;
			break;

		case LONG_MOVABLE_DISABLE:
			opt_movable = 0;
			break;

		case LONG_CREATE_MOUNTS:
			opt_create_mounts = 1;
			break;

		case LONG_CREATE_USER_MOUNTS:
			opt_user_mounts = optarg;
			break;

		case LONG_CREATE_GROUP_MOUNTS:
			opt_group_mounts = optarg;
			break;

		case LONG_CREATE_GLOBAL_MOUNTS:
			opt_global_mounts = 1;
			break;

		case LONG_LIMIT_SIZE:
			/* Not a pagesize, but the conversions the same */
			opt_limit_mount_size = parse_page_size(optarg);
			if (!opt_limit_mount_size)
				WARNING("Mount max size specification 0, invalid or overflowed\n");
			break;

		case LONG_LIMIT_INODES:
			opt_limit_mount_inodes = atoi(optarg);
			break;

		case LONG_PAGE_SIZES:
			opt_pgsizes = 1;
			break;

		case LONG_PAGE_AVAIL:
			opt_pgsizes_all = 1;
			break;

		case LONG_EXPLAIN:
			opt_explain = 1;
			break;

		default:
			WARNING("unparsed option %08x\n", ret);
			ret = -1;
			break;
		}
		if (ret != -1)
			ops++;
	}

	verbose_expose();

	if (opt_list_mounts)
		mounts_list_all();

	if (opt_pool_list)
		pool_list();

	if (opt_movable != -1)
		setup_zone_movable(opt_movable);

	if (opt_trans_always)
		set_trans_opt(TRANS_ENABLE, ALWAYS);

	if (opt_trans_madvise)
		set_trans_opt(TRANS_ENABLE, MADVISE);

	if (opt_trans_never)
		set_trans_opt(TRANS_ENABLE, NEVER);

	if (opt_khuge_pages)
		set_trans_opt(KHUGE_SCAN_PAGES, khuge_pages);

	if (opt_khuge_alloc)
		set_trans_opt(KHUGE_ALLOC_SLEEP, khuge_alloc);

	if (opt_khuge_scan)
		set_trans_opt(KHUGE_SCAN_SLEEP, khuge_scan);

	if (opt_set_recommended_minfreekbytes)
		set_recommended_minfreekbytes();

	if (opt_set_recommended_shmmax)
		set_recommended_shmmax();

	if (opt_set_hugetlb_shm_group)
		set_hugetlb_shm_group(opt_gid, opt_grp->gr_name);

	while (--minadj_count >= 0) {
		if (! kernel_has_overcommit())
			pool_adjust(opt_min_adj[minadj_count], POOL_BOTH);
		else
			pool_adjust(opt_min_adj[minadj_count], POOL_MIN);
	}

	while (--maxadj_count >=0)
			pool_adjust(opt_max_adj[maxadj_count], POOL_MAX);

	if (opt_create_mounts) {
		snprintf(base, PATH_MAX, "%s", MOUNT_DIR);
		create_mounts(NULL, NULL, base, S_IRWXU | S_IRWXG);
	}


	if (opt_user_mounts != NULL) {
		snprintf(base, PATH_MAX, "%s/user", MOUNT_DIR);
		create_mounts(opt_user_mounts, NULL, base, S_IRWXU);
	}

	if (opt_group_mounts) {
		snprintf(base, PATH_MAX, "%s/group", MOUNT_DIR);
		create_mounts(NULL, opt_group_mounts, base, S_IRWXG);
	}

	if (opt_global_mounts) {
		snprintf(base, PATH_MAX, "%s/global", MOUNT_DIR);
		create_mounts(NULL, NULL, base, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX );
	}

	if (opt_pgsizes)
		page_sizes(0);

	if (opt_pgsizes_all)
		page_sizes(1);

	if (opt_explain)
		explain();

	index = optind;

	if ((argc - index) != 0 || ops == 0) {
		print_usage();
		exit(EXIT_FAILURE);
	}

	exit(EXIT_SUCCESS);
}