/*************************************************************************** * User front end for using huge pages Copyright (C) 2008, IBM * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the Lesser GNU General Public License as * * published by the Free Software Foundation; either version 2.1 of the * * License, or at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the Lesser GNU General Public * * License along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ /* * hugeadm is designed to make an administrators life simpler, to automate * and simplify basic system configuration as it relates to hugepages. It * is designed to help with pool and mount configuration. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define _GNU_SOURCE /* for getopt_long */ #include #include #define KB (1024) #define MB (1024*KB) #define GB (1024*MB) #define REPORT_UTIL "hugeadm" #define REPORT(level, prefix, format, ...) 
\
	do {								\
		if (verbose_level >= level)				\
			fprintf(stderr, "hugeadm:" prefix ": " format,	\
				##__VA_ARGS__);				\
	} while (0);

#include "libhugetlbfs_internal.h"
#include "hugetlbfs.h"

extern int optind;
extern char *optarg;

/* Helpers used by print_usage(): one option line, and a continuation line */
#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text)
#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text)

/* Base directory under which mount points and temporary swap files live */
#define MOUNT_DIR "/var/lib/hugetlbfs"
/* Size of the buffers used to build mount option strings */
#define OPT_MAX 4096

/* Kernel interface files read or written by this tool */
#define PROCMOUNTS "/proc/mounts"
#define PROCHUGEPAGES_MOVABLE "/proc/sys/vm/hugepages_treat_as_movable"
#define PROCMINFREEKBYTES "/proc/sys/vm/min_free_kbytes"
#define PROCSHMMAX "/proc/sys/kernel/shmmax"
#define PROCHUGETLBGROUP "/proc/sys/vm/hugetlb_shm_group"
#define PROCZONEINFO "/proc/zoneinfo"
#define FS_NAME "hugetlbfs"
/* Minimum width of the "Mount Point" column */
#define MIN_COL 20
/* Size of the string buffer handed to getmntent_r() for each entry */
#define MAX_SIZE_MNTENT (64 + PATH_MAX + 32 + 128 + 2 * sizeof(int))
/* Size of the buffer holding the generated printf format in print_mounts() */
#define FORMAT_LEN 20

/* Tags passed to read_meminfo() */
#define MEM_TOTAL "MemTotal:"
#define SWAP_FREE "SwapFree:"
#define SWAP_TOTAL "SwapTotal:"

/* Transparent huge page modes and control files */
#define ALWAYS "always"
#define MADVISE "madvise"
#define NEVER "never"
#define TRANS_ENABLE "/sys/kernel/mm/transparent_hugepage/enabled"
#define KHUGE_SCAN_PAGES "/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan"
#define KHUGE_SCAN_SLEEP "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs"
#define KHUGE_ALLOC_SLEEP "/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs"

/*
 * Print the command line summary to stderr, one OPTION()/CONT() line per
 * row of help text.
 *
 * NOTE(review): several option strings below ("--pool-pages-min :[+|-]>",
 * "--max-size >", "--verbose , -v", ...) look as if argument placeholders
 * have been lost; confirm against the intended help text.
 */
void print_usage()
{
	fprintf(stderr, "hugeadm [options]\n");
	fprintf(stderr, "options:\n");

	OPTION("--list-all-mounts", "List all current hugetlbfs mount points");
	OPTION("--pool-list", "List all pools");
	OPTION("--hard", "specified with --pool-pages-min to make");
	CONT("multiple attempts at adjusting the pool size to the");
	CONT("specified count on failure");
	OPTION("--pool-pages-min :[+|-]>", "");
	CONT("Adjust pool 'size' lower bound");
	OPTION("--obey-mempolicy", "Obey the NUMA memory policy when");
	CONT("adjusting the pool 'size' lower bound");
	OPTION("--thp-always", "Enable transparent huge pages always");
	OPTION("--thp-madvise", "Enable transparent huge pages with madvise");
	OPTION("--thp-never", "Disable transparent huge pages");
	OPTION("--thp-khugepaged-pages ", "Number of pages that khugepaged");
	CONT("should scan on each pass");
	OPTION("--thp-khugepaged-scan-sleep ", "Time in ms to sleep between");
	CONT("khugepaged passes");
	OPTION("--thp-khugepaged-alloc-sleep ", "Time in ms for khugepaged");
	CONT("to wait if there was a huge page allocation failure");
	OPTION("--pool-pages-max :[+|-]>", "");
	CONT("Adjust pool 'size' upper bound");
	OPTION("--set-recommended-min_free_kbytes", "");
	CONT("Sets min_free_kbytes to a recommended value to improve availability of");
	CONT("huge pages at runtime");
	OPTION("--set-recommended-shmmax", "Sets shmmax to a recommended value to");
	CONT("maximise the size possible for shared memory pools");
	OPTION("--set-shm-group ", "Sets hugetlb_shm_group to the");
	CONT("specified group, which has permission to use hugetlb shared memory pools");
	OPTION("--add-temp-swap[=count]", "Specified with --pool-pages-min to create");
	CONT("temporary swap space for the duration of the pool resize. Default swap");
	CONT("size is 5 huge pages. Optional arg sets size to 'count' huge pages");
	OPTION("--add-ramdisk-swap", "Specified with --pool-pages-min to create");
	CONT("swap space on ramdisks. By default, swap is removed after the resize.");
	OPTION("--persist", "Specified with --add-temp-swap or --add-ramdisk-swap");
	CONT("options to make swap space persist after the resize.");
	OPTION("--enable-zone-movable", "Use ZONE_MOVABLE for huge pages");
	OPTION("--disable-zone-movable", "Do not use ZONE_MOVABLE for huge pages");
	OPTION("--create-mounts", "Creates a mount point for each available");
	CONT("huge page size on this system under /var/lib/hugetlbfs");
	OPTION("--create-user-mounts ", "");
	CONT("Creates a mount point for each available huge");
	CONT("page size under /var/lib/hugetlbfs/");
	CONT("usable by user ");
	OPTION("--create-group-mounts ", "");
	CONT("Creates a mount point for each available huge");
	CONT("page size under /var/lib/hugetlbfs/");
	CONT("usable by group ");
	OPTION("--create-global-mounts", "");
	CONT("Creates a mount point for each available huge");
	CONT("page size under /var/lib/hugetlbfs/global");
	CONT("usable by anyone");
	OPTION("--max-size >", "Limit the filesystem size of a new mount point");
	OPTION("--max-inodes ", "Limit the number of inodes on a new mount point");
	OPTION("--page-sizes", "Display page sizes that a configured pool");
	OPTION("--page-sizes-all", "Display page sizes support by the hardware");
	OPTION("--dry-run", "Print the equivalent shell commands for what");
	CONT("the specified options would have done without");
	CONT("taking any action");
	OPTION("--explain", "Gives a overview of the status of the system");
	CONT("with respect to huge page availability");
	OPTION("--verbose , -v", "Increases/sets tracing levels");
	OPTION("--help, -h", "Prints this message");
}

/* Command line state shared by the helpers in this file */
int opt_dry_run = 0;
int opt_hard = 0;
int opt_movable = -1;
int opt_set_recommended_minfreekbytes = 0;
int opt_set_recommended_shmmax = 0;
int opt_set_hugetlb_shm_group = 0;
int opt_temp_swap = 0;
int opt_ramdisk_swap = 0;
int opt_swap_persist = 0;
int opt_obey_mempolicy = 0;
unsigned long opt_limit_mount_size = 0;
int opt_limit_mount_inodes = 0;
int verbose_level =
VERBOSITY_DEFAULT; char ramdisk_list[PATH_MAX] = ""; void setup_environment(char *var, char *val) { if (opt_dry_run) { printf("%s='%s'\n", var, val); return; } setenv(var, val, 1); DEBUG("%s='%s'\n", var, val); } /* Enable/disable allocation of hugepages from ZONE_MOVABLE */ void setup_zone_movable(int able) { if (opt_dry_run) { printf("echo %d > %s\n", able, PROCHUGEPAGES_MOVABLE); return; } DEBUG("Setting %s to %d\n", PROCHUGEPAGES_MOVABLE, able); /* libhugetlbfs reports any error that occurs */ file_write_ulong(PROCHUGEPAGES_MOVABLE, (unsigned long)able); } void verbose_init(void) { char *env; env = getenv("HUGETLB_VERBOSE"); if (env) verbose_level = atoi(env); env = getenv("HUGETLB_DEBUG"); if (env) verbose_level = VERBOSITY_MAX; } void verbose(char *which) { int new_level; if (which) { new_level = atoi(which); if (new_level < 0 || new_level > 99) { ERROR("%d: verbosity out of range 0-99\n", new_level); exit(EXIT_FAILURE); } } else { new_level = verbose_level + 1; if (new_level == 100) { WARNING("verbosity limited to 99\n"); new_level--; } } verbose_level = new_level; } void verbose_expose(void) { char level[3]; if (verbose_level == 99) { setup_environment("HUGETLB_DEBUG", "yes"); } snprintf(level, sizeof(level), "%d", verbose_level); setup_environment("HUGETLB_VERBOSE", level); } /* * getopts return values for options which are long only. 
*/ #define LONG_POOL ('p' << 8) #define LONG_POOL_LIST (LONG_POOL|'l') #define LONG_POOL_MIN_ADJ (LONG_POOL|'m') #define LONG_POOL_MAX_ADJ (LONG_POOL|'M') #define LONG_POOL_MEMPOL (LONG_POOL|'p') #define LONG_SET_RECOMMENDED_MINFREEKBYTES ('k' << 8) #define LONG_SET_RECOMMENDED_SHMMAX ('x' << 8) #define LONG_SET_HUGETLB_SHM_GROUP ('R' << 8) #define LONG_MOVABLE ('z' << 8) #define LONG_MOVABLE_ENABLE (LONG_MOVABLE|'e') #define LONG_MOVABLE_DISABLE (LONG_MOVABLE|'d') #define LONG_HARD ('h' << 8) #define LONG_SWAP ('s' << 8) #define LONG_SWAP_DISK (LONG_SWAP|'d') #define LONG_SWAP_RAMDISK (LONG_SWAP|'r') #define LONG_SWAP_PERSIST (LONG_SWAP|'p') #define LONG_PAGE ('P' << 8) #define LONG_PAGE_SIZES (LONG_PAGE|'s') #define LONG_PAGE_AVAIL (LONG_PAGE|'a') #define LONG_MOUNTS ('m' << 8) #define LONG_CREATE_MOUNTS (LONG_MOUNTS|'C') #define LONG_CREATE_USER_MOUNTS (LONG_MOUNTS|'U') #define LONG_CREATE_GROUP_MOUNTS (LONG_MOUNTS|'g') #define LONG_CREATE_GLOBAL_MOUNTS (LONG_MOUNTS|'G') #define LONG_LIST_ALL_MOUNTS (LONG_MOUNTS|'A') #define LONG_LIMITS ('l' << 8) #define LONG_LIMIT_SIZE (LONG_LIMITS|'S') #define LONG_LIMIT_INODES (LONG_LIMITS|'I') #define LONG_EXPLAIN ('e' << 8) #define LONG_TRANS ('t' << 8) #define LONG_TRANS_ALWAYS (LONG_TRANS|'a') #define LONG_TRANS_MADVISE (LONG_TRANS|'m') #define LONG_TRANS_NEVER (LONG_TRANS|'n') #define LONG_KHUGE ('K' << 8) #define LONG_KHUGE_PAGES (LONG_KHUGE|'p') #define LONG_KHUGE_SCAN (LONG_KHUGE|'s') #define LONG_KHUGE_ALLOC (LONG_KHUGE|'a') #define MAX_POOLS 32 static int cmpsizes(const void *p1, const void *p2) { return ((struct hpage_pool *)p1)->pagesize > ((struct hpage_pool *)p2)->pagesize; } void pool_list(void) { struct hpage_pool pools[MAX_POOLS]; int pos; int cnt; cnt = hpool_sizes(pools, MAX_POOLS); if (cnt < 0) { ERROR("unable to obtain pools list"); exit(EXIT_FAILURE); } qsort(pools, cnt, sizeof(pools[0]), cmpsizes); printf("%10s %8s %8s %8s %8s\n", "Size", "Minimum", "Current", "Maximum", "Default"); for (pos = 0; 
cnt--; pos++) { printf("%10ld %8ld %8ld %8ld %8s\n", pools[pos].pagesize, pools[pos].minimum, pools[pos].size, pools[pos].maximum, (pools[pos].is_default) ? "*" : ""); } } struct mount_list { struct mntent entry; char data[MAX_SIZE_MNTENT]; struct mount_list *next; }; void print_mounts(struct mount_list *current, int longest) { char format_str[FORMAT_LEN]; snprintf(format_str, FORMAT_LEN, "%%-%ds %%s\n", longest); printf(format_str, "Mount Point", "Options"); while (current) { printf(format_str, current->entry.mnt_dir, current->entry.mnt_opts); current = current->next; } } /* * collect_active_mounts returns a list of active hugetlbfs * mount points, and, if longest is not NULL, the number of * characters in the longest mount point to ease output * formatting. Caller is expected to free the list of mounts. */ struct mount_list *collect_active_mounts(int *longest) { FILE *mounts; struct mount_list *list, *current, *previous = NULL; int length; /* First try /proc/mounts, then /etc/mtab */ mounts = setmntent(PROCMOUNTS, "r"); if (!mounts) { mounts = setmntent(MOUNTED, "r"); if (!mounts) { ERROR("unable to open %s or %s for reading", PROCMOUNTS, MOUNTED); exit(EXIT_FAILURE); } } list = malloc(sizeof(struct mount_list)); if (!list) { ERROR("out of memory"); exit(EXIT_FAILURE); } list->next = NULL; current = list; while (getmntent_r(mounts, &(current->entry), current->data, MAX_SIZE_MNTENT)) { if (strcasecmp(current->entry.mnt_type, FS_NAME) == 0) { length = strlen(current->entry.mnt_dir); if (longest && length > *longest) *longest = length; current->next = malloc(sizeof(struct mount_list)); if (!current->next) { ERROR("out of memory"); exit(EXIT_FAILURE); } previous = current; current = current->next; current->next = NULL; } } endmntent(mounts); if (previous) { free(previous->next); previous->next = NULL; return list; } return NULL; } void mounts_list_all(void) { struct mount_list *list, *previous; int longest = MIN_COL; list = collect_active_mounts(&longest); if (!list) 
{ ERROR("No hugetlbfs mount points found\n"); return; } print_mounts(list, longest); while (list) { previous = list; list = list->next; free(previous); } } int make_dir(char *path, mode_t mode, uid_t uid, gid_t gid) { struct passwd *pwd; struct group *grp; if (opt_dry_run) { pwd = getpwuid(uid); grp = getgrgid(gid); printf("if [ ! -e %s ]\n", path); printf("then\n"); printf(" mkdir %s\n", path); printf(" chown %s:%s %s\n", pwd->pw_name, grp->gr_name, path); printf(" chmod %o %s\n", mode, path); printf("fi\n"); return 0; } if (mkdir(path, mode)) { if (errno != EEXIST) { ERROR("Unable to create dir %s, error: %s\n", path, strerror(errno)); return 1; } } else { if (chown(path, uid, gid)) { ERROR("Unable to change ownership of %s, error: %s\n", path, strerror(errno)); return 1; } if (chmod(path, mode)) { ERROR("Unable to change permission on %s, error: %s\n", path, strerror(errno)); return 1; } } return 0; } /** * ensure_dir will build the entire directory structure up to and * including path, all directories built will be owned by * user:group and permissions will be set to mode. 
*/ int ensure_dir(char *path, mode_t mode, uid_t uid, gid_t gid) { char *idx; if (!path || strlen(path) == 0) return 0; idx = strchr(path + 1, '/'); do { if (idx) *idx = '\0'; if (make_dir(path, mode, uid, gid)) return 1; if (idx) { *idx = '/'; idx++; } } while ((idx = strchr(idx, '/')) != NULL); if (make_dir(path, mode, uid, gid)) return 1; return 0; } int check_if_already_mounted(struct mount_list *list, char *path) { while (list) { if (!strcmp(list->entry.mnt_dir, path)) return 1; list = list->next; } return 0; } int mount_dir(char *path, char *options, mode_t mode, uid_t uid, gid_t gid) { struct passwd *pwd; struct group *grp; struct mntent entry; FILE *mounts; char dummy; int useMtab; struct mount_list *list, *previous; list = collect_active_mounts(NULL); if (list && check_if_already_mounted(list, path)) { WARNING("Directory %s is already mounted.\n", path); while (list) { previous = list; list = list->next; free(previous); } return 0; } while (list) { previous = list; list = list->next; free(previous); } if (opt_dry_run) { pwd = getpwuid(uid); grp = getgrgid(gid); printf("mount -t %s none %s -o %s\n", FS_NAME, path, options); printf("chown %s:%s %s\n", pwd->pw_name, grp->gr_name, path); printf("chmod %o %s\n", mode, path); } else { if (mount("none", path, FS_NAME, 0, options)) { ERROR("Unable to mount %s, error: %s\n", path, strerror(errno)); return 1; } /* Check if mtab is a symlink */ useMtab = (readlink(MOUNTED, &dummy, 1) < 0); if (useMtab) { mounts = setmntent(MOUNTED, "a+"); if (mounts) { entry.mnt_fsname = FS_NAME; entry.mnt_dir = path; entry.mnt_type = FS_NAME; entry.mnt_opts = options; entry.mnt_freq = 0; entry.mnt_passno = 0; if (addmntent(mounts, &entry)) WARNING("Unable to add entry %s to %s, error: %s\n", path, MOUNTED, strerror(errno)); endmntent(mounts); } else { WARNING("Unable to open %s, error: %s\n", MOUNTED, strerror(errno)); } } if (chown(path, uid, gid)) { ERROR("Unable to change ownership of %s, error: %s\n", path, strerror(errno)); 
return 1; } if (chmod(path, mode)) { ERROR("Unable to set permissions on %s, error: %s\n", path, strerror(errno)); return 1; } } return 0; } void scale_size(char *buf, unsigned long pagesize) { if(pagesize >= GB) snprintf(buf, OPT_MAX, "%luGB", pagesize / GB); else if(pagesize >= MB) snprintf(buf, OPT_MAX, "%luMB", pagesize / MB); else snprintf(buf, OPT_MAX, "%luKB", pagesize / KB); } void create_mounts(char *user, char *group, char *base, mode_t mode) { struct hpage_pool pools[MAX_POOLS]; char path[PATH_MAX]; char options[OPT_MAX]; char limits[OPT_MAX]; char scaled[OPT_MAX]; int cnt, pos; struct passwd *pwd; struct group *grp; uid_t uid = 0; gid_t gid = 0; if (geteuid() != 0) { ERROR("Mounts can only be created by root\n"); exit(EXIT_FAILURE); } if (user) { pwd = getpwnam(user); if (!pwd) { ERROR("Could not find specified user %s\n", user); exit(EXIT_FAILURE); } uid = pwd->pw_uid; } else if (group) { grp = getgrnam(group); if (!grp) { ERROR("Could not find specified group %s\n", group); exit(EXIT_FAILURE); } gid = grp->gr_gid; } if (ensure_dir(base, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH, 0, 0)) exit(EXIT_FAILURE); cnt = hpool_sizes(pools, MAX_POOLS); if (cnt < 0) { ERROR("Unable to obtain pools list\n"); exit(EXIT_FAILURE); } for (pos=0; cnt--; pos++) { scaled[0] = 0; scale_size(scaled, pools[pos].pagesize); if (user) snprintf(path, PATH_MAX, "%s/%s/pagesize-%s", base, user, scaled); else if (group) snprintf(path, PATH_MAX, "%s/%s/pagesize-%s", base, group, scaled); else snprintf(path, PATH_MAX, "%s/pagesize-%s", base, scaled); snprintf(options, OPT_MAX, "pagesize=%ld", pools[pos].pagesize); /* Yes, this could be cleverer */ if (opt_limit_mount_size && opt_limit_mount_inodes) snprintf(limits, OPT_MAX, ",size=%lu,nr_inodes=%d", opt_limit_mount_size, opt_limit_mount_inodes); else { if (opt_limit_mount_size) snprintf(limits, OPT_MAX, ",size=%lu", opt_limit_mount_size); if (opt_limit_mount_inodes) snprintf(limits, OPT_MAX, ",nr_inodes=%d", 
opt_limit_mount_inodes); } /* Append limits if specified */ if (limits[0] != 0) { size_t maxlen = OPT_MAX - strlen(options); if (maxlen > strlen(limits)) strcat(options, limits); else WARNING("String limitations met, cannot append limitations onto mount options string. Increase OPT_MAX"); } if (ensure_dir(path, mode, uid, gid)) exit(EXIT_FAILURE); if (mount_dir(path, options, mode, uid, gid)) exit(EXIT_FAILURE); } } /** * show_mem shouldn't change the behavior of any of its * callers, it only prints a message to the user showing the * total amount of memory in the system (in megabytes). */ void show_mem() { long mem_total; mem_total = read_meminfo(MEM_TOTAL); printf("Total System Memory: %ld MB\n\n", mem_total / 1024); } /** * check_swap shouldn't change the behavior of any of its * callers, it only prints a message to the user if something * is being done that might fail without swap available. i.e. * resizing a huge page pool */ void check_swap() { long swap_sz; long swap_total; swap_total = read_meminfo(SWAP_TOTAL); if (swap_total <= 0) { WARNING("There is no swap space configured, resizing hugepage pool may fail\n"); WARNING("Use --add-temp-swap option to temporarily add swap during the resize\n"); return; } swap_sz = read_meminfo(SWAP_FREE); /* meminfo keeps values in kb, but we use bytes for hpage sizes */ swap_sz *= 1024; if (swap_sz <= gethugepagesize()) { WARNING("There is very little swap space free, resizing hugepage pool may fail\n"); WARNING("Use --add-temp-swap option to temporarily add swap during the resize\n"); } } #define ZONEINFO_LINEBUF 1024 long recommended_minfreekbytes(void) { FILE *f; char buf[ZONEINFO_LINEBUF]; int nr_zones = 0; long recommended_min; long pageblock_kbytes = kernel_default_hugepage_size() / 1024; /* Detect the number of zones in the system */ f = fopen(PROCZONEINFO, "r"); if (f == NULL) { WARNING("Unable to open " PROCZONEINFO); return 0; } while (fgets(buf, ZONEINFO_LINEBUF, f) != NULL) { if (strncmp(buf, "Node ", 5) == 0) 
nr_zones++; } fclose(f); /* Make sure at least 2 pageblocks are free for MIGRATE_RESERVE */ recommended_min = pageblock_kbytes * nr_zones * 2; /* * Make sure that on average at least two pageblocks are almost free * of another type, one for a migratetype to fall back to and a * second to avoid subsequent fallbacks of other types There are 3 * MIGRATE_TYPES we care about. */ recommended_min += pageblock_kbytes * nr_zones * 3 * 3; return recommended_min; } void set_recommended_minfreekbytes(void) { long recommended_min = recommended_minfreekbytes(); if (opt_dry_run) { printf("echo \"%ld\" > %s\n", recommended_min, PROCMINFREEKBYTES); return; } DEBUG("Setting min_free_kbytes to %ld\n", recommended_min); file_write_ulong(PROCMINFREEKBYTES, (unsigned long)recommended_min); } /* * check_minfreekbytes does not alter the value of min_free_kbytes. It just * reports what the current value is and what it should be */ void check_minfreekbytes(void) { long min_free_kbytes = file_read_ulong(PROCMINFREEKBYTES, NULL); long recommended_min = recommended_minfreekbytes(); /* There should be at least one pageblock free per zone in the system */ if (recommended_min > min_free_kbytes) { printf("\n"); printf("The " PROCMINFREEKBYTES " of %ld is too small. 
To maximiuse efficiency\n", min_free_kbytes); printf("of fragmentation avoidance, there should be at least one huge page free per zone\n"); printf("in the system which minimally requires a min_free_kbytes value of %ld\n", recommended_min); } } unsigned long long recommended_shmmax(void) { struct hpage_pool pools[MAX_POOLS]; unsigned long long recommended_shmmax = 0; int pos, cnt; cnt = hpool_sizes(pools, MAX_POOLS); if (cnt < 0) { ERROR("unable to obtain pools list"); exit(EXIT_FAILURE); } for (pos = 0; cnt--; pos++) recommended_shmmax += ((unsigned long long)pools[pos].maximum * pools[pos].pagesize); return recommended_shmmax; } void set_recommended_shmmax(void) { int ret; unsigned long max_recommended = -1UL; unsigned long long recommended = recommended_shmmax(); if (recommended == 0) { printf("\n"); WARNING("We can only set a recommended shmmax when huge pages are configured!\n"); return; } if (recommended > max_recommended) recommended = max_recommended; DEBUG("Setting shmmax to %llu\n", recommended); ret = file_write_ulong(PROCSHMMAX, (unsigned long)recommended); if (!ret) { INFO("To make shmmax settings persistent, add the following line to /etc/sysctl.conf:\n"); INFO(" kernel.shmmax = %llu\n", recommended); } } void check_shmmax(void) { long current_shmmax = file_read_ulong(PROCSHMMAX, NULL); long recommended = recommended_shmmax(); if (current_shmmax != recommended) { printf("\n"); printf("A " PROCSHMMAX " value of %ld bytes may be sub-optimal. To maximise\n", current_shmmax); printf("shared memory usage, this should be set to the size of the largest shared memory\n"); printf("segment size you want to be able to use. Alternatively, set it to a size matching\n"); printf("the maximum possible allocation size of all huge pages. 
This can be done\n"); printf("automatically, using the --set-recommended-shmmax option.\n"); } if (recommended == 0) { printf("\n"); WARNING("We can't make a shmmax recommendation until huge pages are configured!\n"); return; } printf("\n"); printf("The recommended shmmax for your currently allocated huge pages is %ld bytes.\n", recommended); printf("To make shmmax settings persistent, add the following line to /etc/sysctl.conf:\n"); printf(" kernel.shmmax = %ld\n", recommended); } void set_hugetlb_shm_group(gid_t gid, char *group) { int ret; DEBUG("Setting hugetlb_shm_group to %d (%s)\n", gid, group); ret = file_write_ulong(PROCHUGETLBGROUP, (unsigned long)gid); if (!ret) { INFO("To make hugetlb_shm_group settings persistent, add the following line to /etc/sysctl.conf:\n"); INFO(" vm.hugetlb_shm_group = %d\n", gid); } } /* heisted from shadow-utils/libmisc/list.c::is_on_list() */ static int user_in_group(char *const *list, const char *member) { while (*list != NULL) { if (strcmp(*list, member) == 0) { return 1; } list++; } return 0; } void check_user(void) { uid_t uid; gid_t gid; struct passwd *pwd; struct group *grp; gid = (gid_t)file_read_ulong(PROCHUGETLBGROUP, NULL); grp = getgrgid(gid); if (!grp) { printf("\n"); WARNING("Group ID %d in hugetlb_shm_group doesn't appear to be a valid group!\n", gid); return; } uid = getuid(); pwd = getpwuid(uid); /* Don't segfault if user does not have a passwd entry. 
*/ if (!pwd) { printf("\n"); WARNING("User uid %d is not in the password file!\n", uid); return; } if (gid != pwd->pw_gid && !user_in_group(grp->gr_mem, pwd->pw_name) && uid != 0) { printf("\n"); WARNING("User %s (uid: %d) is not a member of the hugetlb_shm_group %s (gid: %d)!\n", pwd->pw_name, uid, grp->gr_name, gid); } else { printf("\n"); printf("To make your hugetlb_shm_group settings persistent, add the following line to /etc/sysctl.conf:\n"); printf(" vm.hugetlb_shm_group = %d\n", gid); } } void add_temp_swap(long page_size) { char path[PATH_MAX]; char file[PATH_MAX]; char mkswap_cmd[PATH_MAX]; FILE *f; char *buf; long swap_size; long pid; int ret; int num_pages; if (geteuid() != 0) { ERROR("Swap can only be manipulated by root\n"); exit(EXIT_FAILURE); } pid = getpid(); snprintf(path, PATH_MAX, "%s/swap/temp", MOUNT_DIR); snprintf(file, PATH_MAX, "%s/swapfile-%ld", path, pid); /* swapsize is 5 hugepages */ if (opt_temp_swap == -1) num_pages = 5; else num_pages = opt_temp_swap; swap_size = num_pages * page_size; if (ensure_dir(path, S_IRWXU | S_IRGRP | S_IXGRP, 0, 0)) exit(EXIT_FAILURE); if (opt_dry_run) { printf("dd bs=1024 count=%ld if=/dev/zero of=%s\n", swap_size / 1024, file); printf("mkswap %s\nswapon %s\n", file, file); return; } f = fopen(file, "wx"); if (!f) { WARNING("Couldn't open %s: %s\n", file, strerror(errno)); opt_temp_swap = 0; return; } buf = malloc(swap_size); memset(buf, 0, swap_size); fwrite(buf, sizeof(char), swap_size, f); free(buf); fclose(f); snprintf(mkswap_cmd, PATH_MAX, "mkswap %s", file); ret = system(mkswap_cmd); if (WIFSIGNALED(ret)) { WARNING("Call to mkswap failed\n"); opt_temp_swap = 0; return; } else if (WIFEXITED(ret)) { ret = WEXITSTATUS(ret); if (ret) { WARNING("Call to mkswap failed\n"); opt_temp_swap = 0; return; } } DEBUG("swapon %s\n", file); if (swapon(file, 0)) { WARNING("swapon on %s failed: %s\n", file, strerror(errno)); opt_temp_swap = 0; } } void rem_temp_swap() { char file[PATH_MAX]; long pid; pid = getpid(); 
snprintf(file, PATH_MAX, "%s/swap/temp/swapfile-%ld", MOUNT_DIR, pid); if (opt_dry_run) { printf("swapoff %s\nrm -f %s\n", file, file); return; } if (swapoff(file)) WARNING("swapoff on %s failed: %s\n", file, strerror(errno)); remove(file); DEBUG("swapoff %s\n", file); } void add_ramdisk_swap(long page_size) { char ramdisk[PATH_MAX]; char mkswap_cmd[PATH_MAX]; int disk_num=0; int count = 0; long ramdisk_size; int ret; int fd; snprintf(ramdisk, PATH_MAX, "/dev/ram%i", disk_num); fd = open(ramdisk, O_RDONLY); ioctl(fd, BLKGETSIZE, &ramdisk_size); close(fd); ramdisk_size = ramdisk_size * 512; count = (page_size/ramdisk_size) + 1; if (count > 1) { INFO("Swap will be initialized on multiple ramdisks because\n\ ramdisk size is less than huge page size. To avoid\n\ this in the future, use kernel command line parameter\n\ ramdisk_size=N, to set ramdisk size to N blocks.\n"); } while (count > 0) { snprintf(ramdisk, PATH_MAX, "/dev/ram%i", disk_num); if (access(ramdisk, F_OK) != 0){ break; } disk_num++; if (opt_dry_run) { printf("mkswap %s\nswapon %s\n", ramdisk, ramdisk); } else { snprintf(mkswap_cmd, PATH_MAX, "mkswap %s", ramdisk); ret = system(mkswap_cmd); if (WIFSIGNALED(ret)) { WARNING("Call to mkswap failed\n"); continue; } else if (WIFEXITED(ret)) { ret = WEXITSTATUS(ret); if (ret) { WARNING("Call to mkswap failed\n"); continue; } } DEBUG("swapon %s\n", ramdisk); if (swapon(ramdisk, 0)) { WARNING("swapon on %s failed: %s\n", ramdisk, strerror(errno)); opt_temp_swap = 0; continue; } } count--; strcat(ramdisk_list, " "); strcat(ramdisk_list, ramdisk); } } void rem_ramdisk_swap(){ char *ramdisk; char *iter = NULL; ramdisk = strtok_r(ramdisk_list, " ", &iter); while (ramdisk != NULL) { if (opt_dry_run) { printf("swapoff %s\n", ramdisk); } else { DEBUG("swapoff %s\n", ramdisk); if (swapoff(ramdisk)) { WARNING("swapoff on %s failed: %s\n", ramdisk, strerror(errno)); continue; } } ramdisk = strtok_r(NULL, " ", &iter); } } void set_trans_opt(const char *file, const char 
*value) { FILE *f; if (geteuid() != 0) { ERROR("Transparent huge page options can only be set by root\n"); exit(EXIT_FAILURE); } if (opt_dry_run) { printf("echo '%s' > %s\n", value, file); return; } f = fopen(file, "w"); if (!f) { ERROR("Couldn't open %s: %s\n", file, strerror(errno)); return; } fprintf(f, "%s", value); fclose(f); } enum { POOL_MIN, POOL_MAX, POOL_BOTH, }; static long value_adjust(char *adjust_str, long base, long page_size) { long long adjust; char *iter; /* Convert and validate the adjust. */ errno = 0; adjust = strtol(adjust_str, &iter, 0); /* Catch strtol errors and sizes that overflow the native word size */ if (errno || adjust_str == iter) { if (errno == ERANGE) errno = EOVERFLOW; else errno = EINVAL; ERROR("%s: invalid adjustment\n", adjust_str); exit(EXIT_FAILURE); } /* size_to_smaller_unit() only works with positive values */ if (adjust_str[0] == '-') adjust = -adjust; switch (*iter) { case 'G': case 'g': adjust = size_to_smaller_unit(adjust); case 'M': case 'm': adjust = size_to_smaller_unit(adjust); case 'K': case 'k': adjust = size_to_smaller_unit(adjust); adjust = adjust / page_size; } /* if previously negative, make negative again */ if (adjust_str[0] == '-') adjust = -adjust; if (adjust_str[0] != '+' && adjust_str[0] != '-') base = 0; /* Ensure we neither go negative nor exceed LONG_MAX. */ if (adjust < 0 && -adjust > base) { adjust = -base; } if (adjust > 0 && (base + adjust) < base) { adjust = LONG_MAX - base; } base += adjust; DEBUG("Returning page count of %ld\n", base); return base; } void pool_adjust(char *cmd, unsigned int counter) { struct hpage_pool pools[MAX_POOLS]; int pos; int cnt; char *iter = NULL; char *page_size_str = NULL; char *adjust_str = NULL; long page_size; unsigned long min; unsigned long min_orig; unsigned long max; unsigned long last_pool_value; /* Extract the pagesize and adjustment. 
*/ page_size_str = strtok_r(cmd, ":", &iter); if (page_size_str) adjust_str = strtok_r(NULL, ":", &iter); if (!page_size_str || !adjust_str) { ERROR("%s: invalid resize specification\n", cmd); exit(EXIT_FAILURE); } INFO("page_size<%s> adjust<%s> counter<%d>\n", page_size_str, adjust_str, counter); /* Convert and validate the page_size. */ if (strcmp(page_size_str, "DEFAULT") == 0) page_size = kernel_default_hugepage_size(); else page_size = parse_page_size(page_size_str); DEBUG("Working with page_size of %ld\n", page_size); cnt = hpool_sizes(pools, MAX_POOLS); if (cnt < 0) { ERROR("unable to obtain pools list"); exit(EXIT_FAILURE); } for (pos = 0; cnt--; pos++) { if (pools[pos].pagesize == page_size) break; } if (cnt < 0) { ERROR("%s: unknown page size\n", page_size_str); exit(EXIT_FAILURE); } min_orig = min = pools[pos].minimum; max = pools[pos].maximum; if (counter == POOL_BOTH) { min = value_adjust(adjust_str, min, page_size); max = min; } else if (counter == POOL_MIN) { min = value_adjust(adjust_str, min, page_size); if (min > max) max = min; } else { max = value_adjust(adjust_str, max, page_size); if (max < min) min = max; } INFO("%ld, %ld -> %ld, %ld\n", pools[pos].minimum, pools[pos].maximum, min, max); if ((pools[pos].maximum - pools[pos].minimum) < (max - min)) { INFO("setting HUGEPAGES_OC to %ld\n", (max - min)); set_huge_page_counter(page_size, HUGEPAGES_OC, (max - min)); } if (opt_hard) cnt = 5; else cnt = -1; if (min > min_orig) { if (opt_temp_swap) add_temp_swap(page_size); if (opt_ramdisk_swap) add_ramdisk_swap(page_size); check_swap(); } if (opt_obey_mempolicy && get_huge_page_counter(page_size, HUGEPAGES_TOTAL_MEMPOL) < 0) { opt_obey_mempolicy = 0; WARNING("Counter for NUMA huge page allocations is not found, continuing with normal pool adjustment\n"); } INFO("setting HUGEPAGES_TOTAL%s to %ld\n", opt_obey_mempolicy ? "_MEMPOL" : "", min); set_huge_page_counter(page_size, opt_obey_mempolicy ? 
HUGEPAGES_TOTAL_MEMPOL : HUGEPAGES_TOTAL, min); get_pool_size(page_size, &pools[pos]); /* If we fail to make an allocation, retry if user requests */ last_pool_value = pools[pos].minimum; while ((pools[pos].minimum != min) && (cnt > 0)) { /* Make note if progress is being made and sleep for IO */ if (last_pool_value == pools[pos].minimum) cnt--; else cnt = 5; sleep(6); last_pool_value = pools[pos].minimum; INFO("Retrying allocation HUGEPAGES_TOTAL%s to %ld current %ld\n", opt_obey_mempolicy ? "_MEMPOL" : "", min, pools[pos].minimum); set_huge_page_counter(page_size, opt_obey_mempolicy ? HUGEPAGES_TOTAL_MEMPOL : HUGEPAGES_TOTAL, min); get_pool_size(page_size, &pools[pos]); } if (min > min_orig && !opt_swap_persist) { if (opt_temp_swap) rem_temp_swap(); else if (opt_ramdisk_swap) rem_ramdisk_swap(); } /* * HUGEPAGES_TOTAL is not guarenteed to check to exactly the figure * requested should there be insufficient pages. Check the new * value and adjust HUGEPAGES_OC accordingly. */ if (pools[pos].minimum != min) { WARNING("failed to set pool minimum to %ld became %ld\n", min, pools[pos].minimum); min = pools[pos].minimum; } if (pools[pos].maximum != max) { INFO("setting HUGEPAGES_OC to %ld\n", (max - min)); set_huge_page_counter(page_size, HUGEPAGES_OC, (max - min)); } } void page_sizes(int all) { struct hpage_pool pools[MAX_POOLS]; int pos; int cnt; cnt = hpool_sizes(pools, MAX_POOLS); if (cnt < 0) { ERROR("unable to obtain pools list"); exit(EXIT_FAILURE); } qsort(pools, cnt, sizeof(pools[0]), cmpsizes); for (pos = 0; cnt--; pos++) { if (all || (pools[pos].maximum && hugetlbfs_find_path_for_size(pools[pos].pagesize))) printf("%ld\n", pools[pos].pagesize); } } void explain() { show_mem(); mounts_list_all(); printf("\nHuge page pools:\n"); pool_list(); printf("\nHuge page sizes with configured pools:\n"); page_sizes(0); check_minfreekbytes(); check_shmmax(); check_swap(); check_user(); printf("\nNote: Permanent swap space should be preferred when dynamic " "huge page 
pools are used.\n"); } int main(int argc, char** argv) { int ops; int has_hugepages = kernel_has_hugepages(); char opts[] = "+hdv"; char base[PATH_MAX]; char *opt_min_adj[MAX_POOLS], *opt_max_adj[MAX_POOLS]; char *opt_user_mounts = NULL, *opt_group_mounts = NULL; int opt_list_mounts = 0, opt_pool_list = 0, opt_create_mounts = 0; int opt_global_mounts = 0, opt_pgsizes = 0, opt_pgsizes_all = 0; int opt_explain = 0, minadj_count = 0, maxadj_count = 0; int opt_trans_always = 0, opt_trans_never = 0, opt_trans_madvise = 0; int opt_khuge_pages = 0, opt_khuge_scan = 0, opt_khuge_alloc = 0; int ret = 0, index = 0; char *khuge_pages = NULL, *khuge_alloc = NULL, *khuge_scan = NULL; gid_t opt_gid = 0; struct group *opt_grp = NULL; int group_invalid = 0; struct option long_opts[] = { {"help", no_argument, NULL, 'h'}, {"verbose", required_argument, NULL, 'v' }, {"list-all-mounts", no_argument, NULL, LONG_LIST_ALL_MOUNTS}, {"pool-list", no_argument, NULL, LONG_POOL_LIST}, {"pool-pages-min", required_argument, NULL, LONG_POOL_MIN_ADJ}, {"pool-pages-max", required_argument, NULL, LONG_POOL_MAX_ADJ}, {"obey-mempolicy", no_argument, NULL, LONG_POOL_MEMPOL}, {"thp-always", no_argument, NULL, LONG_TRANS_ALWAYS}, {"thp-madvise", no_argument, NULL, LONG_TRANS_MADVISE}, {"thp-never", no_argument, NULL, LONG_TRANS_NEVER}, {"thp-khugepaged-pages", required_argument, NULL, LONG_KHUGE_PAGES}, {"thp-khugepaged-scan-sleep", required_argument, NULL, LONG_KHUGE_SCAN}, {"thp-khugepaged-alloc-sleep", required_argument, NULL, LONG_KHUGE_ALLOC}, {"set-recommended-min_free_kbytes", no_argument, NULL, LONG_SET_RECOMMENDED_MINFREEKBYTES}, {"set-recommended-shmmax", no_argument, NULL, LONG_SET_RECOMMENDED_SHMMAX}, {"set-shm-group", required_argument, NULL, LONG_SET_HUGETLB_SHM_GROUP}, {"enable-zone-movable", no_argument, NULL, LONG_MOVABLE_ENABLE}, {"disable-zone-movable", no_argument, NULL, LONG_MOVABLE_DISABLE}, {"hard", no_argument, NULL, LONG_HARD}, {"add-temp-swap", optional_argument, NULL, 
LONG_SWAP_DISK}, {"add-ramdisk-swap", no_argument, NULL, LONG_SWAP_RAMDISK}, {"persist", no_argument, NULL, LONG_SWAP_PERSIST}, {"create-mounts", no_argument, NULL, LONG_CREATE_MOUNTS}, {"create-user-mounts", required_argument, NULL, LONG_CREATE_USER_MOUNTS}, {"create-group-mounts", required_argument, NULL, LONG_CREATE_GROUP_MOUNTS}, {"create-global-mounts", no_argument, NULL, LONG_CREATE_GLOBAL_MOUNTS}, {"max-size", required_argument, NULL, LONG_LIMIT_SIZE}, {"max-inodes", required_argument, NULL, LONG_LIMIT_INODES}, {"page-sizes", no_argument, NULL, LONG_PAGE_SIZES}, {"page-sizes-all", no_argument, NULL, LONG_PAGE_AVAIL}, {"dry-run", no_argument, NULL, 'd'}, {"explain", no_argument, NULL, LONG_EXPLAIN}, {0}, }; hugetlbfs_setup_debug(); setup_mounts(); verbose_init(); ops = 0; while (ret != -1) { ret = getopt_long(argc, argv, opts, long_opts, &index); switch (ret) { case -1: break; case '?': print_usage(); exit(EXIT_FAILURE); case 'h': print_usage(); exit(EXIT_SUCCESS); case 'v': verbose(optarg); continue; case 'd': opt_dry_run = 1; continue; default: /* All other commands require hugepage support. */ if (! 
has_hugepages) { ERROR("kernel does not support huge pages\n"); exit(EXIT_FAILURE); } } switch (ret) { case -1: break; case LONG_HARD: opt_hard = 1; continue; case LONG_SWAP_DISK: if (optarg) opt_temp_swap = atoi(optarg); else opt_temp_swap = -1; break; case LONG_SWAP_RAMDISK: opt_ramdisk_swap = 1; break; case LONG_SWAP_PERSIST: opt_swap_persist = 1; case LONG_LIST_ALL_MOUNTS: opt_list_mounts = 1; break; case LONG_POOL_LIST: opt_pool_list = 1; break; case LONG_POOL_MIN_ADJ: if (minadj_count == MAX_POOLS) { WARNING("Attempting to adjust an invalid " "pool or a pool multiple times, " "ignoring request: '%s'\n", optarg); } else { opt_min_adj[minadj_count++] = optarg; } break; case LONG_POOL_MEMPOL: opt_obey_mempolicy = 1; break; case LONG_TRANS_ALWAYS: opt_trans_always = 1; break; case LONG_TRANS_MADVISE: opt_trans_madvise = 1; break; case LONG_TRANS_NEVER: opt_trans_never = 1; break; case LONG_KHUGE_PAGES: opt_khuge_pages = 1; khuge_pages = optarg; break; case LONG_KHUGE_SCAN: opt_khuge_scan = 1; khuge_scan = optarg; break; case LONG_KHUGE_ALLOC: opt_khuge_alloc = 1; khuge_alloc = optarg; break; case LONG_POOL_MAX_ADJ: if (! 
kernel_has_overcommit()) { ERROR("kernel does not support overcommit, " "max cannot be adjusted\n"); exit(EXIT_FAILURE); } if (maxadj_count == MAX_POOLS) { WARNING("Attempting to adjust an invalid " "pool or a pool multiple times, " "ignoring request: '%s'\n", optarg); } else { opt_max_adj[maxadj_count++] = optarg; } break; case LONG_MOVABLE_ENABLE: opt_movable = 1; break; case LONG_SET_RECOMMENDED_MINFREEKBYTES: opt_set_recommended_minfreekbytes = 1; break; case LONG_SET_RECOMMENDED_SHMMAX: opt_set_recommended_shmmax = 1; break; case LONG_SET_HUGETLB_SHM_GROUP: opt_grp = getgrnam(optarg); if (!opt_grp) { opt_gid = atoi(optarg); if (opt_gid == 0 && strcmp(optarg, "0")) group_invalid = 1; opt_grp = getgrgid(opt_gid); if (!opt_grp) group_invalid = 1; } else { opt_gid = opt_grp->gr_gid; } if (group_invalid) { ERROR("Invalid group specification (%s)\n", optarg); exit(EXIT_FAILURE); } opt_set_hugetlb_shm_group = 1; break; case LONG_MOVABLE_DISABLE: opt_movable = 0; break; case LONG_CREATE_MOUNTS: opt_create_mounts = 1; break; case LONG_CREATE_USER_MOUNTS: opt_user_mounts = optarg; break; case LONG_CREATE_GROUP_MOUNTS: opt_group_mounts = optarg; break; case LONG_CREATE_GLOBAL_MOUNTS: opt_global_mounts = 1; break; case LONG_LIMIT_SIZE: /* Not a pagesize, but the conversions the same */ opt_limit_mount_size = parse_page_size(optarg); if (!opt_limit_mount_size) WARNING("Mount max size specification 0, invalid or overflowed\n"); break; case LONG_LIMIT_INODES: opt_limit_mount_inodes = atoi(optarg); break; case LONG_PAGE_SIZES: opt_pgsizes = 1; break; case LONG_PAGE_AVAIL: opt_pgsizes_all = 1; break; case LONG_EXPLAIN: opt_explain = 1; break; default: WARNING("unparsed option %08x\n", ret); ret = -1; break; } if (ret != -1) ops++; } verbose_expose(); if (opt_list_mounts) mounts_list_all(); if (opt_pool_list) pool_list(); if (opt_movable != -1) setup_zone_movable(opt_movable); if (opt_trans_always) set_trans_opt(TRANS_ENABLE, ALWAYS); if (opt_trans_madvise) 
set_trans_opt(TRANS_ENABLE, MADVISE); if (opt_trans_never) set_trans_opt(TRANS_ENABLE, NEVER); if (opt_khuge_pages) set_trans_opt(KHUGE_SCAN_PAGES, khuge_pages); if (opt_khuge_alloc) set_trans_opt(KHUGE_ALLOC_SLEEP, khuge_alloc); if (opt_khuge_scan) set_trans_opt(KHUGE_SCAN_SLEEP, khuge_scan); if (opt_set_recommended_minfreekbytes) set_recommended_minfreekbytes(); if (opt_set_recommended_shmmax) set_recommended_shmmax(); if (opt_set_hugetlb_shm_group) set_hugetlb_shm_group(opt_gid, opt_grp->gr_name); while (--minadj_count >= 0) { if (! kernel_has_overcommit()) pool_adjust(opt_min_adj[minadj_count], POOL_BOTH); else pool_adjust(opt_min_adj[minadj_count], POOL_MIN); } while (--maxadj_count >=0) pool_adjust(opt_max_adj[maxadj_count], POOL_MAX); if (opt_create_mounts) { snprintf(base, PATH_MAX, "%s", MOUNT_DIR); create_mounts(NULL, NULL, base, S_IRWXU | S_IRWXG); } if (opt_user_mounts != NULL) { snprintf(base, PATH_MAX, "%s/user", MOUNT_DIR); create_mounts(opt_user_mounts, NULL, base, S_IRWXU); } if (opt_group_mounts) { snprintf(base, PATH_MAX, "%s/group", MOUNT_DIR); create_mounts(NULL, opt_group_mounts, base, S_IRWXG); } if (opt_global_mounts) { snprintf(base, PATH_MAX, "%s/global", MOUNT_DIR); create_mounts(NULL, NULL, base, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX ); } if (opt_pgsizes) page_sizes(0); if (opt_pgsizes_all) page_sizes(1); if (opt_explain) explain(); index = optind; if ((argc - index) != 0 || ops == 0) { print_usage(); exit(EXIT_FAILURE); } exit(EXIT_SUCCESS); }