/* * libhugetlbfs - Easy use of Linux hugepages * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #define _LARGEFILE64_SOURCE /* Need this for statfs64 */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libhugetlbfs_internal.h" #include "hugetlbfs.h" struct libhugeopts_t __hugetlb_opts; static int hugepagesize_errno; /* = 0 */ #define MAX_HPAGE_SIZES 10 static struct hpage_size hpage_sizes[MAX_HPAGE_SIZES]; static int nr_hpage_sizes; static int hpage_sizes_default_idx = -1; static long default_size; /********************************************************************/ /* Internal functions */ /********************************************************************/ /* * Lookup the kernel default page size. 
 */
long kernel_default_hugepage_size()
{
	/* Cached in the file-scope 'default_size' after the first read;
	 * kernel_default_hugepage_size_reset() clears the cache. */
	if (default_size == 0) {
		default_size = file_read_ulong(MEMINFO, "Hugepagesize:");
		default_size = size_to_smaller_unit(default_size); /* kB to B */
	}

	return default_size;
}

/* Drop the cached default huge page size so the next query re-reads it. */
void kernel_default_hugepage_size_reset(void)
{
	default_size = 0;
}

#define BUF_SZ 256
#define MEMINFO_SIZE 2048

/*
 * Convert a quantity in a given unit to the next smallest unit by
 * multiplying the quantity by 1024 (eg. convert 1MB to 1024kB).
 * If the conversion would overflow the variable, return ULONGLONG_MAX to
 * signify the error.
 */
unsigned long long size_to_smaller_unit(unsigned long long size)
{
	/* Unsigned multiplication wraps on overflow, so comparing the
	 * product against the original operand is a well-defined
	 * overflow check; -1 converts to ULLONG_MAX. */
	if (size * 1024 < size)
		return -1;
	else
		return size * 1024;
}

/*
 * Convert a page size string with an optional unit suffix into a page size
 * in bytes.
 *
 * On error, -1 is returned and errno is set appropriately:
 *	EINVAL     - str could not be parsed or was not greater than zero
 *	EOVERFLOW  - Overflow when converting from the specified units
 */
long parse_page_size(const char *str)
{
	char *pos;
	long size;

	errno = 0;
	size = strtol(str, &pos, 0);
	/* Catch strtoul errors and sizes that overflow the native word size */
	if (errno || str == pos || size <= 0) {
		if (errno == ERANGE)
			errno = EOVERFLOW;
		else
			errno = EINVAL;
		return -1;
	}

	/* Deliberate fallthrough: each suffix applies one more factor of
	 * 1024 (G -> M -> K -> bytes). */
	switch (*pos) {
	case 'G':
	case 'g':
		size = size_to_smaller_unit(size);
		/* fallthrough */
	case 'M':
	case 'm':
		size = size_to_smaller_unit(size);
		/* fallthrough */
	case 'K':
	case 'k':
		size = size_to_smaller_unit(size);
	}

	/* size_to_smaller_unit() returns ULLONG_MAX on overflow, which is
	 * negative once narrowed back into a signed long. */
	if (size < 0)
		errno = EOVERFLOW;
	return size;
}

/* Maps each pool counter to its /proc/meminfo tag (when one exists for
 * the default page size) and its per-size sysfs file name. */
struct hugetlb_pool_counter_info_t {
	char *meminfo_key;
	char *sysfs_file;
};

static struct hugetlb_pool_counter_info_t hugetlb_counter_info[] = {
	[HUGEPAGES_TOTAL] = {
		.meminfo_key	= "HugePages_Total:",
		.sysfs_file	= "nr_hugepages",
	},
	[HUGEPAGES_TOTAL_MEMPOL] = {
		.meminfo_key	= "HugePages_Total:",
		.sysfs_file	= "nr_hugepages_mempolicy",
	},
	[HUGEPAGES_FREE] = {
		.meminfo_key	= "HugePages_Free:",
		.sysfs_file	= "free_hugepages",
	},
	[HUGEPAGES_RSVD] = {
		.meminfo_key	= "HugePages_Rsvd:",
		.sysfs_file	= "resv_hugepages",
	},
	[HUGEPAGES_SURP] = {
		.meminfo_key	= "HugePages_Surp:",
		.sysfs_file	= "surplus_hugepages",
	},
	[HUGEPAGES_OC] = {
		.meminfo_key	= NULL,
		.sysfs_file	= "nr_overcommit_hugepages"
	},
};

/*
 * Read numeric data from raw and tagged kernel status files.  Used to read
 * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag).
 *
 * Returns the parsed value, or -1 on open/read/parse failure or when the
 * requested tag is absent.
 */
long file_read_ulong(char *file, const char *tag)
{
	int fd;
	char buf[MEMINFO_SIZE];
	int len, readerr;
	char *p, *q;
	long val;

	fd = open(file, O_RDONLY);
	if (fd < 0) {
		ERROR("Couldn't open %s: %s\n", file, strerror(errno));
		return -1;
	}
	len = read(fd, buf, sizeof(buf));
	/* Save errno before close() can clobber it. */
	readerr = errno;
	close(fd);
	if (len < 0) {
		ERROR("Error reading %s: %s\n", file, strerror(readerr));
		return -1;
	}
	if (len == sizeof(buf)) {
		/* A full buffer may mean truncated content; refuse to parse. */
		ERROR("%s is too large\n", file);
		return -1;
	}
	buf[len] = '\0';

	/* Search for a tag if provided */
	if (tag) {
		p = strstr(buf, tag);
		if (!p)
			return -1; /* looks like the line we want isn't there */
		p += strlen(tag);
	} else
		p = buf;

	val = strtol(p, &q, 0);
	/* A well-formed value is followed by whitespace (e.g. " kB\n"). */
	if (! isspace(*q)) {
		ERROR("Couldn't parse %s value\n", file);
		return -1;
	}

	return val;
}

/*
 * Write an unsigned long to a kernel control file.
 * Returns 0 on success, -1 on open or write failure.
 */
int file_write_ulong(char *file, unsigned long val)
{
	int fd, ret, buflen;
	char buf[20];

	fd = open(file, O_WRONLY);
	if (fd < 0) {
		ERROR("Couldn't open %s: %s\n", file, strerror(errno));
		return -1;
	}

	buflen = sprintf(buf, "%lu", val);
	ret = write(fd, buf, buflen);
	close(fd);
	return ret > 0 ? 0 : -1;
}

/*
 * Return the name of this executable, using buf as temporary space.
 */
#define MAX_EXE 4096
static char *get_exe_name(char *buf, int size)
{
	char *p;
	int fd;
	ssize_t nread;

	buf[0] = 0;
	fd = open("/proc/self/cmdline", O_RDONLY);
	if (fd < 0) {
		WARNING("Unable to open cmdline, no exe name\n");
		return buf;
	}

	nread = read(fd, buf, size-1);
	close(fd);

	if (nread < 0) {
		WARNING("Error %d reading cmdline, no exe name\n", errno);
		return buf;
	}

	if (nread == 0) {
		WARNING("Read zero bytes from cmdline, no exe name\n");
		return buf;
	}

	buf[nread] = 0; /* make sure we're null terminated */

	/*
	 * Take advantage of cmdline being a series of null-terminated
	 * strings.  The first string is the path to the executable in
	 * the form:
	 *
	 *	/path/to/exe
	 *
	 * The exe name starts one character after the last '/'.
	 */
	p = strrchr(buf, '/');
	if (!p)
		return buf;
	return p + 1; /* skip over "/" */
}

/*
 * Reads the contents of hugetlb environment variables and save their
 * values for later use.
 */
void hugetlbfs_setup_env()
{
	char *env;

	/* Minimal-copy is on by default; HUGETLB_MINIMAL_COPY=no disables. */
	__hugetlb_opts.min_copy = true;

	env = getenv("HUGETLB_VERBOSE");
	if (env)
		__hugetlbfs_verbose = atoi(env);

	env = getenv("HUGETLB_DEBUG");
	if (env) {
		__hugetlbfs_debug = true;
		__hugetlbfs_verbose = VERBOSE_DEBUG;
	}

	/* HUGETLB_RESTRICT_EXE limits the library to a colon-separated
	 * list of executable names; bail out early if we are not listed. */
	env = getenv("HUGETLB_RESTRICT_EXE");
	if (env) {
		char *p, *tok, *exe, buf[MAX_EXE+1], restriction[MAX_EXE];
		int found = 0;

		exe = get_exe_name(buf, sizeof buf);
		DEBUG("Found HUGETLB_RESTRICT_EXE, this exe is \"%s\"\n", exe);
		strncpy(restriction, env, sizeof restriction);
		restriction[sizeof(restriction)-1] = 0;
		for (p = restriction; (tok = strtok(p, ":")) != NULL; p = NULL) {
			DEBUG(" ...check exe match for \"%s\"\n", tok);
			if (strcmp(tok, exe) == 0) {
				found = 1;
				DEBUG("exe match - libhugetlbfs is active for this exe\n");
				break;
			}
		}
		if (!found) {
			DEBUG("No exe match - libhugetlbfs is inactive for this exe\n");
			return;
		}
	}

	env = getenv("HUGETLB_NO_PREFAULT");
	if (env)
		__hugetlbfs_prefault = false;

	__hugetlb_opts.share_path = getenv("HUGETLB_SHARE_PATH");
	__hugetlb_opts.elfmap = getenv("HUGETLB_ELFMAP");
	__hugetlb_opts.ld_preload = getenv("LD_PRELOAD");
	__hugetlb_opts.def_page_size = getenv("HUGETLB_DEFAULT_PAGE_SIZE");
	__hugetlb_opts.path = getenv("HUGETLB_PATH");
	__hugetlb_opts.features = getenv("HUGETLB_FEATURES");
	__hugetlb_opts.morecore = getenv("HUGETLB_MORECORE");
	__hugetlb_opts.heapbase = getenv("HUGETLB_MORECORE_HEAPBASE");

	/* HUGETLB_MORECORE=thp selects transparent huge pages for the heap. */
	if (__hugetlb_opts.morecore)
		__hugetlb_opts.thp_morecore =
			(strcasecmp(__hugetlb_opts.morecore, "thp") == 0);

	/* An explicit heap base makes no sense with THP; ignore it. */
	if (__hugetlb_opts.thp_morecore && __hugetlb_opts.heapbase) {
		DEBUG("Heapbase specified with THP for morecore, ignoring heapbase\n");
		__hugetlb_opts.heapbase = NULL;
	}

	env = getenv("HUGETLB_FORCE_ELFMAP");
	if (env && (strcasecmp(env, "yes") == 0))
		__hugetlb_opts.force_elfmap = 1;

	env = getenv("HUGETLB_MINIMAL_COPY");
	if (__hugetlb_opts.min_copy && env && (strcasecmp(env, "no") == 0)) {
		INFO("HUGETLB_MINIMAL_COPY=%s, disabling filesz copy "
			"optimization\n", env);
		__hugetlb_opts.min_copy = false;
	}

	env = getenv("HUGETLB_SHARE");
	if (env)
		__hugetlb_opts.sharing = atoi(env);

	/*
	 * We have been seeing some unexpected behavior from malloc when
	 * heap shrinking is enabled, so heap shrinking is disabled by
	 * default.
	 *
	 * If malloc has been called successfully before setup_morecore,
	 * glibc will notice a gap between the previous top-of-heap and
	 * the new top-of-heap when it calls hugetlbfs_morecore.  It treats
	 * this as a "foreign sbrk."  Unfortunately, the "foreign sbrk"
	 * handling code will then immediately try to free the memory
	 * allocated by hugetlbfs_morecore!
	 *
	 * This behavior has been reported to the ptmalloc2 maintainer,
	 * along with a patch to correct the behavior.
	 */
	env = getenv("HUGETLB_MORECORE_SHRINK");
	if (env && strcasecmp(env, "yes") == 0)
		__hugetlb_opts.shrink_ok = true;

	/* Determine if shmget() calls should be overridden */
	env = getenv("HUGETLB_SHM");
	if (env && !strcasecmp(env, "yes"))
		__hugetlb_opts.shm_enabled = true;

	/* Determine if all reservations should be avoided */
	env = getenv("HUGETLB_NO_RESERVE");
	if (env && !strcasecmp(env, "yes"))
		__hugetlb_opts.no_reserve = true;
}

/* Record the kernel's default huge page size as the first known size. */
void hugetlbfs_setup_kernel_page_size()
{
	long page_size = kernel_default_hugepage_size();

	if (page_size <= 0) {
		WARNING("Unable to find default kernel huge page size\n");
		return;
	}

	INFO("Found pagesize %ld kB\n", page_size / 1024);
	hpage_sizes[0].pagesize = page_size;
	nr_hpage_sizes = 1;
}

void hugetlbfs_check_priv_resv()
{
	/*
	 * If the kernel supports MAP_PRIVATE reservations, we can skip
	 * prefaulting the huge pages we allocate since the kernel
	 * guarantees them.  This can help NUMA performance quite a bit.
	 */
	if (hugetlbfs_test_feature(HUGETLB_FEATURE_PRIVATE_RESV) > 0) {
		INFO("Kernel has MAP_PRIVATE reservations. Disabling "
			"heap prefaulting.\n");
		__hugetlbfs_prefault = false;
	}
}

void hugetlbfs_check_safe_noreserve()
{
	/*
	 * Some kernels will trigger an OOM if MAP_NORESERVE is used and
	 * a huge page allocation fails.  This is unfortunate so limit
	 * the user of NORESERVE where necessary
	 */
	if (__hugetlb_opts.no_reserve &&
			hugetlbfs_test_feature(HUGETLB_FEATURE_SAFE_NORESERVE) <= 0) {
		INFO("Kernel is not safe for MAP_NORESERVE. Forcing "
			"use of reservations.\n");
		__hugetlb_opts.no_reserve = false;
	}
}

void hugetlbfs_check_map_hugetlb()
{
/*
 * FIXME: MAP_HUGETLB has not been picked up by glibc so even though the
 * kernel may support it, without the userspace mmap flag it cannot be
 * used.  This ifdef should be removed when the MAP_HUGETLB flag makes it
 * into glibc.
 */
#ifdef MAP_HUGETLB
	/*
	 * Kernels after 2.6.32 support mmaping pseudo-anonymous regions
	 * backed by huge pages, use this feature for huge pages we
	 * don't intend to share.
	 */
	if (hugetlbfs_test_feature(HUGETLB_FEATURE_MAP_HUGETLB) > 0) {
		INFO("Kernel supports MAP_HUGETLB\n");
		__hugetlb_opts.map_hugetlb = true;
	}
#endif
}

/*
 * Pool counters are typically exposed in sysfs in modern kernels, the
 * counters for the default page size are exposed in procfs in all kernels
 * supporting hugepages.  Given a specific counter (e.g. HUGEPAGES_RSVD)
 * and a page size return both a filename and an optional tag to locate
 * and extract this counter.
 */
static int select_pool_counter(unsigned int counter, unsigned long pagesize,
				char *filename, char **key)
{
	long default_size;
	char *meminfo_key;
	char *sysfs_file;

	if (counter >= HUGEPAGES_MAX_COUNTERS) {
		ERROR("Invalid counter specified\n");
		return -1;
	}

	meminfo_key = hugetlb_counter_info[counter].meminfo_key;
	sysfs_file = hugetlb_counter_info[counter].sysfs_file;
	if (key)
		*key = NULL;

	/*
	 * Get the meminfo page size.
	 * This could be made more efficient if utility functions were shared
	 * between libhugetlbfs and the test suite.  For now we will just
	 * read /proc/meminfo.
	 */
	default_size = kernel_default_hugepage_size();
	if (default_size < 0) {
		ERROR("Cannot determine the default page size\n");
		return -1;
	}

	/* If the user is dealing in the default page size, we can use /proc */
	if (pagesize == default_size) {
		if (meminfo_key && key) {
			strcpy(filename, MEMINFO);
			*key = meminfo_key;
		} else
			sprintf(filename, PROC_HUGEPAGES_DIR "%s", sysfs_file);
	} else /* Use the sysfs interface */
		sprintf(filename, SYSFS_HUGEPAGES_DIR "hugepages-%lukB/%s",
			pagesize / 1024, sysfs_file);

	return 0;
}

/* Map a huge page size in bytes to its slot in hpage_sizes[], -1 if absent. */
static int hpage_size_to_index(unsigned long size)
{
	int i;

	for (i = 0; i < nr_hpage_sizes; i++)
		if (hpage_sizes[i].pagesize == size)
			return i;

	return -1;
}

/* Select which entry in hpage_sizes[] acts as the default page size. */
void probe_default_hpage_size(void)
{
	long size;
	int index;
	int default_overrided;

	if (nr_hpage_sizes == 0) {
		INFO("No configured huge page sizes\n");
		hpage_sizes_default_idx = -1;
		return;
	}

	/*
	 * Check if the user specified a default size, otherwise use the
	 * system default size as reported by /proc/meminfo.
	 */
	default_overrided = (__hugetlb_opts.def_page_size &&
				strlen(__hugetlb_opts.def_page_size) > 0);
	if (default_overrided)
		size = parse_page_size(__hugetlb_opts.def_page_size);
	else {
		size = kernel_default_hugepage_size();
	}

	if (size >= 0) {
		index = hpage_size_to_index(size);
		if (index >= 0)
			hpage_sizes_default_idx = index;
		else {
			/*
			 * If the user specified HUGETLB_DEFAULT_PAGE_SIZE,
			 * then this situation will alter semantics and they
			 * should receive a WARNING.  Otherwise, this detail
			 * is purely informational in nature.
			 */
			char msg[] = "No mount point found for default huge "
				"page size. Using first available mount "
				"point.\n";
			if (default_overrided)
				WARNING("%s", msg);
			else
				INFO("%s", msg);
			hpage_sizes_default_idx = 0;
		}
	} else {
		ERROR("Unable to determine default huge page size\n");
		hpage_sizes_default_idx = -1;
	}
}

/* Register a hugetlbfs mount point for whatever page size it serves.
 * user_mount distinguishes HUGETLB_PATH entries (warn on duplicates)
 * from mounts discovered by scanning. */
static void add_hugetlbfs_mount(char *path, int user_mount)
{
	int idx;
	long size;

	if (strlen(path) > PATH_MAX)
		return;

	if (!hugetlbfs_test_path(path)) {
		WARNING("%s is not a hugetlbfs mount point, ignoring\n", path);
		return;
	}

	size = hugetlbfs_test_pagesize(path);
	if (size < 0) {
		WARNING("Unable to detect page size for path %s\n", path);
		return;
	}

	idx = hpage_size_to_index(size);
	if (idx < 0) {
		if (nr_hpage_sizes >= MAX_HPAGE_SIZES) {
			WARNING("Maximum number of huge page sizes exceeded, "
				"ignoring %lukB page size\n", size);
			return;
		}

		idx = nr_hpage_sizes;
		hpage_sizes[nr_hpage_sizes++].pagesize = size;
	}

	/* Only the first mount seen for a given size is kept. */
	if (strlen(hpage_sizes[idx].mount)) {
		if (user_mount)
			WARNING("Mount point already defined for size %li, "
				"ignoring %s\n", size, path);
		return;
	}

	strcpy(hpage_sizes[idx].mount, path);
}

void debug_show_page_sizes(void)
{
	int i;

	INFO("Detected page sizes:\n");
	for (i = 0; i < nr_hpage_sizes; i++)
		INFO("   Size: %li kB %s  Mount: %s\n",
			hpage_sizes[i].pagesize / 1024,
			i == hpage_sizes_default_idx ? "(default)" : "",
			hpage_sizes[i].mount);
}

#define LINE_MAXLEN	2048
/* Scan /proc/mounts (or /etc/mtab) for accessible hugetlbfs mounts and
 * register each via add_hugetlbfs_mount(). */
static void find_mounts(void)
{
	int fd;
	char path[PATH_MAX+1];
	char line[LINE_MAXLEN + 1];
	char *eol;
	char *match;
	char *end;
	int bytes;
	off_t offset;

	fd = open("/proc/mounts", O_RDONLY);
	if (fd < 0) {
		fd = open("/etc/mtab", O_RDONLY);
		if (fd < 0) {
			ERROR("Couldn't open /proc/mounts or /etc/mtab (%s)\n",
				strerror(errno));
			return;
		}
	}

	while ((bytes = read(fd, line, LINE_MAXLEN)) > 0) {
		line[LINE_MAXLEN] = '\0';
		eol = strchr(line, '\n');
		if (!eol) {
			ERROR("Line too long when parsing mounts\n");
			break;
		}

		/*
		 * Truncate the string to just one line and reset the file
		 * to begin reading at the start of the next line.
		 */
		*eol = '\0';
		offset = bytes - (eol + 1 - line);
		lseek(fd, -offset, SEEK_CUR);

		/* Match only hugetlbfs filesystems. */
		match = strstr(line, " hugetlbfs ");
		if (match) {
			match = strchr(line, '/');
			if (!match)
				continue;
			end = strchr(match, ' ');
			if (!end)
				continue;
			strncpy(path, match, end - match);
			path[end - match] = '\0';
			/* Only mounts we can actually use are interesting. */
			if ((hugetlbfs_test_path(path) == 1) &&
				!(access(path, R_OK | W_OK | X_OK)))
				add_hugetlbfs_mount(path, 0);
		}
	}
	close(fd);
}

void setup_mounts(void)
{
	int do_scan = 1;

	/* If HUGETLB_PATH is set, only add mounts specified there */
	while (__hugetlb_opts.path) {
		char path[PATH_MAX + 1];
		char *next = strchrnul(__hugetlb_opts.path, ':');

		do_scan = 0;
		if (next - __hugetlb_opts.path > PATH_MAX) {
			ERROR("Path too long in HUGETLB_PATH -- "
				"ignoring environment\n");
			break;
		}

		strncpy(path, __hugetlb_opts.path, next - __hugetlb_opts.path);
		path[next - __hugetlb_opts.path] = '\0';
		add_hugetlbfs_mount(path, 1);

		/* skip the ':' token */
		__hugetlb_opts.path = *next == '\0' ? NULL : next + 1;
	}

	/* Then probe all mounted filesystems */
	if (do_scan)
		find_mounts();
}

/* Fill *pool with the parameters of the pool for 'size' byte pages.
 * Returns 1 when the pool looks valid, 0 otherwise. */
int get_pool_size(long size, struct hpage_pool *pool)
{
	long nr_over = 0;
	long nr_used = 0;
	long nr_surp = 0;
	long nr_resv = 0;
	long nr_static = 0;

	long it_used = -1;
	long it_surp = -1;
	long it_resv = -1;

	/*
	 * Pick up those values which are basically stable with respect to
	 * the admin; ie. only changed by them.
	 *
	 * nr_over may be negative if this kernel does not support overcommit
	 * in that case we will consider it always 0 and max will track min
	 * always.
	 */
	nr_over = get_huge_page_counter(size, HUGEPAGES_OC);
	if (nr_over < 0)
		nr_over = 0;

	/* Sample the volatile values until they are stable. */
	while (nr_used != it_used || nr_surp != it_surp || nr_resv != it_resv) {
		nr_used = it_used;
		nr_surp = it_surp;
		nr_resv = it_resv;

		it_used = get_huge_page_counter(size, HUGEPAGES_TOTAL);
		it_surp = get_huge_page_counter(size, HUGEPAGES_SURP);
		it_resv = get_huge_page_counter(size, HUGEPAGES_RSVD);
	}
	if (nr_surp < 0)
		nr_surp = 0;
	if (nr_resv < 0)
		nr_resv = 0;

	/* Static pages are those not allocated from the overcommit pool. */
	nr_static = nr_used - nr_surp;
	if (nr_static >= 0) {
		DEBUG("pagesize<%ld> min<%ld> max<%ld> "
			"in-use<%ld>\n",
			size, nr_static, nr_static + nr_over,
			nr_used);
		pool->pagesize = size;
		pool->minimum = nr_static;
		pool->maximum = nr_static + nr_over;
		pool->size = nr_used;
		pool->is_default = 0;
		return 1;
	}

	return 0;
}

/* Enumerate all huge page pools into pools[], the default size first.
 * Returns the number filled, or -1 if pools[] was too small. */
int hpool_sizes(struct hpage_pool *pools, int pcnt)
{
	long default_size;
	int which = 0;
	DIR *dir;
	struct dirent *entry;

	default_size = kernel_default_hugepage_size();
	if (default_size >= 0 && which < pcnt)
		if (get_pool_size(default_size, &pools[which])) {
			pools[which].is_default = 1;
			which++;
		}

	dir = opendir(SYSFS_HUGEPAGES_DIR);
	if (dir) {
		while ((entry = readdir(dir))) {
			char *name = entry->d_name;
			long size;

			DEBUG("parsing<%s>\n", name);
			if (strncmp(name, "hugepages-", 10) != 0)
				continue;
			name += 10;

			/* Directory names encode the size in kB. */
			size = size_to_smaller_unit(atol(name));
			if (size < 0 || size == default_size)
				continue;

			if (get_pool_size(size, &pools[which]))
				which++;
		}
		closedir(dir);
	}

	return (which < pcnt) ? which : -1;
}

int arch_has_slice_support(void)
{
#ifdef __powerpc64__
	char mmu_type[16];
	FILE *fp;

	fp = popen("cat /proc/cpuinfo | grep MMU | awk '{ print $3}'", "r");
	if (!fp || fscanf(fp, "%s", mmu_type) < 0) {
		ERROR("Failed to determine MMU type\n");
		abort();
	}

	pclose(fp);
	return strcmp(mmu_type, "Hash") == 0;
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return 1;
#else
	return 0;
#endif
}

/*
 * If we have a default page size then we support hugepages.
*/ int kernel_has_hugepages(void) { long default_size = kernel_default_hugepage_size(); if (default_size < 0) return 0; return 1; } /* * If we can find the default page size, and if we can find an overcommit * control for it then the kernel must support overcommit. */ int kernel_has_overcommit(void) { long default_size = kernel_default_hugepage_size(); if (default_size < 0) return 0; if (get_huge_page_counter(default_size, HUGEPAGES_OC) < 0) return 0; return 1; } /********************************************************************/ /* Library user visible functions */ /********************************************************************/ /* * NOTE: This function uses data that is initialized by * setup_mounts() which is called during libhugetlbfs initialization. * * returns: * on success, size of a huge page in number of bytes * on failure, -1 * errno set to ENOSYS if huge pages are not supported * errno set to EOVERFLOW if huge page size would overflow return type */ long gethugepagesize(void) { long hpage_size; /* Are huge pages available and have they been initialized? */ if (hpage_sizes_default_idx == -1) { errno = hugepagesize_errno = ENOSYS; return -1; } errno = 0; hpage_size = hpage_sizes[hpage_sizes_default_idx].pagesize; return hpage_size; } int gethugepagesizes(long pagesizes[], int n_elem) { long default_size; DIR *sysfs; struct dirent *ent; int nr_sizes = 0; if (n_elem < 0) { errno = EINVAL; return -1; } if (n_elem > 0 && pagesizes == NULL) { errno = EINVAL; return -1; } errno = 0; /* Get the system default size. */ default_size = kernel_default_hugepage_size(); if (default_size < 0) return 0; if (pagesizes && (nr_sizes == n_elem)) return nr_sizes; if (pagesizes) pagesizes[nr_sizes] = default_size; nr_sizes++; /* * Scan sysfs to look for other sizes. * Non-existing dir is not an error, we got one size from /proc/meminfo. 
*/ sysfs = opendir(SYSFS_HUGEPAGES_DIR); if (!sysfs) { if (errno == ENOENT) { errno = 0; return nr_sizes; } else return -1; } while ((ent = readdir(sysfs))) { long size; if (strncmp(ent->d_name, "hugepages-", 10)) continue; size = strtol(ent->d_name + 10, NULL, 10); if (size == LONG_MIN || size == LONG_MAX) continue; size = size_to_smaller_unit(size); if (size < 0 || size == default_size) continue; if (pagesizes && (nr_sizes == n_elem)) return nr_sizes; if (pagesizes) pagesizes[nr_sizes] = size; nr_sizes++; } closedir(sysfs); return nr_sizes; } int getpagesizes(long pagesizes[], int n_elem) { int ret; if (n_elem < 0 || (n_elem > 0 && pagesizes == NULL)) { errno = EINVAL; return -1; } /* Requests for sizing, we need one more slot than gethugepagesizes. */ if (pagesizes == NULL && n_elem == 0) { ret = gethugepagesizes(pagesizes, n_elem); } else { /* Install the base page size. */ if (pagesizes && n_elem == 0) return 0; if (pagesizes) pagesizes[0] = sysconf(_SC_PAGESIZE); ret = gethugepagesizes(pagesizes + 1, n_elem - 1); } if (ret < 0) return ret; return ret + 1; } int hugetlbfs_test_path(const char *mount) { struct statfs64 sb; int err; /* Bugs in the 32<->64 translation code in pre-2.6.15 kernels * mean that plain statfs() returns bogus errors on hugetlbfs * filesystems. Use statfs64() to work around. 
*/ err = statfs64(mount, &sb); if (err) return -1; return (sb.f_type == HUGETLBFS_MAGIC); } /* Return the page size for the given mount point in bytes */ long hugetlbfs_test_pagesize(const char *mount) { struct statfs64 sb; int err; err = statfs64(mount, &sb); if (err) return -1; if ((sb.f_bsize <= 0) || (sb.f_bsize > LONG_MAX)) return -1; return sb.f_bsize; } const char *hugetlbfs_find_path_for_size(long page_size) { char *path; int idx; idx = hpage_size_to_index(page_size); if (idx >= 0) { path = hpage_sizes[idx].mount; if (strlen(path)) return path; } return NULL; } const char *hugetlbfs_find_path(void) { long hpage_size = gethugepagesize(); if (hpage_size > 0) return hugetlbfs_find_path_for_size(hpage_size); else return NULL; } int hugetlbfs_unlinked_fd_for_size(long page_size) { const char *path; char name[PATH_MAX+1]; int fd; path = hugetlbfs_find_path_for_size(page_size); if (!path) return -1; name[sizeof(name)-1] = '\0'; strcpy(name, path); strncat(name, "/libhugetlbfs.tmp.XXXXXX", sizeof(name)-1); /* FIXME: deal with overflows */ fd = mkstemp64(name); if (fd < 0) { ERROR("mkstemp() failed: %s\n", strerror(errno)); return -1; } unlink(name); return fd; } int hugetlbfs_unlinked_fd(void) { long hpage_size = gethugepagesize(); if (hpage_size > 0) return hugetlbfs_unlinked_fd_for_size(hpage_size); else return -1; } #define IOV_LEN 64 int hugetlbfs_prefault(void *addr, size_t length) { size_t offset; struct iovec iov[IOV_LEN]; int ret; int i; int fd; if (!__hugetlbfs_prefault) return 0; /* * The NUMA users of libhugetlbfs' malloc feature are * expected to use the numactl program to specify an * appropriate policy for hugepage allocation * * Use readv(2) to instantiate the hugepages unless HUGETLB_NO_PREFAULT * is set. If we instead returned a hugepage mapping with insufficient * hugepages, the VM system would kill the process when the * process tried to access the missing memory. 
* * The value of this environment variable is read during library * initialisation and sets __hugetlbfs_prefault accordingly. If * prefaulting is enabled and we can't get all that were requested, * -ENOMEM is returned. The caller is expected to release the entire * mapping and optionally it may recover by mapping base pages instead. */ fd = open("/dev/zero", O_RDONLY); if (fd < 0) { ERROR("Failed to open /dev/zero for reading\n"); return -ENOMEM; } for (offset = 0; offset < length; ) { for (i = 0; i < IOV_LEN && offset < length; i++) { iov[i].iov_base = addr + offset; iov[i].iov_len = 1; offset += gethugepagesize(); } ret = readv(fd, iov, i); if (ret != i) { DEBUG("Got %d of %d requested; err=%d\n", ret, i, ret < 0 ? errno : 0); WARNING("Failed to reserve %ld huge pages " "for new region\n", length / gethugepagesize()); close(fd); return -ENOMEM; } } close(fd); return 0; } long get_huge_page_counter(long pagesize, unsigned int counter) { char file[PATH_MAX+1]; char *key; if (select_pool_counter(counter, pagesize, file, &key)) return -1; if (access(file, O_RDONLY)) return -1; return file_read_ulong(file, key); } int set_huge_page_counter(long pagesize, unsigned int counter, unsigned long val) { char file[PATH_MAX+1]; if (select_pool_counter(counter, pagesize, file, NULL)) return -1; return file_write_ulong(file, val); } int set_nr_hugepages(long pagesize, unsigned long val) { return set_huge_page_counter(pagesize, HUGEPAGES_TOTAL, val); } int set_nr_overcommit_hugepages(long pagesize, unsigned long val) { DEBUG("setting HUGEPAGES_OC to %ld\n", val); return set_huge_page_counter(pagesize, HUGEPAGES_OC, val); } long read_nr_overcommit(long page_size) { if (!kernel_has_overcommit()) return -1; return get_huge_page_counter(page_size, HUGEPAGES_OC); } void restore_overcommit_pages(long page_size, long oc_pool) { if (!kernel_has_overcommit()) return; set_nr_overcommit_hugepages(page_size, oc_pool); } /********************************************************************/ 
/* Library user visible DIAGNOSES/DEBUGGING ONLY functions */ /********************************************************************/ #define MAPS_BUF_SZ 4096 long dump_proc_pid_maps() { FILE *f; char line[MAPS_BUF_SZ]; size_t ret; f = fopen("/proc/self/maps", "r"); if (!f) { ERROR("Failed to open /proc/self/maps\n"); return -1; } while (1) { ret = fread(line, sizeof(char), MAPS_BUF_SZ, f); if (ret < 0) { ERROR("Failed to read /proc/self/maps\n"); return -1; } if (ret == 0) break; ret = fwrite(line, sizeof(char), ret, stderr); if (ret < 0) { ERROR("Failed to write /proc/self/maps to stderr\n"); return -1; } } fclose(f); return 0; } long read_meminfo(const char *tag) { return file_read_ulong(MEMINFO, tag); }