/*
 * libhugetlbfs - Easy use of Linux hugepages
 * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>
#include <dlfcn.h>
#include <string.h>
#include <fcntl.h>

#include "hugetlbfs.h"

#include "libhugetlbfs_internal.h"

static int heap_fd;	/* hugetlbfs file backing the heap, or -1 if unused */

static void *heapbase;	/* bottom of the hugepage heap */
static void *heaptop;	/* current heap top as reported to glibc */
static long mapsize;	/* bytes currently mapped (or sbrk'd) for the heap */
static long hpage_size;	/* huge page size backing the heap */

static long hugetlbfs_next_addr(long addr)
{
#if defined(__powerpc64__)
	return ALIGN(addr, 1L << SLICE_HIGH_SHIFT);
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return ALIGN(addr, 1L << SLICE_LOW_SHIFT);
#elif defined(__ia64__)
	if (addr < (1UL << SLICE_HIGH_SHIFT))
		return ALIGN(addr, 1UL << SLICE_HIGH_SHIFT);
	else
		return ALIGN(addr, hpage_size);
#else
	return ALIGN(addr, hpage_size);
#endif
}
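
/*
 * For reference: ALIGN() comes from libhugetlbfs_internal.h; a minimal
 * sketch of the assumed round-up-to-boundary definition is
 *
 *	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
 *
 * so with 2MB huge pages, ALIGN(0x0804a000, 0x200000) == 0x08200000.
 */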

/*
 * Our plan is to ask for pages 'roughly' at the BASE.  We expect and
 * require the kernel to offer us sequential pages from wherever it
 * first gave us a page.  If it does not do so, we return the page and
 * pretend there are none; this covers us for the case where another
 * map is in the way.  This is required because 'morecore' must have
 * 'sbrk' semantics, ie. return sequential, contiguous memory blocks.
 * Luckily, if we do error out, malloc will happily fall back to small
 * pages and use mmap to get them.  Hurrah.
 */
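/*
 * Illustrative only: the contract glibc expects from __morecore is
 * sbrk-like, i.e. for successive calls
 *
 *	void *a = hugetlbfs_morecore(len1);	returns old heaptop
 *	void *b = hugetlbfs_morecore(len2);	must equal a + len1
 *
 * and a zero increment simply reports the current top.
 */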
static void *hugetlbfs_morecore(ptrdiff_t increment)
{
	int ret;
	void *p;
	long delta;
	int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0;
	int mmap_hugetlb = 0;
	int using_default_pagesize =
		(hpage_size == kernel_default_hugepage_size());

	INFO("hugetlbfs_morecore(%ld) = ...\n", (long)increment);

	/*
	 * how much to grow the heap by =
	 * 	(size of heap) + malloc request - mmap'd space
	 */
	delta = (heaptop-heapbase) + increment - mapsize;

	INFO("heapbase = %p, heaptop = %p, mapsize = %lx, delta=%ld\n",
	      heapbase, heaptop, mapsize, delta);

	/* align to multiple of hugepagesize. */
	delta = ALIGN(delta, hpage_size);

#ifdef MAP_HUGETLB
	mmap_hugetlb = MAP_HUGETLB;
#endif
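
	/*
	 * MAP_HUGETLB (Linux 2.6.32+) allows anonymous huge page mappings
	 * with no hugetlbfs file at all; on older kernels we always take
	 * the file-backed path below.
	 */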

	if (delta > 0) {
		/* growing the heap */

		INFO("Attempting to map %ld bytes\n", delta);

		/* map in (extend) more of the heap at the end of our last
		 * map: anonymously via MAP_HUGETLB when possible, otherwise
		 * from the hugetlbfs file (heap_fd is -1 in the anonymous
		 * case and is ignored by the kernel) */
		if (__hugetlb_opts.map_hugetlb && using_default_pagesize)
			p = mmap(heapbase + mapsize, delta, PROT_READ|PROT_WRITE,
				 mmap_hugetlb|MAP_ANONYMOUS|MAP_PRIVATE|mmap_reserve,
				 heap_fd, mapsize);
		else
			p = mmap(heapbase + mapsize, delta, PROT_READ|PROT_WRITE,
				 MAP_PRIVATE|mmap_reserve, heap_fd, mapsize);

		if (p == MAP_FAILED) {
			WARNING("New heap segment map at %p failed: %s\n",
				heapbase+mapsize, strerror(errno));
			return NULL;
		}

		/* if this is the first map */
		if (!mapsize) {
			if (heapbase && (heapbase != p)) {
				WARNING("Heap originates at %p instead of %p\n",
					p, heapbase);
				if (__hugetlbfs_debug)
					dump_proc_pid_maps();
			}
			/* then setup the heap variables */
			heapbase = heaptop = p;
		} else if (p != (heapbase + mapsize)) {
			/* Couldn't get the mapping where we wanted */
			munmap(p, delta);
			WARNING("New heap segment mapped at %p instead of %p\n",
			      p, heapbase + mapsize);
			if (__hugetlbfs_debug)
				dump_proc_pid_maps();
			return NULL;
		}

		/* Fault the region to ensure accesses succeed */
		if (hugetlbfs_prefault(p, delta) != 0) {
			munmap(p, delta);
			return NULL;
		}

		/* we now have mmap'd further */
		mapsize += delta;
	} else if (delta < 0) {
		/* shrinking the heap */

		if (!__hugetlb_opts.shrink_ok) {
			/* shouldn't ever get here */
			WARNING("Heap shrinking is turned off\n");
			return NULL;
		}

		if (!mapsize) {
			WARNING("Can't shrink empty heap!\n");
			return NULL;
		}

		/*
		 * If we are forced to change the heapaddr from the
		 * original brk() value we have violated brk semantics
		 * (which we are not supposed to do).  This shouldn't
		 * pose a problem until glibc tries to trim the heap to an
		 * address lower than what we aligned heapaddr to.  At that
		 * point the alignment "gap" causes heap corruption.
		 * So we don't allow the heap to shrink below heapbase.
		 */
		if (mapsize + delta < 0) {  /* remember: delta is negative */
			WARNING("Unable to shrink heap below %p\n", heapbase);
			/* unmap just what is currently mapped */
			delta = -mapsize;
			/* we need heaptop + increment == heapbase, so: */
			increment = heapbase - heaptop;
		}
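		/*
		 * Worked example (hypothetical numbers): with 2MB huge
		 * pages, mapsize == 4MB and delta == -6MB, delta is
		 * clamped to -4MB so we unmap only what is mapped, and
		 * increment is fixed up so heaptop lands back on heapbase.
		 */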
		INFO("Attempting to unmap %ld bytes @ %p\n", -delta,
			heapbase + mapsize + delta);
		ret = munmap(heapbase + mapsize + delta, -delta);
		if (ret) {
			WARNING("Unmapping failed while shrinking heap: "
				"%s\n", strerror(errno));
		} else {
			mapsize += delta;
			/*
			 * glibc assumes by default that memory newly
			 * allocated via morecore() is zeroed.  Re-zeroing
			 * on every allocation would be wasteful, so we only
			 * shrink the top by whole pages.
			 */
			increment = heapbase - heaptop + mapsize;

			if (!__hugetlb_opts.map_hugetlb && !using_default_pagesize) {

				/* Now shrink the hugetlbfs file. */
				ret = ftruncate(heap_fd, mapsize);
				if (ret) {
					WARNING("Could not truncate hugetlbfs file to "
						"shrink heap: %s\n", strerror(errno));
				}
			}
		}

	} else if (increment < 0) {
		/* Don't shrink by less than a page to avoid having to zero
		 * the memory.  There is no point in lying to glibc since
		 * we're not freeing any memory.
		 */
		increment = 0;
	}

	/* heap is contiguous */
	p = heaptop;
	/* and we now have added this much more space to the heap */
	heaptop = heaptop + increment;

	INFO("... = %p\n", p);
	return p;
}

static void *thp_morecore(ptrdiff_t increment)
{
	void *p;
	long delta;

	INFO("thp_morecore(%ld) = ...\n", (long)increment);

	delta = (heaptop - heapbase) + increment - mapsize;
	delta = ALIGN(delta, hpage_size);

	if (delta > 0) {
		/*
		 * The first time we expand the mapping we need to account
		 * for the initial heap mapping not necessarily being huge
		 * page aligned.
		 */
		if (!mapsize)
			delta = hugetlbfs_next_addr((long)heapbase + delta) -
					(unsigned long)heapbase;
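		/*
		 * E.g. (hypothetical addresses, 2MB pages): heapbase ==
		 * 0x0804a000 and delta == 0x200000 give
		 * hugetlbfs_next_addr(0x0824a000) == 0x08400000, so delta
		 * becomes 0x3b6000 and heapbase + delta is huge page
		 * aligned for every later expansion.
		 */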

		INFO("Adding %ld bytes to heap\n", delta);

		p = sbrk(delta);
		if (p == (void *)-1) {
			WARNING("sbrk returned ENOMEM\n");
			return NULL;
		}

		if (!mapsize) {
			if (heapbase && (heapbase != p)) {
				WARNING("Heap was expected at %p instead of %p, "
					"heap has been modified by someone else!\n",
					heapbase, p);
				if (__hugetlbfs_debug)
					dump_proc_pid_maps();
			}
			heapbase = heaptop = p;
		}

		mapsize += delta;
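		/*
		 * Where available (MADV_HUGEPAGE, Linux 2.6.38+), hint that
		 * the kernel should back the new range with transparent
		 * huge pages.
		 */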
#ifdef MADV_HUGEPAGE
		madvise(p, delta, MADV_HUGEPAGE);
#endif
	} else if (delta < 0) {
		/* shrinking the heap */
		if (!mapsize) {
			WARNING("Can't shrink an empty heap\n");
			return NULL;
		}

		INFO("Attempting to shrink heap by %ld bytes with sbrk\n",
			-delta);
		p = sbrk(delta);
		if (p == (void *)-1) {
			WARNING("Unable to shrink heap\n");
			return heaptop;
		}

		mapsize += delta;
	}

	p = heaptop;
	heaptop += increment;
	INFO("... = %p\n", p);
	return p;
}

void hugetlbfs_setup_morecore(void)
{
	char *ep;
	unsigned long heapaddr;

	if (!__hugetlb_opts.morecore)
		return;
	if (strcasecmp(__hugetlb_opts.morecore, "no") == 0) {
		INFO("HUGETLB_MORECORE=%s, not setting up morecore\n",
						__hugetlb_opts.morecore);
		return;
	}

	/*
	 * Determine the page size that will be used for the heap.
	 * This can be set explicitly by setting HUGETLB_MORECORE to a valid
	 * page size string or by setting HUGETLB_DEFAULT_PAGE_SIZE.
	 */
	if (strncasecmp(__hugetlb_opts.morecore, "y", 1) == 0)
		hpage_size = gethugepagesize();
	else if (__hugetlb_opts.thp_morecore)
		hpage_size = kernel_default_hugepage_size();
	else
		hpage_size = parse_page_size(__hugetlb_opts.morecore);
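	/*
	 * For example (values per the libhugetlbfs HOWTO):
	 * HUGETLB_MORECORE=yes uses the default huge page size,
	 * HUGETLB_MORECORE=2MB requests an explicit size, and
	 * HUGETLB_MORECORE=thp grows the heap with sbrk() backed by
	 * transparent huge pages.
	 */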

	if (hpage_size <= 0) {
		if (errno == ENOSYS)
			WARNING("Hugepages unavailable\n");
		else if (errno == EOVERFLOW || errno == ERANGE)
			WARNING("Hugepage size too large\n");
		else if (errno == EINVAL)
			WARNING("Invalid huge page size\n");
		else
			WARNING("Hugepage size (%s)\n", strerror(errno));
		return;
	}

	/*
	 * We won't need an fd for the heap mmaps if we are using MAP_HUGETLB
	 * or we are depending on transparent huge pages
	 */
	if (__hugetlb_opts.thp_morecore || (__hugetlb_opts.map_hugetlb &&
			hpage_size == kernel_default_hugepage_size())) {
		heap_fd = -1;
	} else {
		if (!hugetlbfs_find_path_for_size(hpage_size)) {
			WARNING("Hugepage size %li unavailable", hpage_size);
			return;
		}

		heap_fd = hugetlbfs_unlinked_fd_for_size(hpage_size);
		if (heap_fd < 0) {
			WARNING("Couldn't open hugetlbfs file for morecore\n");
			return;
		}
	}

	/*
	 * THP morecore uses sbrk to allocate more heap space, counting on the
	 * kernel to back the area with THP.  So setting heapbase is
	 * meaningless if thp_morecore is used.
	 */
	if (!__hugetlb_opts.thp_morecore && __hugetlb_opts.heapbase) {
		heapaddr = strtoul(__hugetlb_opts.heapbase, &ep, 16);
		if (*ep != '\0') {
			WARNING("Can't parse HUGETLB_MORECORE_HEAPBASE: %s\n",
			      __hugetlb_opts.heapbase);
			return;
		}
	} else {
		heapaddr = (unsigned long)sbrk(0);
		if (!__hugetlb_opts.thp_morecore)
			heapaddr = hugetlbfs_next_addr(heapaddr);
	}

	INFO("setup_morecore(): heapaddr = 0x%lx\n", heapaddr);

	heaptop = heapbase = (void *)heapaddr;
	if (__hugetlb_opts.thp_morecore)
		__morecore = &thp_morecore;
	else
		__morecore = &hugetlbfs_morecore;

	/* Set some allocator options more appropriate for hugepages */

	if (__hugetlb_opts.shrink_ok)
		mallopt(M_TRIM_THRESHOLD, hpage_size + hpage_size / 2);
	else
		mallopt(M_TRIM_THRESHOLD, -1);
	mallopt(M_TOP_PAD, hpage_size / 2);
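	/*
	 * With 2MB huge pages, for example, that means glibc trims only
	 * when more than 3MB is free at the top of the heap and pads each
	 * sbrk-style extension request by 1MB.
	 */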
	/* We always want to use our morecore, not ordinary mmap().
	 * This doesn't appear to prohibit malloc() from falling back
	 * to mmap() if we run out of hugepages. */
	mallopt(M_MMAP_MAX, 0);
}
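
/*
 * Typical use (a sketch, following the libhugetlbfs HOWTO): preload the
 * library and enable the morecore hook through the environment, e.g.
 *
 *	$ LD_PRELOAD=libhugetlbfs.so HUGETLB_MORECORE=yes ./myapp
 *
 * after which malloc()'s main arena grows through hugetlbfs_morecore().
 */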