/*
 * libhugetlbfs - Easy use of Linux hugepages
 * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <malloc.h>

#include "hugetlbfs.h"
#include "libhugetlbfs_internal.h"

static int heap_fd;

static void *heapbase;
static void *heaptop;
static long mapsize;
static long hpage_size;

static long hugetlbfs_next_addr(long addr)
{
#if defined(__powerpc64__)
	return ALIGN(addr, 1L << SLICE_HIGH_SHIFT);
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return ALIGN(addr, 1L << SLICE_LOW_SHIFT);
#elif defined(__ia64__)
	if (addr < (1UL << SLICE_HIGH_SHIFT))
		return ALIGN(addr, 1UL << SLICE_HIGH_SHIFT);
	else
		return ALIGN(addr, hpage_size);
#else
	return ALIGN(addr, hpage_size);
#endif
}
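/*
 * For illustration (not part of the original source): ALIGN() rounds a
 * value up to the next multiple of a power-of-two boundary,
 * conventionally computed as (addr + align - 1) & ~(align - 1).  For
 * example, with a 16MB huge page size:
 *
 *	ALIGN(0x10000001, 0x1000000)
 *	  = (0x10000001 + 0xffffff) & ~0xffffff
 *	  = 0x11000000
 *
 * so hugetlbfs_next_addr() pushes a candidate heap address up to the
 * next huge page (or, on powerpc/ia64, segment/region) boundary before
 * any huge page mapping is attempted.
 */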
/*
 * Our plan is to ask for pages 'roughly' at the BASE.  We expect and
 * require the kernel to offer us sequential pages from wherever it
 * first gave us a page.  If it does not do so, we return the page and
 * pretend there are none; this covers us for the case where another
 * map is in the way.  This is required because 'morecore' must have
 * 'sbrk' semantics, ie. return sequential, contiguous memory blocks.
 * Luckily, if it does not do so and we error out, malloc will happily
 * go back to small pages and use mmap to get them.  Hurrah.
 */
static void *hugetlbfs_morecore(ptrdiff_t increment)
{
	int ret;
	void *p;
	long delta;
	int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0;
	int mmap_hugetlb = 0;
	int using_default_pagesize =
		(hpage_size == kernel_default_hugepage_size());

	INFO("hugetlbfs_morecore(%ld) = ...\n", (long)increment);

	/*
	 * how much to grow the heap by =
	 *	(size of heap) + malloc request - mmap'd space
	 */
	delta = (heaptop - heapbase) + increment - mapsize;

	INFO("heapbase = %p, heaptop = %p, mapsize = %lx, delta=%ld\n",
	     heapbase, heaptop, mapsize, delta);

	/* align to multiple of hugepagesize. */
	delta = ALIGN(delta, hpage_size);

#ifdef MAP_HUGETLB
	mmap_hugetlb = MAP_HUGETLB;
#endif

	if (delta > 0) {
		/* growing the heap */
		INFO("Attempting to map %ld bytes\n", delta);

		/* map in (extend) more of the file at the end of our last map */
		if (__hugetlb_opts.map_hugetlb && using_default_pagesize)
			p = mmap(heapbase + mapsize, delta,
				 PROT_READ|PROT_WRITE,
				 mmap_hugetlb|MAP_ANONYMOUS|MAP_PRIVATE|mmap_reserve,
				 heap_fd, mapsize);
		else
			p = mmap(heapbase + mapsize, delta,
				 PROT_READ|PROT_WRITE,
				 MAP_PRIVATE|mmap_reserve,
				 heap_fd, mapsize);

		if (p == MAP_FAILED) {
			WARNING("New heap segment map at %p failed: %s\n",
				heapbase + mapsize, strerror(errno));
			return NULL;
		}

		/* if this is the first map */
		if (!mapsize) {
			if (heapbase && (heapbase != p)) {
				WARNING("Heap originates at %p instead of %p\n",
					p, heapbase);
				if (__hugetlbfs_debug)
					dump_proc_pid_maps();
			}
			/* then setup the heap variables */
			heapbase = heaptop = p;
		} else if (p != (heapbase + mapsize)) {
			/* Couldn't get the mapping where we wanted */
			munmap(p, delta);
			WARNING("New heap segment mapped at %p instead of %p\n",
				p, heapbase + mapsize);
			if (__hugetlbfs_debug)
				dump_proc_pid_maps();
			return NULL;
		}

		/* Fault the region to ensure accesses succeed */
		if (hugetlbfs_prefault(p, delta) != 0) {
			munmap(p, delta);
			return NULL;
		}

		/* we now have mmap'd further */
		mapsize += delta;
	} else if (delta < 0) {
		/* shrinking the heap */

		if (!__hugetlb_opts.shrink_ok) {
			/* shouldn't ever get here */
			WARNING("Heap shrinking is turned off\n");
			return NULL;
		}

		if (!mapsize) {
			WARNING("Can't shrink empty heap!\n");
			return NULL;
		}

		/*
		 * If we are forced to change the heapaddr from the
		 * original brk() value we have violated brk semantics
		 * (which we are not supposed to do).  This shouldn't
		 * pose a problem until glibc tries to trim the heap to an
		 * address lower than what we aligned heapaddr to.  At that
		 * point the alignment "gap" causes heap corruption.
		 * So we don't allow the heap to shrink below heapbase.
		 */
		if (mapsize + delta < 0) {	/* remember: delta is negative */
			WARNING("Unable to shrink heap below %p\n", heapbase);
			/* unmap just what is currently mapped */
			delta = -mapsize;
			/* we need heaptop + increment == heapbase, so: */
			increment = heapbase - heaptop;
		}
		INFO("Attempting to unmap %ld bytes @ %p\n", -delta,
		     heapbase + mapsize + delta);
		ret = munmap(heapbase + mapsize + delta, -delta);
		if (ret) {
			WARNING("Unmapping failed while shrinking heap: %s\n",
				strerror(errno));
		} else {
			mapsize += delta;
			/*
			 * glibc assumes by default that memory newly
			 * allocated by morecore() is zeroed.  It would be
			 * wasteful to zero it on every allocation, so we
			 * only shrink the top by the size of a page.
			 */
			increment = heapbase - heaptop + mapsize;
			if (!__hugetlb_opts.map_hugetlb &&
			    !using_default_pagesize) {
				/* Now shrink the hugetlbfs file. */
				ret = ftruncate(heap_fd, mapsize);
				if (ret) {
					WARNING("Could not truncate hugetlbfs file to "
						"shrink heap: %s\n",
						strerror(errno));
				}
			}
		}
	} else if (increment < 0) {
		/*
		 * Don't shrink by less than a page to avoid having to zero
		 * the memory.  There is no point in lying to glibc since
		 * we're not freeing any memory.
		 */
		increment = 0;
	}

	/* heap is continuous */
	p = heaptop;
	/* and we now have added this much more space to the heap */
	heaptop = heaptop + increment;
	INFO("... = %p\n", p);
	return p;
}
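/*
 * Worked example (illustrative numbers, not from the original source):
 * suppose hpage_size is 2MB, the huge heap already has mapsize =
 * 0x400000 (two huge pages) mapped, heaptop - heapbase = 0x3ff000 is
 * in use, and malloc asks for increment = 0x10000.  Then
 *
 *	delta = 0x3ff000 + 0x10000 - 0x400000 = 0xf000
 *	delta = ALIGN(0xf000, 0x200000)       = 0x200000
 *
 * so hugetlbfs_morecore() extends the mapping by exactly one more huge
 * page at heapbase + mapsize, keeping the heap contiguous as sbrk
 * semantics require.
 */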
static void *thp_morecore(ptrdiff_t increment)
{
	void *p;
	long delta;

	INFO("thp_morecore(%ld) = ...\n", (long)increment);

	delta = (heaptop - heapbase) + increment - mapsize;
	delta = ALIGN(delta, hpage_size);

	if (delta > 0) {
		/*
		 * The first time we expand the mapping we need to account
		 * for the initial heap mapping not necessarily being huge
		 * page aligned
		 */
		if (!mapsize)
			delta = hugetlbfs_next_addr((long)heapbase + delta) -
				(unsigned long)heapbase;

		INFO("Adding %ld bytes to heap\n", delta);

		p = sbrk(delta);
		if (p == (void *)-1) {
			WARNING("sbrk returned ENOMEM\n");
			return NULL;
		}

		if (!mapsize) {
			if (heapbase && (heapbase != p)) {
				WARNING("Heap was expected at %p instead of %p, "
					"heap has been modified by someone else!\n",
					heapbase, p);
				if (__hugetlbfs_debug)
					dump_proc_pid_maps();
			}
			heapbase = heaptop = p;
		}

		mapsize += delta;
#ifdef MADV_HUGEPAGE
		madvise(p, delta, MADV_HUGEPAGE);
#endif
	} else if (delta < 0) {
		/* shrinking the heap */
		if (!mapsize) {
			WARNING("Can't shrink an empty heap\n");
			return NULL;
		}

		INFO("Attempting to shrink heap by %ld bytes with sbrk\n",
		     -delta);
		p = sbrk(delta);
		if (p == (void *)-1) {
			WARNING("Unable to shrink heap\n");
			return heaptop;
		}

		mapsize += delta;
	}

	p = heaptop;
	heaptop += increment;
	INFO("... = %p\n", p);
	return p;
}
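/*
 * Illustrative example (values invented for clarity): on x86-64 with
 * hpage_size = 2MB and an initial program break at 0x1b39000, the
 * first expansion rounds the break up to a huge page boundary before
 * growing, so later sbrk() extensions stay huge-page alignable:
 *
 *	delta = hugetlbfs_next_addr(0x1b39000 + 0x200000) - 0x1b39000
 *	      = ALIGN(0x1d39000, 0x200000) - 0x1b39000
 *	      = 0x1e00000 - 0x1b39000 = 0x2c7000
 *
 * Note that MADV_HUGEPAGE only advises the kernel to back the range
 * with THP; unlike hugetlbfs_morecore() there is no hugetlbfs file,
 * reservation, or prefault step here.
 */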
void hugetlbfs_setup_morecore(void)
{
	char *ep;
	unsigned long heapaddr;

	if (!__hugetlb_opts.morecore)
		return;
	if (strcasecmp(__hugetlb_opts.morecore, "no") == 0) {
		INFO("HUGETLB_MORECORE=%s, not setting up morecore\n",
		     __hugetlb_opts.morecore);
		return;
	}

	/*
	 * Determine the page size that will be used for the heap.
	 * This can be set explicitly by setting HUGETLB_MORECORE to a valid
	 * page size string or by setting HUGETLB_DEFAULT_PAGE_SIZE.
	 */
	if (strncasecmp(__hugetlb_opts.morecore, "y", 1) == 0)
		hpage_size = gethugepagesize();
	else if (__hugetlb_opts.thp_morecore)
		hpage_size = kernel_default_hugepage_size();
	else
		hpage_size = parse_page_size(__hugetlb_opts.morecore);

	if (hpage_size <= 0) {
		if (errno == ENOSYS)
			WARNING("Hugepages unavailable\n");
		else if (errno == EOVERFLOW || errno == ERANGE)
			WARNING("Hugepage size too large\n");
		else if (errno == EINVAL)
			WARNING("Invalid huge page size\n");
		else
			WARNING("Hugepage size (%s)\n", strerror(errno));
		return;
	}

	/*
	 * We won't need an fd for the heap mmaps if we are using
	 * MAP_HUGETLB or we are depending on transparent huge pages.
	 */
	if (__hugetlb_opts.thp_morecore || (__hugetlb_opts.map_hugetlb &&
	    hpage_size == kernel_default_hugepage_size())) {
		heap_fd = -1;
	} else {
		if (!hugetlbfs_find_path_for_size(hpage_size)) {
			WARNING("Hugepage size %li unavailable\n", hpage_size);
			return;
		}

		heap_fd = hugetlbfs_unlinked_fd_for_size(hpage_size);
		if (heap_fd < 0) {
			WARNING("Couldn't open hugetlbfs file for morecore\n");
			return;
		}
	}

	/*
	 * THP morecore uses sbrk to allocate more heap space, counting on
	 * the kernel to back the area with THP.  So setting heapbase is
	 * meaningless if thp_morecore is used.
	 */
	if (!__hugetlb_opts.thp_morecore && __hugetlb_opts.heapbase) {
		heapaddr = strtoul(__hugetlb_opts.heapbase, &ep, 16);
		if (*ep != '\0') {
			WARNING("Can't parse HUGETLB_MORECORE_HEAPBASE: %s\n",
				__hugetlb_opts.heapbase);
			return;
		}
	} else {
		heapaddr = (unsigned long)sbrk(0);
		if (!__hugetlb_opts.thp_morecore)
			heapaddr = hugetlbfs_next_addr(heapaddr);
	}

	INFO("setup_morecore(): heapaddr = 0x%lx\n", heapaddr);

	heaptop = heapbase = (void *)heapaddr;
	if (__hugetlb_opts.thp_morecore)
		__morecore = &thp_morecore;
	else
		__morecore = &hugetlbfs_morecore;

	/* Set some allocator options more appropriate for hugepages */

	if (__hugetlb_opts.shrink_ok)
		mallopt(M_TRIM_THRESHOLD, hpage_size + hpage_size / 2);
	else
		mallopt(M_TRIM_THRESHOLD, -1);
	mallopt(M_TOP_PAD, hpage_size / 2);
	/*
	 * We always want to use our morecore, not ordinary mmap().
	 * This doesn't appear to prohibit malloc() from falling back
	 * to mmap() if we run out of hugepages.
	 */
	mallopt(M_MMAP_MAX, 0);
}
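/*
 * Usage sketch (illustrative, not part of the original source): this
 * setup is normally activated by preloading the library and setting
 * HUGETLB_MORECORE, e.g.
 *
 *	LD_PRELOAD=libhugetlbfs.so HUGETLB_MORECORE=yes ./myapp
 *
 * after which an unmodified malloc()-heavy program exercises the
 * morecore paths above.  The guarded demo below (hypothetical guard
 * name; never built as part of the library) shows the kind of program
 * that would benefit: a single large allocation, touched so that the
 * huge pages are actually faulted in.
 */
#ifdef MORECORE_USAGE_DEMO
int main(void)
{
	size_t len = 64 * 1024 * 1024;	/* large enough to span many huge pages */
	char *buf = malloc(len);	/* serviced via __morecore set above */

	if (!buf)
		return 1;
	memset(buf, 0x5a, len);		/* touch every page to fault it in */
	free(buf);
	return 0;
}
#endif /* MORECORE_USAGE_DEMO */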