/*
 * libhugetlbfs - Easy use of Linux hugepages
 * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define _GNU_SOURCE

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <link.h>
#include <malloc.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/syscall.h>
#include <sys/file.h>
#include <linux/unistd.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <errno.h>
#include <limits.h>
#include <elf.h>
#include <dlfcn.h>

#include "version.h"
#include "hugetlbfs.h"
#include "libhugetlbfs_internal.h"

#ifdef __LP64__
#define Elf_Ehdr	Elf64_Ehdr
#define Elf_Phdr	Elf64_Phdr
#define Elf_Dyn		Elf64_Dyn
#define Elf_Sym		Elf64_Sym
#define ELF_ST_BIND(x)  ELF64_ST_BIND(x)
#define ELF_ST_TYPE(x)  ELF64_ST_TYPE(x)
#else
#define Elf_Ehdr	Elf32_Ehdr
#define Elf_Phdr	Elf32_Phdr
#define Elf_Dyn		Elf32_Dyn
#define Elf_Sym		Elf32_Sym
#define ELF_ST_BIND(x)  ELF32_ST_BIND(x)
#define ELF_ST_TYPE(x)  ELF32_ST_TYPE(x)
#endif

/*
 * SHARED_TIMEOUT is used by find_or_prepare_shared_file() to decide
 * when to time out while waiting for other users to finish preparing
 * the file it wants.  The value is the number of tries before giving
 * up, with a 1 second wait between tries.
 */
#define SHARED_TIMEOUT 10

/* unmapped_abort() below prints an error message to stderr, then
 * aborts.  It is safe to call, even if the executable segments are
 * presently unmapped.
 *
 * Arguments are printf() like, but at present only %u and %p with no
 * modifiers are supported.
 *
 * FIXME: This works in practice, but I suspect it is not guaranteed
 * safe: the library functions we call could in theory call other
 * functions via the PLT, which will blow up. */
static void write_err(const char *start, int len)
{
	direct_syscall(__NR_write, 2 /*stderr*/, start, len);
}
static void sys_abort(void)
{
	pid_t pid = direct_syscall(__NR_getpid);

	direct_syscall(__NR_kill, pid, SIGABRT);
}
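/* Convert val to a string in the given base and write it with
 * write_err().  Implemented without library calls so it stays safe
 * while the program's segments are unmapped. */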
static void write_err_base(unsigned long val, int base)
{
	const char digit[] = "0123456789abcdef";
	char str1[sizeof(val)*8];
	char str2[sizeof(val)*8];
	int len = 0;
	int i;

	str1[0] = '0';
	while (val) {
		str1[len++] = digit[val % base];
		val /= base;
	}

	if (len == 0)
		len = 1;

	/* Reverse digits */
	for (i = 0; i < len; i++)
		str2[i] = str1[len-i-1];

	write_err(str2, len);
}

static void unmapped_abort(const char *fmt, ...)
{
	const char *p, *q;
	int done = 0;
	unsigned long val;
	va_list ap;

	/* World's worst printf()... */
	va_start(ap, fmt);
	p = q = fmt;
	while (! done) {
		switch (*p) {
		case '\0':
			write_err(q, p-q);
			done = 1;
			break;

		case '%':
			write_err(q, p-q);
			p++;
			switch (*p) {
			case 'u':
				val = va_arg(ap, unsigned);
				write_err_base(val, 10);
				p++;
				break;
			case 'p':
				val = (unsigned long)va_arg(ap, void *);
				write_err_base(val, 16);
				p++;
				break;
			}
			q = p;
			break;
		default:
			p++;
		}
	}

	va_end(ap);

	sys_abort();
}

/* The directory to use for sharing readonly segments */
static char share_readonly_path[PATH_MAX+1];

#define MAX_HTLB_SEGS	3
#define MAX_SEGS	10

struct seg_info {
	void *vaddr;
	unsigned long filesz, memsz, extrasz;
	int prot;
	int fd;
	int index;
	long page_size;
};

struct seg_layout {
	unsigned long start, end;
	long page_size;
};

static struct seg_info htlb_seg_table[MAX_HTLB_SEGS];
static int htlb_num_segs;
static unsigned long force_remap; /* =0 */
static long hpage_readonly_size, hpage_writable_size;

/**
 * assemble_path - handy wrapper around snprintf() for building paths
 * @dst: buffer of size PATH_MAX+1 to assemble string into
 * @fmt: format string for path
 * @...: printf() style parameters for path
 *
 * assemble_path() builds a path in the target buffer (which must have
 * PATH_MAX+1 available bytes), similar to sprintf().  However, if the
 * assembled path would exceed PATH_MAX characters in length,
 * assemble_path() prints an error and abort()s, so there is no need
 * to check the return value and back out.
 */
static void assemble_path(char *dst, const char *fmt, ...)
{
	va_list ap;
	int len;

	va_start(ap, fmt);
	len = vsnprintf(dst, PATH_MAX+1, fmt, ap);
	va_end(ap);

	if (len < 0) {
		ERROR("vsnprintf() error\n");
		abort();
	}

	if (len > PATH_MAX) {
		ERROR("Overflow assembling path\n");
		abort();
	}
}

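/*
 * Log the combined and largest memsz of the segments selected for
 * remapping, as a rough indication of whether we risk running out of
 * address space.
 */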
static void check_memsz()
{
	int i;
	unsigned long memsz_total = 0, memsz_max = 0;
	if (htlb_num_segs == 0)
		return;
	/*
	 * rough heuristic to see if we'll run out of address
	 * space
	 */
	for (i = 0; i < htlb_num_segs; i++) {
		memsz_total += htlb_seg_table[i].memsz;
		if (htlb_seg_table[i].memsz > memsz_max)
			memsz_max = htlb_seg_table[i].memsz;
	}
	/* avoid overflow checking by using two checks */
	DEBUG("Total memsz = %#0lx, memsz of largest segment = %#0lx\n",
			memsz_total, memsz_max);
}

/**
 * find_or_create_share_path - obtain a directory to store the shared
 * hugetlbfs files
 *
 * Checks environment and filesystem to locate a suitable directory
 * for shared hugetlbfs files, creating a new directory if necessary.
 * The determined path is stored in global variable share_readonly_path.
 *
 * returns:
 *  -1, on error
 *  0, on success
 */
static int find_or_create_share_path(long page_size)
{
	const char *base_path;
	struct stat sb;
	int ret;

	/* If no remapping is planned for the read-only segments we are done */
	if (!page_size)
		return 0;

	if (__hugetlb_opts.share_path) {
		/* Given an explicit path */
		if (hugetlbfs_test_path(__hugetlb_opts.share_path) != 1) {
			WARNING("HUGETLB_SHARE_PATH %s is not on a hugetlbfs"
			      " filesystem\n", __hugetlb_opts.share_path);
			return -1;
		}

		/* Make sure the page size matches */
		if (page_size !=
			hugetlbfs_test_pagesize(__hugetlb_opts.share_path)) {
			WARNING("HUGETLB_SHARE_PATH %s is not valid for a %li "
			      "kB page size\n", __hugetlb_opts.share_path,
				page_size / 1024);
			return -1;
		}
		assemble_path(share_readonly_path, "%s",
				__hugetlb_opts.share_path);
		return 0;
	}

	base_path = hugetlbfs_find_path_for_size(page_size);
	if (!base_path)
		return -1;

	assemble_path(share_readonly_path, "%s/elflink-uid-%d",
			base_path, getuid());

	ret = mkdir(share_readonly_path, 0700);
	if ((ret != 0) && (errno != EEXIST)) {
		WARNING("Error creating share directory %s\n",
			share_readonly_path);
		return -1;
	}

	/* Check the share directory is sane */
	ret = lstat(share_readonly_path, &sb);
	if (ret != 0) {
		WARNING("Couldn't stat() %s: %s\n", share_readonly_path,
			strerror(errno));
		return -1;
	}

	if (! S_ISDIR(sb.st_mode)) {
		WARNING("%s is not a directory\n", share_readonly_path);
		return -1;
	}

	if (sb.st_uid != getuid()) {
		WARNING("%s has wrong owner (uid=%d instead of %d)\n",
		      share_readonly_path, sb.st_uid, getuid());
		return -1;
	}

	if (sb.st_mode & (S_IWGRP | S_IWOTH)) {
		WARNING("%s has bad permissions 0%03o\n",
		      share_readonly_path, sb.st_mode);
		return -1;
	}

	return 0;
}

/*
 * Look for non-zero BSS data inside a range and print out any matches
 */

static void check_bss(unsigned long *start, unsigned long *end)
{
	unsigned long *addr;

	for (addr = start; addr < end; addr++) {
		if (*addr != 0)
			DEBUG("Non-zero BSS data @ %p: %lx\n", addr, *addr);
	}
}

/**
 * get_shared_file_name - create a shared file name from program name,
 * segment number and current word size
 * @htlb_seg_info: pointer to program's segment data
 * @file_path: pointer to a PATH_MAX+1 array to store filename in
 *
 * The file name created is *not* intended to be unique; it differs
 * only when the program name, word size or phdr number differ.  The
 * goal here is to have a standard means of accessing particular
 * segments of particular executables.
 *
 * returns:
 *   -1, on failure
 *   0, on success
 */
static int get_shared_file_name(struct seg_info *htlb_seg_info, char *file_path)
{
	int ret;
	char binary[PATH_MAX+1];
	char *binary2;

	memset(binary, 0, sizeof(binary));
	ret = readlink("/proc/self/exe", binary, PATH_MAX);
	if (ret < 0) {
		WARNING("shared_file: readlink() on /proc/self/exe "
		      "failed: %s\n", strerror(errno));
		return -1;
	}

	binary2 = basename(binary);
	if (!binary2) {
		WARNING("shared_file: basename() on %s failed: %s\n",
		      binary, strerror(errno));
		return -1;
	}

	assemble_path(file_path, "%s/%s_%zd_%d", share_readonly_path, binary2,
		      sizeof(unsigned long) * 8, htlb_seg_info->index);

	return 0;
}

/* Find the .dynamic program header */
static int find_dynamic(Elf_Dyn **dyntab, const ElfW(Addr) addr,
			const Elf_Phdr *phdr, int phnum)
{
	int i = 0;

	/* Test the bound before dereferencing so a missing PT_DYNAMIC
	 * cannot read past the end of the phdr table */
	while ((i < phnum) && (phdr[i].p_type != PT_DYNAMIC))
		++i;

	if (i < phnum) {
		*dyntab = (Elf_Dyn *)(addr + phdr[i].p_vaddr);
		return 0;
	}

	DEBUG("No dynamic segment found\n");
	return -1;
}

/* Find the dynamic string and symbol tables */
static int find_tables(Elf_Dyn *dyntab, Elf_Sym **symtab, char **strtab)
{
	int i = 0;
	while (dyntab[i].d_tag != DT_NULL) {
		if (dyntab[i].d_tag == DT_SYMTAB)
			*symtab = (Elf_Sym *)dyntab[i].d_un.d_ptr;
		else if (dyntab[i].d_tag == DT_STRTAB)
			*strtab = (char *)dyntab[i].d_un.d_ptr;
		i++;
	}

	if (!*symtab) {
		DEBUG("No symbol table found\n");
		return -1;
	}
	if (!*strtab) {
		DEBUG("No string table found\n");
		return -1;
	}
	return 0;
}

/* Find the number of symbol table entries */
static int find_numsyms(Elf_Sym *symtab, char *strtab)
{
	/*
	 * WARNING - The symbol table size calculation does not follow the ELF
	 *           standard, but rather exploits an assumption we enforce in
	 *           our linker scripts that the string table follows
	 *           immediately after the symbol table. The linker scripts
	 *           must maintain this assumption or this code will break.
	 */
	if ((void *)strtab <= (void *)symtab) {
		DEBUG("Could not calculate dynamic symbol table size\n");
		return -1;
	}
	return ((void *)strtab - (void *)symtab) / sizeof(Elf_Sym);
}

/*
 * To reduce the size of the extra copy window, we can eliminate certain
 * symbols based on information in the dynamic section. The following
 * characteristics apply to symbols which may require copying:
 * - Within the BSS
 * - Global or Weak binding
 * - Object type (variable)
 * - Non-zero size (zero size means the symbol is just a marker with no data)
 */
static inline int keep_symbol(char *strtab, Elf_Sym *s, void *start, void *end)
{
	if ((void *)s->st_value < start)
		return 0;
	if ((void *)s->st_value > end)
		return 0;
	if ((ELF_ST_BIND(s->st_info) != STB_GLOBAL) &&
	    (ELF_ST_BIND(s->st_info) != STB_WEAK))
		return 0;
	if (ELF_ST_TYPE(s->st_info) != STT_OBJECT)
		return 0;
	if (s->st_size == 0)
		return 0;

	if (__hugetlbfs_debug)
		DEBUG("symbol to copy at %p: %s\n", (void *)s->st_value,
						strtab + s->st_name);

	return 1;
}

/* If unspecified by the architecture, no extra copying of the plt is needed */
ElfW(Word) __attribute__ ((weak)) plt_extrasz(ElfW(Dyn) *dyntab)
{
	return 0;
}

/*
 * Subtle:  Since libhugetlbfs depends on glibc, we allow it to be
 * loaded before us.  As part of its init functions, glibc
 * initializes stdin, stdout, and stderr in the bss.  We need to
 * include these initialized variables in our copy.
 */

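/*
 * Compute seg->extrasz: how many bytes beyond the filesz portion of
 * the segment must also be copied because they have already been
 * written to (by glibc's early init or by the run-time linker) rather
 * than being untouched, zero-filled BSS.
 */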
static void get_extracopy(struct seg_info *seg, const ElfW(Addr) addr,
			  const Elf_Phdr *phdr, int phnum)
{
	Elf_Dyn *dyntab;        /* dynamic segment table */
	Elf_Sym *symtab = NULL; /* dynamic symbol table */
	Elf_Sym *sym;           /* a symbol */
	char *strtab = NULL;    /* string table for dynamic symbols */
	int ret, numsyms, found_sym = 0;
	void *start, *end, *end_orig;
	void *sym_end;
	void *plt_end;

	end_orig = seg->vaddr + seg->memsz;
	start = seg->vaddr + seg->filesz;
	if (seg->filesz == seg->memsz)
		return;
	if (!__hugetlb_opts.min_copy)
		goto bail2;

	/* Find dynamic program header */
	ret = find_dynamic(&dyntab, addr, phdr, phnum);
	if (ret < 0)
		goto bail;

	/* Find symbol and string tables */
	ret = find_tables(dyntab, &symtab, &strtab);
	if (ret < 0)
		goto bail;

	numsyms = find_numsyms(symtab, strtab);
	if (numsyms < 0)
		goto bail;

	/*
	 * We must ensure any returns done hereafter have sane start and end
	 * values, as the criss-cross apple sauce algorithm is beginning
	 */
	end = start;

	for (sym = symtab; sym < symtab + numsyms; sym++) {
		if (!keep_symbol(strtab, sym, start, end_orig))
			continue;

		/* These are the droids we are looking for */
		found_sym = 1;
		sym_end = (void *)(sym->st_value + sym->st_size);
		if (sym_end > end)
			end = sym_end;
	}

	/*
	 * Some platforms (PowerPC 64bit ELF) place their PLT beyond the filesz
	 * part of the data segment.  When this is the case, we must extend the
	 * copy window to include this data which has been initialized by the
	 * run-time linker.
	 */
	plt_end = start + plt_extrasz(dyntab);
	if (plt_end > end) {
		end = plt_end;
		found_sym = 1;
	}

	if (__hugetlbfs_debug)
		check_bss(end, end_orig);

	if (found_sym) {
		seg->extrasz = end - start;
	}
	/*
	 * else no need to copy anything, so leave seg->extrasz as zero
	 */
	return;

bail:
	DEBUG("Unable to perform minimal copy\n");
bail2:
	seg->extrasz = end_orig - start;
}

#if defined(__powerpc64__) || \
	(defined(__powerpc__) && !defined(PPC_NO_SEGMENTS))
#define SLICE_LOW_TOP		(0x100000000UL)
#define SLICE_LOW_SIZE		(1UL << SLICE_LOW_SHIFT)
#define SLICE_HIGH_SIZE		(1UL << SLICE_HIGH_SHIFT)
#endif

/*
 * Return the address of the start and end of the hugetlb slice
 * containing @addr. A slice is a range of addresses, start inclusive
 * and end exclusive.
 * Note that, since relinking is not supported on ia64, we can leave
 * it out here.
 */
static unsigned long hugetlb_slice_start(unsigned long addr)
{
	if (arch_has_slice_support()) {
#if defined(__powerpc64__)
		if (addr < SLICE_LOW_TOP)
			return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
		else if (addr < SLICE_HIGH_SIZE)
			return SLICE_LOW_TOP;
		else
			return ALIGN_DOWN(addr, SLICE_HIGH_SIZE);
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
		return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
#endif
	}
	return ALIGN_DOWN(addr, gethugepagesize());
}

static unsigned long hugetlb_slice_end(unsigned long addr)
{
	if (arch_has_slice_support()) {
#if defined(__powerpc64__)
		if (addr < SLICE_LOW_TOP)
			return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
		else
			return ALIGN_UP(addr, SLICE_HIGH_SIZE) - 1;
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
		return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
#endif
	}
	return ALIGN_UP(addr, gethugepagesize()) - 1;
}

static unsigned long hugetlb_next_slice_start(unsigned long addr)
{
	return hugetlb_slice_end(addr) + 1;
}

static unsigned long hugetlb_prev_slice_end(unsigned long addr)
{
	return hugetlb_slice_start(addr) - 1;
}

/*
 * Store a copy of the given program header
 */
static int save_phdr(int table_idx, int phnum, const ElfW(Addr) addr,
		     const ElfW(Phdr) *phdr)
{
	int prot = 0;

	if (table_idx >= MAX_HTLB_SEGS) {
		WARNING("Executable has too many segments (max %d)\n",
			MAX_HTLB_SEGS);
		htlb_num_segs = 0;
		return -1;
	}

	if (phdr->p_flags & PF_R)
		prot |= PROT_READ;
	if (phdr->p_flags & PF_W)
		prot |= PROT_WRITE;
	if (phdr->p_flags & PF_X)
		prot |= PROT_EXEC;

	htlb_seg_table[table_idx].vaddr = (void *)(addr + phdr->p_vaddr);
	htlb_seg_table[table_idx].filesz = phdr->p_filesz;
	htlb_seg_table[table_idx].memsz = phdr->p_memsz;
	htlb_seg_table[table_idx].prot = prot;
	htlb_seg_table[table_idx].index = phnum;

	INFO("Segment %d (phdr %d): %#0lx-%#0lx  (filesz=%#0lx) "
		"(prot = %#0x)\n", table_idx, phnum,
		(unsigned long) addr + phdr->p_vaddr,
		(unsigned long) addr + phdr->p_vaddr + phdr->p_memsz,
		(unsigned long) phdr->p_filesz, (unsigned int) prot);

	return 0;
}

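/*
 * Sanity-check the segment layout once each segment has been aligned
 * to its requested page size: aligned segments must not overlap, and
 * a page size transition must not fall within a single slice.
 */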
static int verify_segment_layout(struct seg_layout *segs, int num_segs)
{
	int i;
	long base_size = getpagesize();

	for (i = 1; i < num_segs; i++) {
		unsigned long prev_end = segs[i - 1].end;
		unsigned long start = segs[i].start;

		/*
		 * Do not worry about the boundary between segments that will
		 * not be remapped.
		 */
		if (segs[i - 1].page_size == base_size &&
				segs[i].page_size == base_size)
			continue;

		/* Make sure alignment hasn't caused segments to overlap */
		if (prev_end > start) {
			WARNING("Layout problem with segments %i and %i:\n\t"
				"Segments would overlap\n", i - 1, i);
			return 1;
		}

		/* Make sure page size transitions occur on slice boundaries */
		if ((segs[i - 1].page_size != segs[i].page_size) &&
				hugetlb_slice_end(prev_end) >
				hugetlb_slice_start(start)) {
			WARNING("Layout problem with segments %i and %i:\n\t"
				"Only one page size per slice\n", i - 1, i);
			return 1;
		}
	}
	return 0;
}

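/*
 * Decide which page size a segment should be remapped with: a size
 * explicitly requested by the user for writable or read-only segments
 * takes precedence, then the PF_LINUX_HUGETLB flag selects the
 * default huge page size, otherwise the base page size is returned
 * (meaning no remapping).
 */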
static long segment_requested_page_size(const ElfW(Phdr) *phdr)
{
	int writable = phdr->p_flags & PF_W;

	/* Check if a page size was requested by the user */
	if (writable && hpage_writable_size)
		return hpage_writable_size;
	if (!writable && hpage_readonly_size)
		return hpage_readonly_size;

	/* Check if this segment requests remapping by default */
	if (!hpage_readonly_size && !hpage_writable_size &&
			(phdr->p_flags & PF_LINUX_HUGETLB))
		return gethugepagesize();

	/* No remapping selected, return the base page size */
	return getpagesize();
}

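/*
 * dl_iterate_phdr() callback for the default case: record every
 * PT_LOAD segment that requests a non-base page size and verify the
 * resulting segment layout.
 */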
static
int parse_elf_normal(struct dl_phdr_info *info, size_t size, void *data)
{
	int i, num_segs;
	unsigned long page_size, seg_psize, start, end;
	struct seg_layout segments[MAX_SEGS];

	page_size = getpagesize();
	num_segs = 0;

	for (i = 0; i < info->dlpi_phnum; i++) {
		if (info->dlpi_phdr[i].p_type != PT_LOAD)
			continue;

		if (i >= MAX_SEGS) {
			WARNING("Maximum number of PT_LOAD segments"
					"exceeded\n");
			return 1;
		}

		seg_psize = segment_requested_page_size(&info->dlpi_phdr[i]);
		if (seg_psize != page_size) {
			if (save_phdr(htlb_num_segs, i, info->dlpi_addr,
				      &info->dlpi_phdr[i]))
				return 1;
			get_extracopy(&htlb_seg_table[htlb_num_segs],
				      info->dlpi_addr, info->dlpi_phdr,
				      info->dlpi_phnum);
			htlb_seg_table[htlb_num_segs].page_size = seg_psize;
			htlb_num_segs++;
		}
		start = ALIGN_DOWN(info->dlpi_addr +
				   info->dlpi_phdr[i].p_vaddr, seg_psize);
		end = ALIGN(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr +
			    info->dlpi_phdr[i].p_memsz, seg_psize);

		segments[num_segs].page_size = seg_psize;
		segments[num_segs].start = start;
		segments[num_segs].end = end;
		num_segs++;
	}
	if (verify_segment_layout(segments, num_segs))
		htlb_num_segs = 0;

	if (__hugetlbfs_debug)
		check_memsz();

	return 1;
}

/*
 * Parse the phdrs of a normal program to attempt partial segment remapping
 */
static
int parse_elf_partial(struct dl_phdr_info *info, size_t size, void *data)
{
	unsigned long vaddr, memsz, gap;
	unsigned long slice_end;
	int i;

	/* This should never actually be called more than once in an
	 * iteration: we assume that dl_iterate_phdr() always gives
	 * us the main program's phdrs on the first iteration, and we
	 * always return 1 to cease iteration at that point. */

	for (i = 0; i < info->dlpi_phnum; i++) {
		if (info->dlpi_phdr[i].p_type != PT_LOAD)
			continue;

		/*
		 * Partial segment remapping only makes sense if the
		 * memory size of the segment is larger than the
		 * granularity at which hugepages can be used. This
		 * mostly affects ppc, where the segment must be larger
		 * than 256M. This guarantees that remapping the binary
		 * in this forced way won't violate any contiguity
		 * constraints.
		 */
		vaddr = hugetlb_next_slice_start(info->dlpi_addr +
						 info->dlpi_phdr[i].p_vaddr);
		gap = vaddr - (info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
		slice_end = hugetlb_slice_end(vaddr);
		/*
		 * we should stop remapping just before the slice
		 * containing the end of the memsz portion (taking away
		 * the gap of the memsz)
		 */
		memsz = info->dlpi_phdr[i].p_memsz;
		if (memsz < gap) {
			INFO("Segment %d's unaligned memsz is too small: "
					"%#0lx < %#0lx\n",
					i, memsz, gap);
			continue;
		}
		memsz -= gap;
		if (memsz < (slice_end - vaddr)) {
			INFO("Segment %d's aligned memsz is too small: "
					"%#0lx < %#0lx\n",
					i, memsz, slice_end - vaddr);
			continue;
		}
		memsz = hugetlb_prev_slice_end(vaddr + memsz) - vaddr;

		if (save_phdr(htlb_num_segs, i, info->dlpi_addr,
			      &info->dlpi_phdr[i]))
			return 1;

		/*
		 * When remapping partial segments, we create a sub-segment
		 * that is based on the original.  For this reason, we must
		 * make some changes to the phdr captured by save_phdr():
		 * 	vaddr is aligned upwards to a slice boundary
		 * 	memsz is aligned downwards to a slice boundary
		 * 	filesz is set to memsz to force all memory to be copied
		 */
		htlb_seg_table[htlb_num_segs].vaddr = (void *)vaddr;
		htlb_seg_table[htlb_num_segs].filesz = memsz;
		htlb_seg_table[htlb_num_segs].memsz = memsz;

		htlb_num_segs++;
	}
	return 1;
}

/*
 * Verify that a range of memory is unoccupied and usable
 */
static void check_range_empty(void *addr, unsigned long len)
{
	void *p;

	p = mmap(addr, len, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0);
	if (p != addr) {
		WARNING("Unable to verify address range %p - %p.  Not empty?\n",
				addr, addr + len);
		if (__hugetlbfs_debug)
			dump_proc_pid_maps();
	}
	if (p != MAP_FAILED)
		munmap(p, len);
}

/*
 * Copy a program segment into a huge page. Try to copy the smallest
 * amount of data possible, unless the user disables this optimization
 * via the HUGETLB_MINIMAL_COPY environment variable.
 */
static int prepare_segment(struct seg_info *seg)
{
	void *start, *p, *end, *new_end;
	unsigned long size, offset;
	long page_size = getpagesize();
	long hpage_size;
	int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0;

	hpage_size = seg->page_size;

	/*
	 * mmaps must begin at an address aligned to the page size.  If the
	 * vaddr of this segment is not hpage_size aligned, align it downward
	 * and begin the mmap there.  Note the offset so we can copy data to
	 * the correct starting address within the temporary mmap.
	 */
	start = (void *) ALIGN_DOWN((unsigned long)seg->vaddr, hpage_size);
	offset = seg->vaddr - start;

	/*
	 * Calculate the size of the temporary mapping we must create.
	 * This includes the offset (described above) and the filesz and
	 * extrasz portions of the segment (described below).  We must align
	 * this total to the huge page size so it will be valid for mmap.
	 */
	size = ALIGN(offset + seg->filesz + seg->extrasz, hpage_size);

	/*
	 * If the segment's start or end addresses have been adjusted to align
	 * them to the hpage_size, check to make sure nothing is mapped in the
	 * padding before and after the segment.
	 */
	end = (void *) ALIGN((unsigned long)seg->vaddr + seg->memsz, page_size);
	new_end = (void *) ALIGN((unsigned long)end, hpage_size);
	if (ALIGN_DOWN(offset, page_size))
		check_range_empty(start, ALIGN_DOWN(offset, page_size));
	if (end != new_end)
		check_range_empty(end, new_end - end);

	/* Create the temporary huge page mmap */
	p = mmap(NULL, size, PROT_READ|PROT_WRITE,
				MAP_SHARED|mmap_reserve, seg->fd, 0);
	if (p == MAP_FAILED) {
		WARNING("Couldn't map hugepage segment to copy data: %s\n",
			strerror(errno));
		return -1;
	}

	/*
	 * Minimizing the amount of data copied will maximize performance.
	 * By definition, the filesz portion of the segment contains
	 * initialized data and must be copied.  If part of the memsz portion
	 * is known to be initialized already, extrasz will be non-zero and
	 * that many addtional bytes will be copied from the beginning of the
	 * memsz region.  The rest of the memsz is understood to be zeroes and
	 * need not be copied.
	 */
	INFO("Mapped hugeseg at %p. Copying %#0lx bytes and %#0lx extra bytes"
		" from %p...", p, seg->filesz, seg->extrasz, seg->vaddr);
	memcpy(p + offset, seg->vaddr, seg->filesz + seg->extrasz);
	INFO_CONT("done\n");

	munmap(p, size);

	return 0;
}

/*
 * [PPC] Prior to 2.6.22 (which added slices), our temporary hugepage
 * mappings are placed in the segment before the stack. This 'taints'
 * that segment as hugepage-only for the lifetime of the process,
 * resulting in a maximum stack size of 256MB. If we instead create our
 * hugepage mappings in a child process, we can avoid this problem.
 *
 * This does not adversely affect non-PPC platforms, so do it everywhere.
 */
static int fork_and_prepare_segment(struct seg_info *htlb_seg_info)
{
	int pid, ret, status;

	if ((pid = fork()) < 0) {
		WARNING("fork failed");
		return -1;
	}
	if (pid == 0) {
		ret = prepare_segment(htlb_seg_info);
		if (ret < 0) {
			WARNING("Failed to prepare segment\n");
			exit(1);
		}
		else
			exit(0);
	}
	ret = waitpid(pid, &status, 0);
	if (ret == -1) {
		WARNING("waitpid failed");
		return -1;
	}

	if (WEXITSTATUS(status) != 0)
		return -1;

	INFO("Prepare succeeded\n");
	return 0;
}

/**
 * find_or_prepare_shared_file - get one shareable file
 * @htlb_seg_info: pointer to program's segment data
 *
 * This function either locates a hugetlbfs file already containing
 * data for a given program segment, or creates one if it doesn't
 * already exist.
 *
 * We use the following algorithm to ensure that when processes race
 * to instantiate the hugepage file, we will never obtain an
 * incompletely prepared file or have multiple processes prepare
 * separate copies of the file.
 *	- first open 'filename.tmp' with O_EXCL (this acts as a lockfile)
 *	- second open 'filename' with O_RDONLY (even if the first open
 *	  succeeded).
 * Then:
 * 	- If both opens succeed, close the O_EXCL open, unlink
 * filename.tmp and use the O_RDONLY fd.  (Somebody else has prepared
 * the file already.)
 * 	- If only the O_RDONLY open succeeds, and the O_EXCL open
 * fails with EEXIST, just use the O_RDONLY fd.  (Somebody else has
 * prepared the file already, but we raced with their rename().)
 * 	- If only the O_EXCL open succeeds, and the O_RDONLY open fails
 * with ENOENT, prepare the file through the O_EXCL fd, then rename()
 * filename.tmp to filename.  (We're the first in, we have to prepare
 * the file.)
 * 	- If both opens fail, with EEXIST and ENOENT respectively,
 * wait for a little while, then try again from the beginning.
 * (Somebody else is preparing the file, but hasn't finished yet.)
 *
 * returns:
 *   -1, on failure
 *   0, on success
 */
static int find_or_prepare_shared_file(struct seg_info *htlb_seg_info)
{
	int fdx = -1, fds;
	int errnox, errnos;
	int ret;
	int i;
	char final_path[PATH_MAX+1];
	char tmp_path[PATH_MAX+1];

	ret = get_shared_file_name(htlb_seg_info, final_path);
	if (ret < 0)
		return -1;
	assemble_path(tmp_path, "%s.tmp", final_path);

	for (i = 0; i < SHARED_TIMEOUT; i++) {
		/* NB: mode is modified by umask */
		fdx = open(tmp_path, O_CREAT | O_EXCL | O_RDWR, 0666);
		errnox = errno;
		fds = open(final_path, O_RDONLY);
		errnos = errno;

		if (fds >= 0) {
			/* Got an already-prepared file -> use it */
			if (fdx >= 0) {
				/* Also got an exclusive file -> clean up */
				ret = unlink(tmp_path);
				if (ret != 0)
					WARNING("shared_file: unable to clean "
					      "up unneeded file %s: %s\n",
					      tmp_path, strerror(errno));
				close(fdx);
			} else if (errnox != EEXIST) {
				WARNING("shared_file: Unexpected failure on exclusive"
					" open of %s: %s\n", tmp_path,
					strerror(errnox));
			}
			htlb_seg_info->fd = fds;
			return 0;
		}

		if (fdx >= 0) {
			/* It's our job to prepare */
			if (errnos != ENOENT)
				WARNING("shared_file: Unexpected failure on"
					" shared open of %s: %s\n", final_path,
					strerror(errnos));

			htlb_seg_info->fd = fdx;

			INFO("Got unpopulated shared fd -- Preparing\n");
			ret = fork_and_prepare_segment(htlb_seg_info);
			if (ret < 0)
				goto fail;

			INFO("Prepare succeeded\n");
			/* move to permanent location */
			ret = rename(tmp_path, final_path);
			if (ret != 0) {
				WARNING("shared_file: unable to rename %s"
				      " to %s: %s\n", tmp_path, final_path,
				      strerror(errno));
				goto fail;
			}

			return 0;
		}

		/* Both opens failed, somebody else is still preparing */
		/* Wait and try again */
		sleep(1);
	}

 fail:
	if (fdx >= 0) {
		ret = unlink(tmp_path);
		if (ret != 0)
			WARNING("shared_file: Unable to clean up temp file %s "
			      "on failure: %s\n", tmp_path, strerror(errno));
		close(fdx);
	}

	return -1;
}

/**
 * obtain_prepared_file - multiplex callers depending on if
 * sharing or not
 * @htlb_seg_info: pointer to program's segment data
 *
 * returns:
 *  -1, on error
 *  0, on success
 */
static int obtain_prepared_file(struct seg_info *htlb_seg_info)
{
	int fd = -1;
	int ret;
	long hpage_size = htlb_seg_info->page_size;

	/* Share only read-only segments */
	if (__hugetlb_opts.sharing && !(htlb_seg_info->prot & PROT_WRITE)) {
		/* first, try to share */
		ret = find_or_prepare_shared_file(htlb_seg_info);
		if (ret == 0)
			return 0;
		/* but, fall through to unlinked files, if sharing fails */
		WARNING("Falling back to unlinked files\n");
	}
	fd = hugetlbfs_unlinked_fd_for_size(hpage_size);
	if (fd < 0)
		return -1;
	htlb_seg_info->fd = fd;

	return fork_and_prepare_segment(htlb_seg_info);
}

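/*
 * Replace each segment's original mapping with a MAP_FIXED mapping of
 * its prepared hugetlbfs file.  Between the munmap() and the final
 * mmap() calls no static data or library function may be touched, so
 * everything needed must already be resolved.
 */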
static void remap_segments(struct seg_info *seg, int num)
{
	int i;
	void *p;
	unsigned long start, offset, mapsize;
	long page_size = getpagesize();
	long hpage_size;
	int mmap_flags;

	/*
	 * XXX: The bogus call to mmap below forces ld.so to resolve the
	 * mmap symbol before we unmap the plt in the data segment
	 * below.  This might only be needed in the case where sharing
	 * is enabled and the hugetlbfs files have already been prepared
	 * by another process.
	 */
	 p = mmap(0, 0, 0, 0, 0, 0);

	/* This is the hairy bit, between unmap and remap we enter a
	 * black hole.  We can't call anything which uses static data
	 * (ie. essentially any library function...)
	 */
	for (i = 0; i < num; i++) {
		start = ALIGN_DOWN((unsigned long)seg[i].vaddr, page_size);
		offset = (unsigned long)(seg[i].vaddr - start);
		mapsize = ALIGN(offset + seg[i].memsz, page_size);
		munmap((void *) start, mapsize);
	}

	/* Step 4.  Rebuild the address space with hugetlb mappings */
	/* NB: we can't do the remap as hugepages within the main loop
	 * because of PowerPC: we may need to unmap all the normal
	 * segments before the MMU segment is ok for hugepages */
	for (i = 0; i < num; i++) {
		hpage_size = seg[i].page_size;
		start = ALIGN_DOWN((unsigned long)seg[i].vaddr, hpage_size);
		offset = (unsigned long)(seg[i].vaddr - start);
		mapsize = ALIGN(offset + seg[i].memsz, hpage_size);
		mmap_flags = MAP_PRIVATE|MAP_FIXED;

		/* If requested, make no reservations */
		if (__hugetlb_opts.no_reserve)
			mmap_flags |= MAP_NORESERVE;

		/*
		 * If this is a read-only mapping whose contents are
		 * entirely contained within the file, then use MAP_NORESERVE.
		 * The assumption is that the pages already exist in the
		 * page cache for the hugetlbfs file since it was prepared
		 * earlier and that mprotect() will not be called which would
		 * require a COW
		 */
		if (!(seg[i].prot & PROT_WRITE) &&
				seg[i].filesz == seg[i].memsz)
			mmap_flags |= MAP_NORESERVE;

		p = mmap((void *) start, mapsize, seg[i].prot,
			 mmap_flags, seg[i].fd, 0);
		if (p == MAP_FAILED)
			unmapped_abort("Failed to map hugepage segment %u: "
					"%p-%p (errno=%u)\n", i, start,
					start + mapsize, errno);
		if (p != (void *) start)
			unmapped_abort("Mapped hugepage segment %u (%p-%p) at "
				       "wrong address %p\n", i, seg[i].vaddr,
				       seg[i].vaddr+mapsize, p);
	}
	/* The segments are all back at this point.
	 * and it should be safe to reference static data
	 */
}

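/*
 * Parse the HUGETLB_ELFMAP value for read-only ('R') and writable
 * ('W') page size requests, validating that each requested size is
 * actually available.
 */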
static int set_hpage_sizes(const char *env)
{
	char *pos;
	long size;
	char *key;
	char keys[5] = { "R\0" "W\0" "\0" };

	/* For each key in R,W */
	for (key = keys; *key != '\0'; key += 2) {
		pos = strcasestr(env, key);
		if (!pos)
			continue;

		if (*(++pos) == '=') {
			size = parse_page_size(pos + 1);
			if (size == -1)
				return size;
		} else
			size = gethugepagesize();

		if (size <= 0) {
			if (errno == ENOSYS)
				WARNING("Hugepages unavailable\n");
			else if (errno == EOVERFLOW)
				WARNING("Hugepage size too large\n");
			else
				WARNING("Hugepage size (%s)\n",
						strerror(errno));
			size = 0;
		} else if (!hugetlbfs_find_path_for_size(size)) {
			WARNING("Hugepage size %li unavailable", size);
			size = 0;
		}

		if (*key == 'R')
			hpage_readonly_size = size;
		else
			hpage_writable_size = size;
	}
	return 0;
}

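/*
 * Examine the environment-derived options to decide whether segment
 * remapping should be attempted at all, and configure page sizes,
 * partial remapping and sharing accordingly.
 */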
static int check_env(void)
{
	extern Elf_Ehdr __executable_start __attribute__((weak));

	if (__hugetlb_opts.elfmap &&
		(strcasecmp(__hugetlb_opts.elfmap, "no") == 0)) {
		INFO("HUGETLB_ELFMAP=%s, not attempting to remap program "
		      "segments\n", __hugetlb_opts.elfmap);
		return -1;
	}
	if (__hugetlb_opts.elfmap && set_hpage_sizes(__hugetlb_opts.elfmap)) {
		WARNING("Cannot set elfmap page sizes: %s", strerror(errno));
		return -1;
	}

	if (__hugetlb_opts.ld_preload &&
		strstr(__hugetlb_opts.ld_preload, "libhugetlbfs")) {
		if (__hugetlb_opts.force_elfmap) {
			force_remap = 1;
			INFO("HUGETLB_FORCE_ELFMAP=yes, "
					"enabling partial segment "
					"remapping for non-relinked "
					"binaries\n");
			INFO("Disabling filesz copy optimization\n");
			__hugetlb_opts.min_copy = false;
		} else {
			if (&__executable_start) {
				WARNING("LD_PRELOAD is incompatible with "
					"segment remapping\n");
				WARNING("Segment remapping has been "
					"DISABLED\n");
				return -1;
			}
		}
	}

	if (__hugetlb_opts.sharing == 2) {
		WARNING("HUGETLB_SHARE=%d, however sharing of writable\n"
			"segments has been deprecated and is now disabled\n",
			__hugetlb_opts.sharing);
		__hugetlb_opts.sharing = 0;
	} else {
		INFO("HUGETLB_SHARE=%d, sharing ", __hugetlb_opts.sharing);
		if (__hugetlb_opts.sharing == 1) {
			INFO_CONT("enabled for only read-only segments\n");
		} else {
			INFO_CONT("disabled\n");
			__hugetlb_opts.sharing = 0;
		}
	}

	INFO("HUGETLB_NO_RESERVE=%s, reservations %s\n",
			__hugetlb_opts.no_reserve ? "yes" : "no",
			__hugetlb_opts.no_reserve ? "disabled" : "enabled");

	return 0;
}

/*
 * Parse an ELF header and record segment information for any segments
 * which contain hugetlb information.
 */
static int parse_elf()
{
	if (force_remap)
		dl_iterate_phdr(parse_elf_partial, NULL);
	else
		dl_iterate_phdr(parse_elf_normal, NULL);

	if (htlb_num_segs == 0) {
		INFO("No segments were appropriate for remapping\n");
		return -1;
	}

	return 0;
}

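/*
 * Main entry point for ELF segment remapping: check the environment,
 * collect remappable segments, prepare a hugetlbfs file for each and
 * then remap the segments onto those files.
 */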
void hugetlbfs_setup_elflink(void)
{
	int i, ret;

	if (check_env())
		return;

	if (parse_elf())
		return;

	INFO("libhugetlbfs version: %s\n", VERSION);

	/* Do we need to find a share directory */
	if (__hugetlb_opts.sharing) {
		/*
		 * If HUGETLB_ELFMAP is undefined but a shareable segment has
		 * PF_LINUX_HUGETLB set, segment remapping will occur using the
		 * default huge page size.
		 */
		long page_size = hpage_readonly_size ?
			hpage_readonly_size : gethugepagesize();

		ret = find_or_create_share_path(page_size);
		if (ret != 0) {
			WARNING("Segment remapping is disabled");
			return;
		}
	}

	/* Step 1.  Obtain hugepage files with our program data */
	for (i = 0; i < htlb_num_segs; i++) {
		ret = obtain_prepared_file(&htlb_seg_table[i]);
		if (ret < 0) {
			WARNING("Failed to setup hugetlbfs file for segment "
					"%d\n", i);

			/* Close files we have already prepared */
			for (i--; i >= 0; i--)
				close(htlb_seg_table[i].fd);

			return;
		}
	}

	/* Step 3.  Unmap the old segments, map in the new ones */
	remap_segments(htlb_seg_table, htlb_num_segs);
}