Blame elflink.c

Packit 2d622a
/*
Packit 2d622a
 * libhugetlbfs - Easy use of Linux hugepages
Packit 2d622a
 * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
Packit 2d622a
 *
Packit 2d622a
 * This library is free software; you can redistribute it and/or
Packit 2d622a
 * modify it under the terms of the GNU Lesser General Public License
Packit 2d622a
 * as published by the Free Software Foundation; either version 2.1 of
Packit 2d622a
 * the License, or (at your option) any later version.
Packit 2d622a
 *
Packit 2d622a
 * This library is distributed in the hope that it will be useful, but
Packit 2d622a
 * WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 2d622a
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 2d622a
 * Lesser General Public License for more details.
Packit 2d622a
 *
Packit 2d622a
 * You should have received a copy of the GNU Lesser General Public
Packit 2d622a
 * License along with this library; if not, write to the Free Software
Packit 2d622a
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Packit 2d622a
 */
Packit 2d622a
Packit 2d622a
#define _GNU_SOURCE
Packit 2d622a
Packit 2d622a
#include <stdarg.h>
Packit 2d622a
#include <stdio.h>
Packit 2d622a
#include <stdlib.h>
Packit 2d622a
#include <link.h>
Packit 2d622a
#include <malloc.h>
Packit 2d622a
#include <string.h>
Packit 2d622a
#include <unistd.h>
Packit 2d622a
#include <fcntl.h>
Packit 2d622a
#include <signal.h>
Packit 2d622a
#include <sys/syscall.h>
Packit 2d622a
#include <sys/file.h>
Packit 2d622a
#include <linux/unistd.h>
Packit 2d622a
#include <sys/mman.h>
Packit 2d622a
#include <sys/wait.h>
Packit 2d622a
#include <sys/stat.h>
Packit 2d622a
#include <errno.h>
Packit 2d622a
#include <limits.h>
Packit 2d622a
#include <elf.h>
Packit 2d622a
#include <dlfcn.h>
Packit 2d622a
Packit 2d622a
#include "version.h"
Packit 2d622a
#include "hugetlbfs.h"
Packit 2d622a
#include "libhugetlbfs_internal.h"
Packit 2d622a
Packit 2d622a
/*
 * Word-size neutral aliases for the ELF structures and symbol-info
 * accessors used in this file: the 64-bit variants when building for an
 * LP64 target, the 32-bit variants otherwise.
 */
#ifdef __LP64__
#define Elf_Ehdr	Elf64_Ehdr
#define Elf_Phdr	Elf64_Phdr
#define Elf_Dyn		Elf64_Dyn
#define Elf_Sym		Elf64_Sym
#define ELF_ST_BIND(x)  ELF64_ST_BIND(x)
#define ELF_ST_TYPE(x)  ELF64_ST_TYPE(x)
#else
#define Elf_Ehdr	Elf32_Ehdr
#define Elf_Phdr	Elf32_Phdr
#define Elf_Dyn		Elf32_Dyn
#define Elf_Sym		Elf32_Sym
/* Use the 32-bit accessors to match the 32-bit structures above */
#define ELF_ST_BIND(x)  ELF32_ST_BIND(x)
#define ELF_ST_TYPE(x)  ELF32_ST_TYPE(x)
#endif
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * SHARED_TIMEOUT is used by find_or_prepare_shared_file for when it
Packit 2d622a
 * should timeout while waiting for other users to finish preparing
Packit 2d622a
 * the file it wants.  The value is the number of tries before giving
Packit 2d622a
 * up with a 1 second wait between tries
Packit 2d622a
 */
Packit 2d622a
#define SHARED_TIMEOUT 10
Packit 2d622a
Packit 2d622a
/* This function prints an error message to stderr, then aborts.  It
Packit 2d622a
 * is safe to call, even if the executable segments are presently
Packit 2d622a
 * unmapped.
Packit 2d622a
 *
Packit 2d622a
 * Arguments are printf() like, but at present supports only %u and %p
Packit 2d622a
 * with no modifiers
Packit 2d622a
 *
Packit 2d622a
 * FIXME: This works in practice, but I suspect it
Packit 2d622a
 * is not guaranteed safe: the library functions we call could in
Packit 2d622a
 * theory call other functions via the PLT which will blow up. */
Packit 2d622a
/* Write @len bytes starting at @start to stderr via direct_syscall(). */
static void write_err(const char *start, int len)
{
	const int stderr_fd = 2;

	direct_syscall(__NR_write, stderr_fd, start, len);
}
Packit 2d622a
/* Send SIGABRT to the calling process, using direct_syscall() only. */
static void sys_abort(void)
{
	pid_t self;

	self = direct_syscall(__NR_getpid);
	direct_syscall(__NR_kill, self, SIGABRT);
}
Packit 2d622a
/*
 * Print @val to stderr as an unsigned number in the given @base
 * (digits are taken from "0123456789abcdef", so @base must be 2..16).
 * Zero is printed as a single '0'.
 */
static void write_err_base(unsigned long val, int base)
{
	const char digit[] = "0123456789abcdef";
	char buf[sizeof(val)*8];
	char *const limit = buf + sizeof(buf);
	char *pos = limit;

	/*
	 * Produce the digits least-significant first while walking the
	 * buffer backwards, so they end up in printing order without a
	 * separate reversal pass.
	 */
	do {
		*--pos = digit[val % base];
		val /= base;
	} while (val);

	write_err(pos, limit - pos);
}
Packit 2d622a
Packit 2d622a
/*
 * Minimal printf()-alike that writes to stderr through write_err() and
 * then calls sys_abort().  Only the conversions %u (unsigned, decimal)
 * and %p (pointer, hexadecimal without prefix) are expanded; a '%'
 * followed by anything else is dropped and scanning resumes at the
 * following character.
 */
static void unmapped_abort(const char *fmt, ...)
{
	const char *cur = fmt;		/* scan position */
	const char *pending = fmt;	/* literal text not yet written */
	va_list args;

	va_start(args, fmt);
	for (;;) {
		if (*cur == '\0') {
			/* flush the trailing literal text and stop */
			write_err(pending, cur - pending);
			break;
		}

		if (*cur != '%') {
			cur++;
			continue;
		}

		/* flush the literal text preceding the conversion */
		write_err(pending, cur - pending);
		cur++;

		if (*cur == 'u') {
			unsigned long number = va_arg(args, unsigned);

			write_err_base(number, 10);
			cur++;
		} else if (*cur == 'p') {
			unsigned long number =
				(unsigned long)va_arg(args, void *);

			write_err_base(number, 16);
			cur++;
		}
		pending = cur;
	}
	va_end(args);

	sys_abort();
}
Packit 2d622a
Packit 2d622a
/* The directory to use for sharing readonly segments */
Packit 2d622a
static char share_readonly_path[PATH_MAX+1];
Packit 2d622a
Packit 2d622a
#define MAX_HTLB_SEGS	3
Packit 2d622a
#define MAX_SEGS	10
Packit 2d622a
Packit 2d622a
struct seg_info {
Packit 2d622a
	void *vaddr;
Packit 2d622a
	unsigned long filesz, memsz, extrasz;
Packit 2d622a
	int prot;
Packit 2d622a
	int fd;
Packit 2d622a
	int index;
Packit 2d622a
	long page_size;
Packit 2d622a
};
Packit 2d622a
Packit 2d622a
struct seg_layout {
Packit 2d622a
	unsigned long start, end;
Packit 2d622a
	long page_size;
Packit 2d622a
};
Packit 2d622a
Packit 2d622a
static struct seg_info htlb_seg_table[MAX_HTLB_SEGS];
Packit 2d622a
static int htlb_num_segs;
Packit 2d622a
static unsigned long force_remap; /* =0 */
Packit 2d622a
static long hpage_readonly_size, hpage_writable_size;
Packit 2d622a
Packit 2d622a
/**
Packit 2d622a
 * assemble_path - handy wrapper around snprintf() for building paths
Packit 2d622a
 * @dst: buffer of size PATH_MAX+1 to assemble string into
Packit 2d622a
 * @fmt: format string for path
Packit 2d622a
 * @...: printf() style parameters for path
Packit 2d622a
 *
Packit 2d622a
 * assemble_path() builds a path in the target buffer (which must have
Packit 2d622a
 * PATH_MAX+1 available bytes), similar to sprintf().  However, if the
Packit 2d622a
 * assembled path would exceed PATH_MAX characters in length,
Packit 2d622a
 * assemble_path() prints an error and abort()s, so there is no need
Packit 2d622a
 * to check the return value and backout.
Packit 2d622a
 */
Packit 2d622a
static void assemble_path(char *dst, const char *fmt, ...)
Packit 2d622a
{
Packit 2d622a
	va_list ap;
Packit 2d622a
	int len;
Packit 2d622a
Packit 2d622a
	va_start(ap, fmt);
Packit 2d622a
	len = vsnprintf(dst, PATH_MAX+1, fmt, ap);
Packit 2d622a
	va_end(ap);
Packit 2d622a
Packit 2d622a
	if (len < 0) {
Packit 2d622a
		ERROR("vsnprintf() error\n");
Packit 2d622a
		abort();
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (len > PATH_MAX) {
Packit 2d622a
		ERROR("Overflow assembling path\n");
Packit 2d622a
		abort();
Packit 2d622a
	}
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
static void check_memsz()
Packit 2d622a
{
Packit 2d622a
	int i;
Packit 2d622a
	unsigned long memsz_total = 0, memsz_max = 0;
Packit 2d622a
	if (htlb_num_segs == 0)
Packit 2d622a
		return;
Packit 2d622a
	/*
Packit 2d622a
	 * rough heuristic to see if we'll run out of address
Packit 2d622a
	 * space
Packit 2d622a
	 */
Packit 2d622a
	for (i = 0; i < htlb_num_segs; i++) {
Packit 2d622a
		memsz_total += htlb_seg_table[i].memsz;
Packit 2d622a
		if (htlb_seg_table[i].memsz > memsz_max)
Packit 2d622a
			memsz_max = htlb_seg_table[i].memsz;
Packit 2d622a
	}
Packit 2d622a
	/* avoid overflow checking by using two checks */
Packit 2d622a
	DEBUG("Total memsz = %#0lx, memsz of largest segment = %#0lx\n",
Packit 2d622a
			memsz_total, memsz_max);
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/**
Packit 2d622a
 * find_or_create_share_path - obtain a directory to store the shared
Packit 2d622a
 * hugetlbfs files
Packit 2d622a
 *
Packit 2d622a
 * Checks environment and filesystem to locate a suitable directory
Packit 2d622a
 * for shared hugetlbfs files, creating a new directory if necessary.
Packit 2d622a
 * The determined path is stored in global variable share_readonly_path.
Packit 2d622a
 *
Packit 2d622a
 * returns:
Packit 2d622a
 *  -1, on error
Packit 2d622a
 *  0, on success
Packit 2d622a
 */
Packit 2d622a
static int find_or_create_share_path(long page_size)
Packit 2d622a
{
Packit 2d622a
	const char *base_path;
Packit 2d622a
	struct stat sb;
Packit 2d622a
	int ret;
Packit 2d622a
Packit 2d622a
	/* If no remaping is planned for the read-only segments we are done */
Packit 2d622a
	if (!page_size)
Packit 2d622a
		return 0;
Packit 2d622a
Packit 2d622a
	if (__hugetlb_opts.share_path) {
Packit 2d622a
		/* Given an explicit path */
Packit 2d622a
		if (hugetlbfs_test_path(__hugetlb_opts.share_path) != 1) {
Packit 2d622a
			WARNING("HUGETLB_SHARE_PATH %s is not on a hugetlbfs"
Packit 2d622a
			      " filesystem\n", __hugetlb_opts.share_path);
Packit 2d622a
			return -1;
Packit 2d622a
		}
Packit 2d622a
Packit 2d622a
		/* Make sure the page size matches */
Packit 2d622a
		if (page_size !=
Packit 2d622a
			hugetlbfs_test_pagesize(__hugetlb_opts.share_path)) {
Packit 2d622a
			WARNING("HUGETLB_SHARE_PATH %s is not valid for a %li "
Packit 2d622a
			      "kB page size\n", __hugetlb_opts.share_path,
Packit 2d622a
				page_size / 1024);
Packit 2d622a
			return -1;
Packit 2d622a
		}
Packit 2d622a
		assemble_path(share_readonly_path, "%s",
Packit 2d622a
				__hugetlb_opts.share_path);
Packit 2d622a
		return 0;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	base_path = hugetlbfs_find_path_for_size(page_size);
Packit 2d622a
	if (!base_path)
Packit 2d622a
		return -1;
Packit 2d622a
Packit 2d622a
	assemble_path(share_readonly_path, "%s/elflink-uid-%d",
Packit 2d622a
			base_path, getuid());
Packit 2d622a
Packit 2d622a
	ret = mkdir(share_readonly_path, 0700);
Packit 2d622a
	if ((ret != 0) && (errno != EEXIST)) {
Packit 2d622a
		WARNING("Error creating share directory %s\n",
Packit 2d622a
			share_readonly_path);
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	/* Check the share directory is sane */
Packit 2d622a
	ret = lstat(share_readonly_path, &sb);
Packit 2d622a
	if (ret != 0) {
Packit 2d622a
		WARNING("Couldn't stat() %s: %s\n", share_readonly_path,
Packit 2d622a
			strerror(errno));
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (! S_ISDIR(sb.st_mode)) {
Packit 2d622a
		WARNING("%s is not a directory\n", share_readonly_path);
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (sb.st_uid != getuid()) {
Packit 2d622a
		WARNING("%s has wrong owner (uid=%d instead of %d)\n",
Packit 2d622a
		      share_readonly_path, sb.st_uid, getuid());
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (sb.st_mode & (S_IWGRP | S_IWOTH)) {
Packit 2d622a
		WARNING("%s has bad permissions 0%03o\n",
Packit 2d622a
		      share_readonly_path, sb.st_mode);
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * Look for non-zero BSS data inside a range and print out any matches
Packit 2d622a
 */
Packit 2d622a
Packit 2d622a
/* Log every non-zero word found in [start, end). */
static void check_bss(unsigned long *start, unsigned long *end)
{
	unsigned long *word = start;

	while (word < end) {
		if (*word != 0)
			DEBUG("Non-zero BSS data @ %p: %lx\n", word, *word);
		word++;
	}
}
Packit 2d622a
Packit 2d622a
/**
Packit 2d622a
 * get_shared_file_name - create a shared file name from program name,
Packit 2d622a
 * segment number and current word size
Packit 2d622a
 * @htlb_seg_info: pointer to program's segment data
Packit 2d622a
 * @file_path: pointer to a PATH_MAX+1 array to store filename in
Packit 2d622a
 *
Packit 2d622a
 * The file name created is *not* intended to be unique, except when
Packit 2d622a
 * the name, word size or phdr number differ. The goal here is to have a
Packit 2d622a
 * standard means of accessing particular segments of particular
Packit 2d622a
 * executables.
Packit 2d622a
 *
Packit 2d622a
 * returns:
Packit 2d622a
 *   -1, on failure
Packit 2d622a
 *   0, on success
Packit 2d622a
 */
Packit 2d622a
static int get_shared_file_name(struct seg_info *htlb_seg_info, char *file_path)
Packit 2d622a
{
Packit 2d622a
	int ret;
Packit 2d622a
	char binary[PATH_MAX+1];
Packit 2d622a
	char *binary2;
Packit 2d622a
Packit 2d622a
	memset(binary, 0, sizeof(binary));
Packit 2d622a
	ret = readlink("/proc/self/exe", binary, PATH_MAX);
Packit 2d622a
	if (ret < 0) {
Packit 2d622a
		WARNING("shared_file: readlink() on /proc/self/exe "
Packit 2d622a
		      "failed: %s\n", strerror(errno));
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	binary2 = basename(binary);
Packit 2d622a
	if (!binary2) {
Packit 2d622a
		WARNING("shared_file: basename() on %s failed: %s\n",
Packit 2d622a
		      binary, strerror(errno));
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	assemble_path(file_path, "%s/%s_%zd_%d", share_readonly_path, binary2,
Packit 2d622a
		      sizeof(unsigned long) * 8, htlb_seg_info->index);
Packit 2d622a
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/* Find the .dynamic program header */
Packit ed3d6b
static int find_dynamic(Elf_Dyn **dyntab, const ElfW(Addr) addr,
Packit ed3d6b
			const Elf_Phdr *phdr, int phnum)
Packit 2d622a
{
Packit 2d622a
	int i = 1;
Packit 2d622a
Packit 2d622a
	while ((phdr[i].p_type != PT_DYNAMIC) && (i < phnum)) {
Packit 2d622a
		++i;
Packit 2d622a
	}
Packit 2d622a
	if (phdr[i].p_type == PT_DYNAMIC) {
Packit ed3d6b
		*dyntab = (Elf_Dyn *)(addr + phdr[i].p_vaddr);
Packit 2d622a
		return 0;
Packit 2d622a
	} else {
Packit 2d622a
		DEBUG("No dynamic segment found\n");
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/* Find the dynamic string and symbol tables */
Packit 2d622a
static int find_tables(Elf_Dyn *dyntab, Elf_Sym **symtab, char **strtab)
Packit 2d622a
{
Packit 2d622a
	int i = 1;
Packit 2d622a
	while ((dyntab[i].d_tag != DT_NULL)) {
Packit 2d622a
		if (dyntab[i].d_tag == DT_SYMTAB)
Packit 2d622a
			*symtab = (Elf_Sym *)dyntab[i].d_un.d_ptr;
Packit 2d622a
		else if (dyntab[i].d_tag == DT_STRTAB)
Packit 2d622a
			*strtab = (char *)dyntab[i].d_un.d_ptr;
Packit 2d622a
		i++;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (!*symtab) {
Packit 2d622a
		DEBUG("No symbol table found\n");
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
	if (!*strtab) {
Packit 2d622a
		DEBUG("No string table found\n");
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/* Find the number of symbol table entries */
Packit 2d622a
static int find_numsyms(Elf_Sym *symtab, char *strtab)
{
	char *symtab_start = (char *)symtab;

	/*
	 * WARNING - This is not how the ELF standard sizes the symbol
	 *           table.  It relies on our linker scripts placing the
	 *           string table immediately after the symbol table, so
	 *           the distance between the two is the table size.  The
	 *           linker scripts must keep doing so or this breaks.
	 */
	if (strtab <= symtab_start) {
		DEBUG("Could not calculate dynamic symbol table size\n");
		return -1;
	}

	return (strtab - symtab_start) / sizeof(Elf_Sym);
}
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * To reduce the size of the extra copy window, we can eliminate certain
Packit 2d622a
 * symbols based on information in the dynamic section. The following
Packit 2d622a
 * characteristics apply to symbols which may require copying:
Packit 2d622a
 * - Within the BSS
Packit 2d622a
 * - Global or Weak binding
Packit 2d622a
 * - Object type (variable)
Packit 2d622a
 * - Non-zero size (zero size means the symbol is just a marker with no data)
Packit 2d622a
 */
Packit 2d622a
static inline int keep_symbol(char *strtab, Elf_Sym *s, void *start, void *end)
{
	void *where = (void *)s->st_value;
	int binding = ELF_ST_BIND(s->st_info);

	/* must lie inside [start, end] */
	if (where < start || where > end)
		return 0;

	/* only global or weak symbols */
	if (binding != STB_GLOBAL && binding != STB_WEAK)
		return 0;

	/* only data objects that actually occupy space */
	if (ELF_ST_TYPE(s->st_info) != STT_OBJECT || s->st_size == 0)
		return 0;

	if (__hugetlbfs_debug)
		DEBUG("symbol to copy at %p: %s\n", (void *)s->st_value,
						strtab + s->st_name);

	return 1;
}
Packit 2d622a
Packit 2d622a
/*
 * Weak default used when the architecture does not supply its own
 * plt_extrasz(): no extra copying of the plt is needed, so report 0.
 */
__attribute__ ((weak)) ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab)
{
	(void)dyntab;

	return 0;
}
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * Subtle:  Since libhugetlbfs depends on glibc, we allow it
Packit 2d622a
 * to be loaded before us.  As part of its init functions, it
Packit 2d622a
 * initializes stdin, stdout, and stderr in the bss.  We need to
Packit 2d622a
 * include these initialized variables in our copy.
Packit 2d622a
 */
Packit 2d622a
Packit ed3d6b
static void get_extracopy(struct seg_info *seg, const ElfW(Addr) addr,
Packit ed3d6b
			  const Elf_Phdr *phdr, int phnum)
Packit 2d622a
{
Packit 2d622a
	Elf_Dyn *dyntab;        /* dynamic segment table */
Packit 2d622a
	Elf_Sym *symtab = NULL; /* dynamic symbol table */
Packit 2d622a
	Elf_Sym *sym;           /* a symbol */
Packit 2d622a
	char *strtab = NULL;    /* string table for dynamic symbols */
Packit 2d622a
	int ret, numsyms, found_sym = 0;
Packit 2d622a
	void *start, *end, *end_orig;
Packit 2d622a
	void *sym_end;
Packit 2d622a
	void *plt_end;
Packit 2d622a
Packit 2d622a
	end_orig = seg->vaddr + seg->memsz;
Packit 2d622a
	start = seg->vaddr + seg->filesz;
Packit 2d622a
	if (seg->filesz == seg->memsz)
Packit 2d622a
		return;
Packit 2d622a
	if (!__hugetlb_opts.min_copy)
Packit 2d622a
		goto bail2;
Packit 2d622a
Packit 2d622a
	/* Find dynamic program header */
Packit ed3d6b
	ret = find_dynamic(&dyntab, addr, phdr, phnum);
Packit 2d622a
	if (ret < 0)
Packit 2d622a
		goto bail;
Packit 2d622a
Packit 2d622a
	/* Find symbol and string tables */
Packit 2d622a
	ret = find_tables(dyntab, &symtab, &strtab);
Packit 2d622a
	if (ret < 0)
Packit 2d622a
		goto bail;
Packit 2d622a
Packit 2d622a
	numsyms = find_numsyms(symtab, strtab);
Packit 2d622a
	if (numsyms < 0)
Packit 2d622a
		goto bail;
Packit 2d622a
Packit 2d622a
	/*
Packit 2d622a
	 * We must ensure any returns done hereafter have sane start and end
Packit 2d622a
	 * values, as the criss-cross apple sauce algorithm is beginning
Packit 2d622a
	 */
Packit 2d622a
	end = start;
Packit 2d622a
Packit 2d622a
	for (sym = symtab; sym < symtab + numsyms; sym++) {
Packit 2d622a
		if (!keep_symbol(strtab, sym, start, end_orig))
Packit 2d622a
			continue;
Packit 2d622a
Packit 2d622a
		/* These are the droids we are looking for */
Packit 2d622a
		found_sym = 1;
Packit 2d622a
		sym_end = (void *)(sym->st_value + sym->st_size);
Packit 2d622a
		if (sym_end > end)
Packit 2d622a
			end = sym_end;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	/*
Packit 2d622a
	 * Some platforms (PowerPC 64bit ELF) place their PLT beyond the filesz
Packit 2d622a
	 * part of the data segment.  When this is the case, we must extend the
Packit 2d622a
	 * copy window to include this data which has been initialized by the
Packit 2d622a
	 * run-time linker.
Packit 2d622a
	 */
Packit 2d622a
	plt_end = start + plt_extrasz(dyntab);
Packit 2d622a
	if (plt_end > end) {
Packit 2d622a
		end = plt_end;
Packit 2d622a
		found_sym = 1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (__hugetlbfs_debug)
Packit 2d622a
		check_bss(end, end_orig);
Packit 2d622a
Packit 2d622a
	if (found_sym) {
Packit 2d622a
		seg->extrasz = end - start;
Packit 2d622a
	}
Packit 2d622a
	/*
Packit 2d622a
	 * else no need to copy anything, so leave seg->extrasz as zero
Packit 2d622a
	 */
Packit 2d622a
	return;
Packit 2d622a
Packit 2d622a
bail:
Packit 2d622a
	DEBUG("Unable to perform minimal copy\n");
Packit 2d622a
bail2:
Packit 2d622a
	seg->extrasz = end_orig - start;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/*
 * PowerPC slice geometry: addresses below SLICE_LOW_TOP are divided
 * into slices of SLICE_LOW_SIZE bytes, the 64-bit code below uses
 * SLICE_HIGH_SIZE above that.  The *_SHIFT values are not defined in
 * this part of the file.
 */
#if defined(__powerpc64__) || (defined(__powerpc__) && !defined(PPC_NO_SEGMENTS))
#define SLICE_LOW_TOP		(0x100000000UL)
#define SLICE_LOW_SIZE		(1UL << SLICE_LOW_SHIFT)
#define SLICE_HIGH_SIZE		(1UL << SLICE_HIGH_SHIFT)
#endif
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * Return the address of the start and end of the hugetlb slice
Packit 2d622a
 * containing @addr. A slice is a range of addresses, start inclusive
Packit 2d622a
 * and end exclusive.
Packit 2d622a
 * Note, that since relinking is not supported on ia64, we can leave it
Packit 2d622a
 * out here.
Packit 2d622a
 */
Packit 2d622a
/*
 * Start address of the slice containing @addr.  Without architecture
 * slice support a "slice" is simply a huge page.
 */
static unsigned long hugetlb_slice_start(unsigned long addr)
{
	if (!arch_has_slice_support()) {
		return ALIGN_DOWN(addr, gethugepagesize());
	}

#if defined(__powerpc64__)
	if (addr < SLICE_LOW_TOP)
		return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
	else if (addr < SLICE_HIGH_SIZE)
		/* the remainder of the first high slice starts here */
		return SLICE_LOW_TOP;
	else
		return ALIGN_DOWN(addr, SLICE_HIGH_SIZE);
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
#else
	/*
	 * No slice geometry is known for this build; never fall off the
	 * end of a non-void function, use huge page granularity instead.
	 */
	return ALIGN_DOWN(addr, gethugepagesize());
#endif
}
Packit 2d622a
Packit 2d622a
/*
 * Last address of a slice, computed as ALIGN_UP(@addr, slice size) - 1.
 * Without architecture slice support a "slice" is simply a huge page.
 *
 * NOTE(review): assuming ALIGN_UP() leaves an already aligned value
 * unchanged, an @addr sitting exactly on a boundary yields @addr - 1
 * (the end of the preceding slice) - confirm callers expect that.
 */
static unsigned long hugetlb_slice_end(unsigned long addr)
{
	if (!arch_has_slice_support()) {
		return ALIGN_UP(addr, gethugepagesize()) - 1;
	}

#if defined(__powerpc64__)
	if (addr < SLICE_LOW_TOP)
		return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
	else
		return ALIGN_UP(addr, SLICE_HIGH_SIZE) - 1;
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
#else
	/*
	 * No slice geometry is known for this build; never fall off the
	 * end of a non-void function, use huge page granularity instead.
	 */
	return ALIGN_UP(addr, gethugepagesize()) - 1;
#endif
}
Packit 2d622a
Packit 2d622a
/* First address after the slice end reported by hugetlb_slice_end(). */
static unsigned long hugetlb_next_slice_start(unsigned long addr)
{
	unsigned long last = hugetlb_slice_end(addr);

	return last + 1;
}
Packit 2d622a
Packit 2d622a
/* Last address before the slice start reported by hugetlb_slice_start(). */
static unsigned long hugetlb_prev_slice_end(unsigned long addr)
{
	unsigned long first = hugetlb_slice_start(addr);

	return first - 1;
}
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * Store a copy of the given program header
Packit 2d622a
 */
Packit ed3d6b
static int save_phdr(int table_idx, int phnum, const ElfW(Addr) addr,
Packit ed3d6b
		     const ElfW(Phdr) *phdr)
Packit 2d622a
{
Packit 2d622a
	int prot = 0;
Packit 2d622a
Packit 2d622a
	if (table_idx >= MAX_HTLB_SEGS) {
Packit 2d622a
		WARNING("Executable has too many segments (max %d)\n",
Packit 2d622a
			MAX_HTLB_SEGS);
Packit 2d622a
		htlb_num_segs = 0;
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (phdr->p_flags & PF_R)
Packit 2d622a
		prot |= PROT_READ;
Packit 2d622a
	if (phdr->p_flags & PF_W)
Packit 2d622a
		prot |= PROT_WRITE;
Packit 2d622a
	if (phdr->p_flags & PF_X)
Packit 2d622a
		prot |= PROT_EXEC;
Packit 2d622a
Packit ed3d6b
	htlb_seg_table[table_idx].vaddr = (void *)(addr + phdr->p_vaddr);
Packit 2d622a
	htlb_seg_table[table_idx].filesz = phdr->p_filesz;
Packit 2d622a
	htlb_seg_table[table_idx].memsz = phdr->p_memsz;
Packit 2d622a
	htlb_seg_table[table_idx].prot = prot;
Packit 2d622a
	htlb_seg_table[table_idx].index = phnum;
Packit 2d622a
Packit 2d622a
	INFO("Segment %d (phdr %d): %#0lx-%#0lx  (filesz=%#0lx) "
Packit 2d622a
		"(prot = %#0x)\n", table_idx, phnum,
Packit ed3d6b
		(unsigned long) addr + phdr->p_vaddr,
Packit ed3d6b
		(unsigned long) addr + phdr->p_vaddr + phdr->p_memsz,
Packit 2d622a
		(unsigned long) phdr->p_filesz, (unsigned int) prot);
Packit 2d622a
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
static int verify_segment_layout(struct seg_layout *segs, int num_segs)
Packit 2d622a
{
Packit 2d622a
	int i;
Packit 2d622a
	long base_size = getpagesize();
Packit 2d622a
Packit 2d622a
	for (i = 1; i < num_segs; i++) {
Packit 2d622a
		unsigned long prev_end = segs[i - 1].end;
Packit 2d622a
		unsigned long start = segs[i].start;
Packit 2d622a
Packit 2d622a
		/*
Packit 2d622a
		 * Do not worry about the boundary between segments that will
Packit 2d622a
		 * not be remapped.
Packit 2d622a
		 */
Packit 2d622a
		if (segs[i - 1].page_size == base_size &&
Packit 2d622a
				segs[i].page_size == base_size)
Packit 2d622a
			continue;
Packit 2d622a
Packit 2d622a
		/* Make sure alignment hasn't caused segments to overlap */
Packit 2d622a
		if (prev_end > start) {
Packit 2d622a
			WARNING("Layout problem with segments %i and %i:\n\t"
Packit 2d622a
				"Segments would overlap\n", i - 1, i);
Packit 2d622a
			return 1;
Packit 2d622a
		}
Packit 2d622a
Packit 2d622a
		/* Make sure page size transitions occur on slice boundaries */
Packit 2d622a
		if ((segs[i - 1].page_size != segs[i].page_size) &&
Packit 2d622a
				hugetlb_slice_end(prev_end) >
Packit 2d622a
				hugetlb_slice_start(start)) {
Packit 2d622a
			WARNING("Layout problem with segments %i and %i:\n\t"
Packit 2d622a
				"Only one page size per slice\n", i - 1, i);
Packit 2d622a
			return 1;
Packit 2d622a
		}
Packit 2d622a
	}
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
static long segment_requested_page_size(const ElfW(Phdr) *phdr)
Packit 2d622a
{
Packit 2d622a
	int writable = phdr->p_flags & PF_W;
Packit 2d622a
Packit 2d622a
	/* Check if a page size was requested by the user */
Packit 2d622a
	if (writable && hpage_writable_size)
Packit 2d622a
		return hpage_writable_size;
Packit 2d622a
	if (!writable && hpage_readonly_size)
Packit 2d622a
		return hpage_readonly_size;
Packit 2d622a
Packit 2d622a
	/* Check if this segment requests remapping by default */
Packit 2d622a
	if (!hpage_readonly_size && !hpage_writable_size &&
Packit 2d622a
			(phdr->p_flags & PF_LINUX_HUGETLB))
Packit 2d622a
		return gethugepagesize();
Packit 2d622a
Packit 2d622a
	/* No remapping selected, return the base page size */
Packit 2d622a
	return getpagesize();
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
static
Packit 2d622a
int parse_elf_normal(struct dl_phdr_info *info, size_t size, void *data)
Packit 2d622a
{
Packit 2d622a
	int i, num_segs;
Packit 2d622a
	unsigned long page_size, seg_psize, start, end;
Packit 2d622a
	struct seg_layout segments[MAX_SEGS];
Packit 2d622a
Packit 2d622a
	page_size = getpagesize();
Packit 2d622a
	num_segs = 0;
Packit 2d622a
Packit 2d622a
	for (i = 0; i < info->dlpi_phnum; i++) {
Packit 2d622a
		if (info->dlpi_phdr[i].p_type != PT_LOAD)
Packit 2d622a
			continue;
Packit 2d622a
Packit 2d622a
		if (i >= MAX_SEGS) {
Packit 2d622a
			WARNING("Maximum number of PT_LOAD segments"
Packit 2d622a
					"exceeded\n");
Packit 2d622a
			return 1;
Packit 2d622a
		}
Packit 2d622a
Packit 2d622a
		seg_psize = segment_requested_page_size(&info->dlpi_phdr[i]);
Packit 2d622a
		if (seg_psize != page_size) {
Packit ed3d6b
			if (save_phdr(htlb_num_segs, i, info->dlpi_addr,
Packit ed3d6b
				      &info->dlpi_phdr[i]))
Packit 2d622a
				return 1;
Packit 2d622a
			get_extracopy(&htlb_seg_table[htlb_num_segs],
Packit ed3d6b
				      info->dlpi_addr, info->dlpi_phdr,
Packit ed3d6b
				      info->dlpi_phnum);
Packit 2d622a
			htlb_seg_table[htlb_num_segs].page_size = seg_psize;
Packit 2d622a
			htlb_num_segs++;
Packit 2d622a
		}
Packit ed3d6b
		start = ALIGN_DOWN(info->dlpi_addr +
Packit ed3d6b
				   info->dlpi_phdr[i].p_vaddr, seg_psize);
Packit ed3d6b
		end = ALIGN(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr +
Packit ed3d6b
			    info->dlpi_phdr[i].p_memsz, seg_psize);
Packit 2d622a
Packit 2d622a
		segments[num_segs].page_size = seg_psize;
Packit 2d622a
		segments[num_segs].start = start;
Packit 2d622a
		segments[num_segs].end = end;
Packit 2d622a
		num_segs++;
Packit 2d622a
	}
Packit 2d622a
	if (verify_segment_layout(segments, num_segs))
Packit 2d622a
		htlb_num_segs = 0;
Packit 2d622a
Packit 2d622a
	if (__hugetlbfs_debug)
Packit 2d622a
		check_memsz();
Packit 2d622a
Packit 2d622a
	return 1;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * Parse the phdrs of a normal program to attempt partial segment remapping
Packit 2d622a
 */
Packit 2d622a
static
Packit 2d622a
int parse_elf_partial(struct dl_phdr_info *info, size_t size, void *data)
Packit 2d622a
{
Packit 2d622a
	unsigned long vaddr, memsz, gap;
Packit 2d622a
	unsigned long slice_end;
Packit 2d622a
	int i;
Packit 2d622a
Packit 2d622a
	/* This should never actually be called more than once in an
Packit 2d622a
	 * iteration: we assume that dl_iterate_phdrs() always gives
Packit 2d622a
	 * us the main program's phdrs on the first iteration, and
Packit 2d622a
	 * always return 1 to cease iteration at that point. */
Packit 2d622a
Packit 2d622a
	for (i = 0; i < info->dlpi_phnum; i++) {
Packit 2d622a
		if (info->dlpi_phdr[i].p_type != PT_LOAD)
Packit 2d622a
			continue;
Packit 2d622a
Packit 2d622a
		/*
Packit 2d622a
		 * Partial segment remapping only makes sense if the
Packit 2d622a
		 * memory size of the segment is larger than the
Packit 2d622a
		 * granularity at which hugepages can be used. This
Packit 2d622a
		 * mostly affects ppc, where the segment must be larger
Packit 2d622a
		 * than 256M. This guarantees that remapping the binary
Packit 2d622a
		 * in this forced way won't violate any contiguity
Packit 2d622a
		 * constraints.
Packit 2d622a
		 */
Packit ed3d6b
		vaddr = hugetlb_next_slice_start(info->dlpi_addr +
Packit ed3d6b
						 info->dlpi_phdr[i].p_vaddr);
Packit ed3d6b
		gap = vaddr - (info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
Packit 2d622a
		slice_end = hugetlb_slice_end(vaddr);
Packit 2d622a
		/*
Packit 2d622a
		 * we should stop remapping just before the slice
Packit 2d622a
		 * containing the end of the memsz portion (taking away
Packit 2d622a
		 * the gap of the memsz)
Packit 2d622a
		 */
Packit 2d622a
		memsz = info->dlpi_phdr[i].p_memsz;
Packit 2d622a
		if (memsz < gap) {
Packit 2d622a
			INFO("Segment %d's unaligned memsz is too small: "
Packit 2d622a
					"%#0lx < %#0lx\n",
Packit 2d622a
					i, memsz, gap);
Packit 2d622a
			continue;
Packit 2d622a
		}
Packit 2d622a
		memsz -= gap;
Packit 2d622a
		if (memsz < (slice_end - vaddr)) {
Packit 2d622a
			INFO("Segment %d's aligned memsz is too small: "
Packit 2d622a
					"%#0lx < %#0lx\n",
Packit 2d622a
					i, memsz, slice_end - vaddr);
Packit 2d622a
			continue;
Packit 2d622a
		}
Packit 2d622a
		memsz = hugetlb_prev_slice_end(vaddr + memsz) - vaddr;
Packit 2d622a
Packit ed3d6b
		if (save_phdr(htlb_num_segs, i, info->dlpi_addr,
Packit ed3d6b
			      &info->dlpi_phdr[i]))
Packit 2d622a
			return 1;
Packit 2d622a
Packit 2d622a
		/*
Packit 2d622a
		 * When remapping partial segments, we create a sub-segment
Packit 2d622a
		 * that is based on the original.  For this reason, we must
Packit 2d622a
		 * make some changes to the phdr captured by save_phdr():
Packit 2d622a
		 * 	vaddr is aligned upwards to a slice boundary
Packit 2d622a
		 * 	memsz is aligned downwards to a slice boundary
Packit 2d622a
		 * 	filesz is set to memsz to force all memory to be copied
Packit 2d622a
		 */
Packit 2d622a
		htlb_seg_table[htlb_num_segs].vaddr = (void *)vaddr;
Packit 2d622a
		htlb_seg_table[htlb_num_segs].filesz = memsz;
Packit 2d622a
		htlb_seg_table[htlb_num_segs].memsz = memsz;
Packit 2d622a
Packit 2d622a
		htlb_num_segs++;
Packit 2d622a
	}
Packit 2d622a
	return 1;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * Verify that a range of memory is unoccupied and usable
Packit 2d622a
 */
Packit 2d622a
static void check_range_empty(void *addr, unsigned long len)
Packit 2d622a
{
Packit 2d622a
	void *p;
Packit 2d622a
Packit 2d622a
	p = mmap(addr, len, PROT_READ, MAP_PRIVATE|MAP_ANON, 0, 0);
Packit 2d622a
	if (p != addr) {
Packit 2d622a
		WARNING("Unable to verify address range %p - %p.  Not empty?\n",
Packit 2d622a
				addr, addr + len);
Packit 2d622a
		if (__hugetlbfs_debug)
Packit 2d622a
			dump_proc_pid_maps();
Packit 2d622a
	}
Packit 2d622a
	if (p != MAP_FAILED)
Packit 2d622a
		munmap(p, len);
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * Copy a program segment into a huge page. If possible, try to copy the
Packit 2d622a
 * smallest amount of data possible, unless the user disables this
Packit 2d622a
 * optimization via the HUGETLB_ELFMAP environment variable.
Packit 2d622a
 */
Packit 2d622a
static int prepare_segment(struct seg_info *seg)
Packit 2d622a
{
Packit 2d622a
	void *start, *p, *end, *new_end;
Packit 2d622a
	unsigned long size, offset;
Packit 2d622a
	long page_size = getpagesize();
Packit 2d622a
	long hpage_size;
Packit 2d622a
	int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0;
Packit 2d622a
Packit 2d622a
	hpage_size = seg->page_size;
Packit 2d622a
Packit 2d622a
	/*
Packit 2d622a
	 * mmaps must begin at an address aligned to the page size.  If the
Packit 2d622a
	 * vaddr of this segment is not hpage_size aligned, align it downward
Packit 2d622a
	 * and begin the mmap there.  Note the offset so we can copy data to
Packit 2d622a
	 * the correct starting address within the temporary mmap.
Packit 2d622a
	 */
Packit 2d622a
	start = (void *) ALIGN_DOWN((unsigned long)seg->vaddr, hpage_size);
Packit 2d622a
	offset = seg->vaddr - start;
Packit 2d622a
Packit 2d622a
	/*
Packit 2d622a
	 * Calculate the size of the temporary mapping we must create.
Packit 2d622a
	 * This includes the offset (described above) and the filesz and
Packit 2d622a
	 * extrasz portions of the segment (described below).  We must align
Packit 2d622a
	 * this total to the huge page size so it will be valid for mmap.
Packit 2d622a
	 */
Packit 2d622a
	size = ALIGN(offset + seg->filesz + seg->extrasz, hpage_size);
Packit 2d622a
Packit 2d622a
	/*
Packit 2d622a
	 * If the segment's start or end addresses have been adjusted to align
Packit 2d622a
	 * them to the hpage_size, check to make sure nothing is mapped in the
Packit 2d622a
	 * padding before and after the segment.
Packit 2d622a
	 */
Packit 2d622a
	end = (void *) ALIGN((unsigned long)seg->vaddr + seg->memsz, page_size);
Packit 2d622a
	new_end = (void *) ALIGN((unsigned long)end, hpage_size);
Packit 2d622a
	if (ALIGN_DOWN(offset, page_size))
Packit 2d622a
		check_range_empty(start, ALIGN_DOWN(offset, page_size));
Packit 2d622a
	if (end != new_end)
Packit 2d622a
		check_range_empty(end, new_end - end);
Packit 2d622a
Packit 2d622a
	/* Create the temporary huge page mmap */
Packit 2d622a
	p = mmap(NULL, size, PROT_READ|PROT_WRITE,
Packit 2d622a
				MAP_SHARED|mmap_reserve, seg->fd, 0);
Packit 2d622a
	if (p == MAP_FAILED) {
Packit 2d622a
		WARNING("Couldn't map hugepage segment to copy data: %s\n",
Packit 2d622a
			strerror(errno));
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	/*
Packit 2d622a
	 * Minimizing the amount of data copied will maximize performance.
Packit 2d622a
	 * By definition, the filesz portion of the segment contains
Packit 2d622a
	 * initialized data and must be copied.  If part of the memsz portion
Packit 2d622a
	 * is known to be initialized already, extrasz will be non-zero and
Packit 2d622a
	 * that many addtional bytes will be copied from the beginning of the
Packit 2d622a
	 * memsz region.  The rest of the memsz is understood to be zeroes and
Packit 2d622a
	 * need not be copied.
Packit 2d622a
	 */
Packit 2d622a
	INFO("Mapped hugeseg at %p. Copying %#0lx bytes and %#0lx extra bytes"
Packit 2d622a
		" from %p...", p, seg->filesz, seg->extrasz, seg->vaddr);
Packit 2d622a
	memcpy(p + offset, seg->vaddr, seg->filesz + seg->extrasz);
Packit 2d622a
	INFO_CONT("done\n");
Packit 2d622a
Packit 2d622a
	munmap(p, size);
Packit 2d622a
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/*
 * [PPC] Prior to 2.6.22 (which added slices), our temporary hugepage
 * mappings are placed in the segment before the stack. This 'taints' that
 * segment to be hugepage-only for the lifetime of the process, resulting
 * in a maximum stack size of 256MB. If we instead create our hugepage
 * mappings in a child process, we can avoid this problem.
 *
 * This does not adversely affect non-PPC platforms so do it everywhere.
 *
 * @htlb_seg_info: segment to prepare; its fd must already refer to the
 *                 hugetlbfs file that is to receive the data
 *
 * Returns 0 if the child ran prepare_segment() and exited with status 0.
 * Returns -1 if fork() or waitpid() fails, if the child exits with a
 * non-zero status, or if the child does not exit normally at all.
 */
static int fork_and_prepare_segment(struct seg_info *htlb_seg_info)
{
	pid_t pid, waited;
	int status;

	pid = fork();
	if (pid < 0) {
		WARNING("fork failed: %s\n", strerror(errno));
		return -1;
	}

	if (pid == 0) {
		/* Child: do the copy and report the outcome via exit status */
		if (prepare_segment(htlb_seg_info) < 0) {
			WARNING("Failed to prepare segment\n");
			exit(1);
		}
		exit(0);
	}

	/* Parent: an interrupted wait is not a failure, so retry it */
	do {
		waited = waitpid(pid, &status, 0);
	} while (waited == -1 && errno == EINTR);
	if (waited == -1) {
		WARNING("waitpid failed: %s\n", strerror(errno));
		return -1;
	}

	/*
	 * WEXITSTATUS() is only meaningful when the child exited normally.
	 * A child killed by a signal would otherwise read back as status 0
	 * and be mistaken for success.
	 */
	if (!WIFEXITED(status)) {
		if (WIFSIGNALED(status))
			WARNING("Segment preparation was killed by signal %d\n",
				WTERMSIG(status));
		return -1;
	}
	if (WEXITSTATUS(status) != 0)
		return -1;

	INFO("Prepare succeeded\n");
	return 0;
}
Packit 2d622a
Packit 2d622a
/**
Packit 2d622a
 * find_or_prepare_shared_file - get one shareable file
Packit 2d622a
 * @htlb_seg_info: pointer to program's segment data
Packit 2d622a
 *
Packit 2d622a
 * This function either locates a hugetlbfs file already containing
Packit 2d622a
 * data for a given program segment, or creates one if it doesn't
Packit 2d622a
 * already exist.
Packit 2d622a
 *
Packit 2d622a
 * We use the following algorithm to ensure that when processes race
Packit 2d622a
 * to instantiate the hugepage file, we will never obtain an
Packit 2d622a
 * incompletely prepared file or have multiple processes prepar
Packit 2d622a
 * separate copies of the file.
Packit 2d622a
 *	- first open 'filename.tmp' with O_EXCL (this acts as a lockfile)
Packit 2d622a
 *	- second open 'filename' with O_RDONLY (even if the first open
Packit 2d622a
 *	  succeeded).
Packit 2d622a
 * Then:
Packit 2d622a
 * 	- If both opens succeed, close the O_EXCL open, unlink
Packit 2d622a
 * filename.tmp and use the O_RDONLY fd.  (Somebody else has prepared
Packit 2d622a
 * the file already)
Packit 2d622a
 * 	- If only the O_RDONLY open suceeds, and the O_EXCL open
Packit 2d622a
 * fails with EEXIST, just used the O_RDONLY fd. (Somebody else has
Packit 2d622a
 * prepared the file already, but we raced with their rename()).
Packit 2d622a
 * 	- If only the O_EXCL open suceeds, and the O_RDONLY fails with
Packit 2d622a
 * ENOENT, prepare the the O_EXCL open, then rename() filename.tmp to
Packit 2d622a
 * filename. (We're the first in, we have to prepare the file).
Packit 2d622a
 * 	- If both opens fail, with EEXIST and ENOENT, respectively,
Packit 2d622a
 * wait for a little while, then try again from the beginning
Packit 2d622a
 * (Somebody else is preparing the file, but hasn't finished yet)
Packit 2d622a
 *
Packit 2d622a
 * returns:
Packit 2d622a
 *   -1, on failure
Packit 2d622a
 *   0, on success
Packit 2d622a
 */
Packit 2d622a
static int find_or_prepare_shared_file(struct seg_info *htlb_seg_info)
Packit 2d622a
{
Packit 2d622a
	int fdx = -1, fds;
Packit 2d622a
	int errnox, errnos;
Packit 2d622a
	int ret;
Packit 2d622a
	int i;
Packit 2d622a
	char final_path[PATH_MAX+1];
Packit 2d622a
	char tmp_path[PATH_MAX+1];
Packit 2d622a
Packit 2d622a
	ret = get_shared_file_name(htlb_seg_info, final_path);
Packit 2d622a
	if (ret < 0)
Packit 2d622a
		return -1;
Packit 2d622a
	assemble_path(tmp_path, "%s.tmp", final_path);
Packit 2d622a
Packit 2d622a
	for (i = 0; i < SHARED_TIMEOUT; i++) {
Packit 2d622a
		/* NB: mode is modified by umask */
Packit 2d622a
		fdx = open(tmp_path, O_CREAT | O_EXCL | O_RDWR, 0666);
Packit 2d622a
		errnox = errno;
Packit 2d622a
		fds = open(final_path, O_RDONLY);
Packit 2d622a
		errnos = errno;
Packit 2d622a
Packit 2d622a
		if (fds >= 0) {
Packit 2d622a
			/* Got an already-prepared file -> use it */
Packit 2d622a
			if (fdx > 0) {
Packit 2d622a
				/* Also got an exclusive file -> clean up */
Packit 2d622a
				ret = unlink(tmp_path);
Packit 2d622a
				if (ret != 0)
Packit 2d622a
					WARNING("shared_file: unable to clean "
Packit 2d622a
					      "up unneeded file %s: %s\n",
Packit 2d622a
					      tmp_path, strerror(errno));
Packit 2d622a
				close(fdx);
Packit 2d622a
			} else if (errnox != EEXIST) {
Packit 2d622a
				WARNING("shared_file: Unexpected failure on exclusive"
Packit 2d622a
					" open of %s: %s\n", tmp_path,
Packit 2d622a
					strerror(errnox));
Packit 2d622a
			}
Packit 2d622a
			htlb_seg_info->fd = fds;
Packit 2d622a
			return 0;
Packit 2d622a
		}
Packit 2d622a
Packit 2d622a
		if (fdx >= 0) {
Packit 2d622a
			/* It's our job to prepare */
Packit 2d622a
			if (errnos != ENOENT)
Packit 2d622a
				WARNING("shared_file: Unexpected failure on"
Packit 2d622a
					" shared open of %s: %s\n", final_path,
Packit 2d622a
					strerror(errnos));
Packit 2d622a
Packit 2d622a
			htlb_seg_info->fd = fdx;
Packit 2d622a
Packit 2d622a
			INFO("Got unpopulated shared fd -- Preparing\n");
Packit 2d622a
			ret = fork_and_prepare_segment(htlb_seg_info);
Packit 2d622a
			if (ret < 0)
Packit 2d622a
				goto fail;
Packit 2d622a
Packit 2d622a
			INFO("Prepare succeeded\n");
Packit 2d622a
			/* move to permanent location */
Packit 2d622a
			ret = rename(tmp_path, final_path);
Packit 2d622a
			if (ret != 0) {
Packit 2d622a
				WARNING("shared_file: unable to rename %s"
Packit 2d622a
				      " to %s: %s\n", tmp_path, final_path,
Packit 2d622a
				      strerror(errno));
Packit 2d622a
				goto fail;
Packit 2d622a
			}
Packit 2d622a
Packit 2d622a
			return 0;
Packit 2d622a
		}
Packit 2d622a
Packit 2d622a
		/* Both opens failed, somebody else is still preparing */
Packit 2d622a
		/* Wait and try again */
Packit 2d622a
		sleep(1);
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
 fail:
Packit 2d622a
	if (fdx > 0) {
Packit 2d622a
		ret = unlink(tmp_path);
Packit 2d622a
		if (ret != 0)
Packit 2d622a
			WARNING("shared_file: Unable to clean up temp file %s "
Packit 2d622a
			      "on failure: %s\n", tmp_path, strerror(errno));
Packit 2d622a
		close(fdx);
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	return -1;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/**
Packit 2d622a
 * obtain_prepared_file - multiplex callers depending on if
Packit 2d622a
 * sharing or not
Packit 2d622a
 * @htlb_seg_info: pointer to program's segment data
Packit 2d622a
 *
Packit 2d622a
 * returns:
Packit 2d622a
 *  -1, on error
Packit 2d622a
 *  0, on success
Packit 2d622a
 */
Packit 2d622a
static int obtain_prepared_file(struct seg_info *htlb_seg_info)
Packit 2d622a
{
Packit 2d622a
	int fd = -1;
Packit 2d622a
	int ret;
Packit 2d622a
	long hpage_size = htlb_seg_info->page_size;
Packit 2d622a
Packit 2d622a
	/* Share only read-only segments */
Packit 2d622a
	if (__hugetlb_opts.sharing && !(htlb_seg_info->prot & PROT_WRITE)) {
Packit 2d622a
		/* first, try to share */
Packit 2d622a
		ret = find_or_prepare_shared_file(htlb_seg_info);
Packit 2d622a
		if (ret == 0)
Packit 2d622a
			return 0;
Packit 2d622a
		/* but, fall through to unlinked files, if sharing fails */
Packit 2d622a
		WARNING("Falling back to unlinked files\n");
Packit 2d622a
	}
Packit 2d622a
	fd = hugetlbfs_unlinked_fd_for_size(hpage_size);
Packit 2d622a
	if (fd < 0)
Packit 2d622a
		return -1;
Packit 2d622a
	htlb_seg_info->fd = fd;
Packit 2d622a
Packit 2d622a
	return fork_and_prepare_segment(htlb_seg_info);
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
static void remap_segments(struct seg_info *seg, int num)
Packit 2d622a
{
Packit 2d622a
	int i;
Packit 2d622a
	void *p;
Packit 2d622a
	unsigned long start, offset, mapsize;
Packit 2d622a
	long page_size = getpagesize();
Packit 2d622a
	long hpage_size;
Packit 2d622a
	int mmap_flags;
Packit 2d622a
Packit 2d622a
	/*
Packit 2d622a
	 * XXX: The bogus call to mmap below forces ld.so to resolve the
Packit 2d622a
	 * mmap symbol before we unmap the plt in the data segment
Packit 2d622a
	 * below.  This might only be needed in the case where sharing
Packit 2d622a
	 * is enabled and the hugetlbfs files have already been prepared
Packit 2d622a
	 * by another process.
Packit 2d622a
	 */
Packit 2d622a
	 p = mmap(0, 0, 0, 0, 0, 0);
Packit 2d622a
Packit 2d622a
	/* This is the hairy bit, between unmap and remap we enter a
Packit 2d622a
	 * black hole.  We can't call anything which uses static data
Packit 2d622a
	 * (ie. essentially any library function...)
Packit 2d622a
	 */
Packit 2d622a
	for (i = 0; i < num; i++) {
Packit 2d622a
		start = ALIGN_DOWN((unsigned long)seg[i].vaddr, page_size);
Packit 2d622a
		offset = (unsigned long)(seg[i].vaddr - start);
Packit 2d622a
		mapsize = ALIGN(offset + seg[i].memsz, page_size);
Packit 2d622a
		munmap((void *) start, mapsize);
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	/* Step 4.  Rebuild the address space with hugetlb mappings */
Packit 2d622a
	/* NB: we can't do the remap as hugepages within the main loop
Packit 2d622a
	 * because of PowerPC: we may need to unmap all the normal
Packit 2d622a
	 * segments before the MMU segment is ok for hugepages */
Packit 2d622a
	for (i = 0; i < num; i++) {
Packit 2d622a
		hpage_size = seg[i].page_size;
Packit 2d622a
		start = ALIGN_DOWN((unsigned long)seg[i].vaddr, hpage_size);
Packit 2d622a
		offset = (unsigned long)(seg[i].vaddr - start);
Packit 2d622a
		mapsize = ALIGN(offset + seg[i].memsz, hpage_size);
Packit 2d622a
		mmap_flags = MAP_PRIVATE|MAP_FIXED;
Packit 2d622a
Packit 2d622a
		/* If requested, make no reservations */
Packit 2d622a
		if (__hugetlb_opts.no_reserve)
Packit 2d622a
			mmap_flags |= MAP_NORESERVE;
Packit 2d622a
Packit 2d622a
		/*
Packit 2d622a
		 * If this is a read-only mapping whose contents are
Packit 2d622a
		 * entirely contained within the file, then use MAP_NORESERVE.
Packit 2d622a
		 * The assumption is that the pages already exist in the
Packit 2d622a
		 * page cache for the hugetlbfs file since it was prepared
Packit 2d622a
		 * earlier and that mprotect() will not be called which would
Packit 2d622a
		 * require a COW
Packit 2d622a
		 */
Packit 2d622a
		if (!(seg[i].prot & PROT_WRITE) &&
Packit 2d622a
				seg[i].filesz == seg[i].memsz)
Packit 2d622a
			mmap_flags |= MAP_NORESERVE;
Packit 2d622a
Packit 2d622a
		p = mmap((void *) start, mapsize, seg[i].prot,
Packit 2d622a
			 mmap_flags, seg[i].fd, 0);
Packit 2d622a
		if (p == MAP_FAILED)
Packit 2d622a
			unmapped_abort("Failed to map hugepage segment %u: "
Packit 2d622a
					"%p-%p (errno=%u)\n", i, start,
Packit 2d622a
					start + mapsize, errno);
Packit 2d622a
		if (p != (void *) start)
Packit 2d622a
			unmapped_abort("Mapped hugepage segment %u (%p-%p) at "
Packit 2d622a
				       "wrong address %p\n", i, seg[i].vaddr,
Packit 2d622a
				       seg[i].vaddr+mapsize, p);
Packit 2d622a
	}
Packit 2d622a
	/* The segments are all back at this point.
Packit 2d622a
	 * and it should be safe to reference static data
Packit 2d622a
	 */
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
static int set_hpage_sizes(const char *env)
Packit 2d622a
{
Packit 2d622a
	char *pos;
Packit 2d622a
	long size;
Packit 2d622a
	char *key;
Packit 2d622a
	char keys[5] = { "R\0" "W\0" "\0" };
Packit 2d622a
Packit 2d622a
	/* For each key in R,W */
Packit 2d622a
	for (key = keys; *key != '\0'; key += 2) {
Packit 2d622a
		pos = strcasestr(env, key);
Packit 2d622a
		if (!pos)
Packit 2d622a
			continue;
Packit 2d622a
Packit 2d622a
		if (*(++pos) == '=') {
Packit 2d622a
			size = parse_page_size(pos + 1);
Packit 2d622a
			if (size == -1)
Packit 2d622a
				return size;
Packit 2d622a
		} else
Packit 2d622a
			size = gethugepagesize();
Packit 2d622a
Packit 2d622a
		if (size <= 0) {
Packit 2d622a
			if (errno == ENOSYS)
Packit 2d622a
				WARNING("Hugepages unavailable\n");
Packit 2d622a
			else if (errno == EOVERFLOW)
Packit 2d622a
				WARNING("Hugepage size too large\n");
Packit 2d622a
			else
Packit 2d622a
				WARNING("Hugepage size (%s)\n",
Packit 2d622a
						strerror(errno));
Packit 2d622a
			size = 0;
Packit 2d622a
		} else if (!hugetlbfs_find_path_for_size(size)) {
Packit 2d622a
			WARNING("Hugepage size %li unavailable", size);
Packit 2d622a
			size = 0;
Packit 2d622a
		}
Packit 2d622a
Packit 2d622a
		if (*key == 'R')
Packit 2d622a
			hpage_readonly_size = size;
Packit 2d622a
		else
Packit 2d622a
			hpage_writable_size = size;
Packit 2d622a
	}
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
static int check_env(void)
Packit 2d622a
{
Packit 2d622a
	extern Elf_Ehdr __executable_start __attribute__((weak));
Packit 2d622a
Packit 2d622a
	if (__hugetlb_opts.elfmap &&
Packit 2d622a
		(strcasecmp(__hugetlb_opts.elfmap, "no") == 0)) {
Packit 2d622a
		INFO("HUGETLB_ELFMAP=%s, not attempting to remap program "
Packit 2d622a
		      "segments\n", __hugetlb_opts.elfmap);
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
	if (__hugetlb_opts.elfmap && set_hpage_sizes(__hugetlb_opts.elfmap)) {
Packit 2d622a
		WARNING("Cannot set elfmap page sizes: %s", strerror(errno));
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (__hugetlb_opts.ld_preload &&
Packit 2d622a
		strstr(__hugetlb_opts.ld_preload, "libhugetlbfs")) {
Packit 2d622a
		if (__hugetlb_opts.force_elfmap) {
Packit 2d622a
			force_remap = 1;
Packit 2d622a
			INFO("HUGETLB_FORCE_ELFMAP=yes, "
Packit 2d622a
					"enabling partial segment "
Packit 2d622a
					"remapping for non-relinked "
Packit 2d622a
					"binaries\n");
Packit 2d622a
			INFO("Disabling filesz copy optimization\n");
Packit 2d622a
			__hugetlb_opts.min_copy = false;
Packit 2d622a
		} else {
Packit 2d622a
			if (&__executable_start) {
Packit 2d622a
				WARNING("LD_PRELOAD is incompatible with "
Packit 2d622a
					"segment remapping\n");
Packit 2d622a
				WARNING("Segment remapping has been "
Packit 2d622a
					"DISABLED\n");
Packit 2d622a
				return -1;
Packit 2d622a
			}
Packit 2d622a
		}
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	if (__hugetlb_opts.sharing == 2) {
Packit 2d622a
		WARNING("HUGETLB_SHARE=%d, however sharing of writable\n"
Packit 2d622a
			"segments has been deprecated and is now disabled\n",
Packit 2d622a
			__hugetlb_opts.sharing);
Packit 2d622a
		__hugetlb_opts.sharing = 0;
Packit 2d622a
	} else {
Packit 2d622a
		INFO("HUGETLB_SHARE=%d, sharing ", __hugetlb_opts.sharing);
Packit 2d622a
		if (__hugetlb_opts.sharing == 1) {
Packit 2d622a
			INFO_CONT("enabled for only read-only segments\n");
Packit 2d622a
		} else {
Packit 2d622a
			INFO_CONT("disabled\n");
Packit 2d622a
			__hugetlb_opts.sharing = 0;
Packit 2d622a
		}
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	INFO("HUGETLB_NO_RESERVE=%s, reservations %s\n",
Packit 2d622a
			__hugetlb_opts.no_reserve ? "yes" : "no",
Packit 2d622a
			__hugetlb_opts.no_reserve ? "disabled" : "enabled");
Packit 2d622a
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
/*
Packit 2d622a
 * Parse an ELF header and record segment information for any segments
Packit 2d622a
 * which contain hugetlb information.
Packit 2d622a
 */
Packit 2d622a
static int parse_elf()
Packit 2d622a
{
Packit 2d622a
	if (force_remap)
Packit 2d622a
		dl_iterate_phdr(parse_elf_partial, NULL);
Packit 2d622a
	else
Packit 2d622a
		dl_iterate_phdr(parse_elf_normal, NULL);
Packit 2d622a
Packit 2d622a
	if (htlb_num_segs == 0) {
Packit 2d622a
		INFO("No segments were appropriate for remapping\n");
Packit 2d622a
		return -1;
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	return 0;
Packit 2d622a
}
Packit 2d622a
Packit 2d622a
void hugetlbfs_setup_elflink(void)
Packit 2d622a
{
Packit 2d622a
	int i, ret;
Packit 2d622a
Packit 2d622a
	if (check_env())
Packit 2d622a
		return;
Packit 2d622a
Packit 2d622a
	if (parse_elf())
Packit 2d622a
		return;
Packit 2d622a
Packit 2d622a
	INFO("libhugetlbfs version: %s\n", VERSION);
Packit 2d622a
Packit 2d622a
	/* Do we need to find a share directory */
Packit 2d622a
	if (__hugetlb_opts.sharing) {
Packit 2d622a
		/*
Packit 2d622a
		 * If HUGETLB_ELFMAP is undefined but a shareable segment has
Packit 2d622a
		 * PF_LINUX_HUGETLB set, segment remapping will occur using the
Packit 2d622a
		 * default huge page size.
Packit 2d622a
		 */
Packit 2d622a
		long page_size = hpage_readonly_size ?
Packit 2d622a
			hpage_readonly_size : gethugepagesize();
Packit 2d622a
Packit 2d622a
		ret = find_or_create_share_path(page_size);
Packit 2d622a
		if (ret != 0) {
Packit 2d622a
			WARNING("Segment remapping is disabled");
Packit 2d622a
			return;
Packit 2d622a
		}
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	/* Step 1.  Obtain hugepage files with our program data */
Packit 2d622a
	for (i = 0; i < htlb_num_segs; i++) {
Packit 2d622a
		ret = obtain_prepared_file(&htlb_seg_table[i]);
Packit 2d622a
		if (ret < 0) {
Packit 2d622a
			WARNING("Failed to setup hugetlbfs file for segment "
Packit 2d622a
					"%d\n", i);
Packit 2d622a
Packit 2d622a
			/* Close files we have already prepared */
Packit 2d622a
			for (i--; i >= 0; i--)
Packit 2d622a
				close(htlb_seg_table[i].fd);
Packit 2d622a
Packit 2d622a
			return;
Packit 2d622a
		}
Packit 2d622a
	}
Packit 2d622a
Packit 2d622a
	/* Step 3.  Unmap the old segments, map in the new ones */
Packit 2d622a
	remap_segments(htlb_seg_table, htlb_num_segs);
Packit 2d622a
}