|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* libhugetlbfs - Easy use of Linux hugepages
|
|
Packit |
2d622a |
* Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* This library is free software; you can redistribute it and/or
|
|
Packit |
2d622a |
* modify it under the terms of the GNU Lesser General Public License
|
|
Packit |
2d622a |
* as published by the Free Software Foundation; either version 2.1 of
|
|
Packit |
2d622a |
* the License, or (at your option) any later version.
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* This library is distributed in the hope that it will be useful, but
|
|
Packit |
2d622a |
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit |
2d622a |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Packit |
2d622a |
* Lesser General Public License for more details.
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* You should have received a copy of the GNU Lesser General Public
|
|
Packit |
2d622a |
* License along with this library; if not, write to the Free Software
|
|
Packit |
2d622a |
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
#define _GNU_SOURCE
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
#include <stdarg.h>
|
|
Packit |
2d622a |
#include <stdio.h>
|
|
Packit |
2d622a |
#include <stdlib.h>
|
|
Packit |
2d622a |
#include <link.h>
|
|
Packit |
2d622a |
#include <malloc.h>
|
|
Packit |
2d622a |
#include <string.h>
|
|
Packit |
2d622a |
#include <unistd.h>
|
|
Packit |
2d622a |
#include <fcntl.h>
|
|
Packit |
2d622a |
#include <signal.h>
|
|
Packit |
2d622a |
#include <sys/syscall.h>
|
|
Packit |
2d622a |
#include <sys/file.h>
|
|
Packit |
2d622a |
#include <linux/unistd.h>
|
|
Packit |
2d622a |
#include <sys/mman.h>
|
|
Packit |
2d622a |
#include <sys/wait.h>
|
|
Packit |
2d622a |
#include <sys/stat.h>
|
|
Packit |
2d622a |
#include <errno.h>
|
|
Packit |
2d622a |
#include <limits.h>
|
|
Packit |
2d622a |
#include <elf.h>
|
|
Packit |
2d622a |
#include <dlfcn.h>
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
#include "version.h"
|
|
Packit |
2d622a |
#include "hugetlbfs.h"
|
|
Packit |
2d622a |
#include "libhugetlbfs_internal.h"
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
 * Word-size independent aliases for the ELF structures and the symbol
 * st_info accessors used in this file.  Each branch uses the accessor
 * macros that belong to its own ELF class.
 */
#ifdef __LP64__
#define Elf_Ehdr	Elf64_Ehdr
#define Elf_Phdr	Elf64_Phdr
#define Elf_Dyn		Elf64_Dyn
#define Elf_Sym		Elf64_Sym
#define ELF_ST_BIND(x)	ELF64_ST_BIND(x)
#define ELF_ST_TYPE(x)	ELF64_ST_TYPE(x)
#else
#define Elf_Ehdr	Elf32_Ehdr
#define Elf_Phdr	Elf32_Phdr
#define Elf_Dyn		Elf32_Dyn
#define Elf_Sym		Elf32_Sym
#define ELF_ST_BIND(x)	ELF32_ST_BIND(x)
#define ELF_ST_TYPE(x)	ELF32_ST_TYPE(x)
#endif
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* SHARED_TIMEOUT is used by find_or_prepare_shared_file for when it
|
|
Packit |
2d622a |
* should timeout while waiting for other users to finish preparing
|
|
Packit |
2d622a |
* the file it wants. The value is the number of tries before giving
|
|
Packit |
2d622a |
* up with a 1 second wait between tries
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
#define SHARED_TIMEOUT 10
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* This function prints an error message to stderr, then aborts. It
|
|
Packit |
2d622a |
* is safe to call, even if the executable segments are presently
|
|
Packit |
2d622a |
* unmapped.
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* Arguments are printf() like, but at present supports only %u and %p
|
|
Packit |
2d622a |
* with no modifiers
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* FIXME: This works in practice, but I suspect it
|
|
Packit |
2d622a |
* is not guaranteed safe: the library functions we call could in
|
|
Packit |
2d622a |
* theory call other functions via the PLT which will blow up. */
|
|
Packit |
2d622a |
/* Write len bytes starting at start to stderr using direct_syscall(). */
static void write_err(const char *start, int len)
{
	const int stderr_fd = 2;

	direct_syscall(__NR_write, stderr_fd, start, len);
}
|
|
Packit |
2d622a |
/* Send SIGABRT to the calling process, going through direct_syscall() only. */
static void sys_abort(void)
{
	pid_t self;

	self = direct_syscall(__NR_getpid);
	direct_syscall(__NR_kill, self, SIGABRT);
}
|
|
Packit |
2d622a |
/*
 * Write val to stderr as an unsigned number in the given base (digits
 * 0-9a-f, so base must be between 2 and 16).  No prefix or padding is
 * emitted; zero is printed as a single '0'.
 */
static void write_err_base(unsigned long val, int base)
{
	const char digit[] = "0123456789abcdef";
	char buf[sizeof(val)*8];	/* enough for base 2, the worst case */
	char *out = buf + sizeof(buf);

	/* Produce digits from least significant, filling from the right */
	do {
		*--out = digit[val % base];
		val /= base;
	} while (val);

	write_err(out, (int)(buf + sizeof(buf) - out));
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
 * Print a message to stderr and raise SIGABRT, using only write_err(),
 * write_err_base() and sys_abort().
 *
 * The format string understands exactly two conversions, without
 * modifiers: %u (unsigned, decimal) and %p (pointer, hexadecimal without
 * prefix).  For any other character after '%', the '%' is dropped and
 * the character itself is printed as ordinary text.
 */
static void unmapped_abort(const char *fmt, ...)
{
	const char *cur = fmt;		/* scan position */
	const char *pending = fmt;	/* literal text not yet written */
	unsigned long val;
	va_list ap;

	va_start(ap, fmt);
	for (;;) {
		if (*cur == '\0') {
			/* Flush the trailing literal text and stop */
			write_err(pending, cur - pending);
			break;
		}

		if (*cur != '%') {
			cur++;
			continue;
		}

		/* Flush the literal text in front of the conversion */
		write_err(pending, cur - pending);
		cur++;

		if (*cur == 'u') {
			val = va_arg(ap, unsigned);
			write_err_base(val, 10);
			cur++;
		} else if (*cur == 'p') {
			val = (unsigned long)va_arg(ap, void *);
			write_err_base(val, 16);
			cur++;
		}

		pending = cur;
	}
	va_end(ap);

	sys_abort();
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* The directory to use for sharing readonly segments */
|
|
Packit |
2d622a |
static char share_readonly_path[PATH_MAX+1];
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
#define MAX_HTLB_SEGS 3
|
|
Packit |
2d622a |
#define MAX_SEGS 10
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Description of one program segment recorded for hugepage remapping */
struct seg_info {
	void *vaddr;		/* load address: base address + p_vaddr */
	unsigned long filesz;	/* p_filesz of the program header */
	unsigned long memsz;	/* p_memsz of the program header */
	unsigned long extrasz;	/* bytes beyond filesz that must be copied */
	int prot;		/* PROT_* flags derived from p_flags */
	int fd;			/* NOTE(review): presumably the backing file
				 * descriptor; not set in the code shown here */
	int index;		/* program header number of the segment */
	long page_size;		/* NOTE(review): presumably the page size used
				 * for this segment; confirm against users */
};
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Address range and page size of one segment, used for layout checks */
struct seg_layout {
	unsigned long start;	/* first address of the segment */
	unsigned long end;	/* end address of the segment */
	long page_size;		/* page size the segment will be mapped with */
};
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
static struct seg_info htlb_seg_table[MAX_HTLB_SEGS];
|
|
Packit |
2d622a |
static int htlb_num_segs;
|
|
Packit |
2d622a |
static unsigned long force_remap; /* =0 */
|
|
Packit |
2d622a |
static long hpage_readonly_size, hpage_writable_size;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/**
|
|
Packit |
2d622a |
* assemble_path - handy wrapper around snprintf() for building paths
|
|
Packit |
2d622a |
* @dst: buffer of size PATH_MAX+1 to assemble string into
|
|
Packit |
2d622a |
* @fmt: format string for path
|
|
Packit |
2d622a |
* @...: printf() style parameters for path
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* assemble_path() builds a path in the target buffer (which must have
|
|
Packit |
2d622a |
* PATH_MAX+1 available bytes), similar to sprintf(). However, f the
|
|
Packit |
2d622a |
* assembled path would exceed PATH_MAX characters in length,
|
|
Packit |
2d622a |
* assemble_path() prints an error and abort()s, so there is no need
|
|
Packit |
2d622a |
* to check the return value and backout.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static void assemble_path(char *dst, const char *fmt, ...)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
va_list ap;
|
|
Packit |
2d622a |
int len;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
va_start(ap, fmt);
|
|
Packit |
2d622a |
len = vsnprintf(dst, PATH_MAX+1, fmt, ap);
|
|
Packit |
2d622a |
va_end(ap);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (len < 0) {
|
|
Packit |
2d622a |
ERROR("vsnprintf() error\n");
|
|
Packit |
2d622a |
abort();
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (len > PATH_MAX) {
|
|
Packit |
2d622a |
ERROR("Overflow assembling path\n");
|
|
Packit |
2d622a |
abort();
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
static void check_memsz()
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int i;
|
|
Packit |
2d622a |
unsigned long memsz_total = 0, memsz_max = 0;
|
|
Packit |
2d622a |
if (htlb_num_segs == 0)
|
|
Packit |
2d622a |
return;
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* rough heuristic to see if we'll run out of address
|
|
Packit |
2d622a |
* space
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
for (i = 0; i < htlb_num_segs; i++) {
|
|
Packit |
2d622a |
memsz_total += htlb_seg_table[i].memsz;
|
|
Packit |
2d622a |
if (htlb_seg_table[i].memsz > memsz_max)
|
|
Packit |
2d622a |
memsz_max = htlb_seg_table[i].memsz;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
/* avoid overflow checking by using two checks */
|
|
Packit |
2d622a |
DEBUG("Total memsz = %#0lx, memsz of largest segment = %#0lx\n",
|
|
Packit |
2d622a |
memsz_total, memsz_max);
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/**
|
|
Packit |
2d622a |
* find_or_create_share_path - obtain a directory to store the shared
|
|
Packit |
2d622a |
* hugetlbfs files
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* Checks environment and filesystem to locate a suitable directory
|
|
Packit |
2d622a |
* for shared hugetlbfs files, creating a new directory if necessary.
|
|
Packit |
2d622a |
* The determined path is stored in global variable share_readonly_path.
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* returns:
|
|
Packit |
2d622a |
* -1, on error
|
|
Packit |
2d622a |
* 0, on success
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static int find_or_create_share_path(long page_size)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
const char *base_path;
|
|
Packit |
2d622a |
struct stat sb;
|
|
Packit |
2d622a |
int ret;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* If no remaping is planned for the read-only segments we are done */
|
|
Packit |
2d622a |
if (!page_size)
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (__hugetlb_opts.share_path) {
|
|
Packit |
2d622a |
/* Given an explicit path */
|
|
Packit |
2d622a |
if (hugetlbfs_test_path(__hugetlb_opts.share_path) != 1) {
|
|
Packit |
2d622a |
WARNING("HUGETLB_SHARE_PATH %s is not on a hugetlbfs"
|
|
Packit |
2d622a |
" filesystem\n", __hugetlb_opts.share_path);
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Make sure the page size matches */
|
|
Packit |
2d622a |
if (page_size !=
|
|
Packit |
2d622a |
hugetlbfs_test_pagesize(__hugetlb_opts.share_path)) {
|
|
Packit |
2d622a |
WARNING("HUGETLB_SHARE_PATH %s is not valid for a %li "
|
|
Packit |
2d622a |
"kB page size\n", __hugetlb_opts.share_path,
|
|
Packit |
2d622a |
page_size / 1024);
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
assemble_path(share_readonly_path, "%s",
|
|
Packit |
2d622a |
__hugetlb_opts.share_path);
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
base_path = hugetlbfs_find_path_for_size(page_size);
|
|
Packit |
2d622a |
if (!base_path)
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
assemble_path(share_readonly_path, "%s/elflink-uid-%d",
|
|
Packit |
2d622a |
base_path, getuid());
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
ret = mkdir(share_readonly_path, 0700);
|
|
Packit |
2d622a |
if ((ret != 0) && (errno != EEXIST)) {
|
|
Packit |
2d622a |
WARNING("Error creating share directory %s\n",
|
|
Packit |
2d622a |
share_readonly_path);
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Check the share directory is sane */
|
|
Packit |
2d622a |
ret = lstat(share_readonly_path, &sb);
|
|
Packit |
2d622a |
if (ret != 0) {
|
|
Packit |
2d622a |
WARNING("Couldn't stat() %s: %s\n", share_readonly_path,
|
|
Packit |
2d622a |
strerror(errno));
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (! S_ISDIR(sb.st_mode)) {
|
|
Packit |
2d622a |
WARNING("%s is not a directory\n", share_readonly_path);
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (sb.st_uid != getuid()) {
|
|
Packit |
2d622a |
WARNING("%s has wrong owner (uid=%d instead of %d)\n",
|
|
Packit |
2d622a |
share_readonly_path, sb.st_uid, getuid());
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (sb.st_mode & (S_IWGRP | S_IWOTH)) {
|
|
Packit |
2d622a |
WARNING("%s has bad permissions 0%03o\n",
|
|
Packit |
2d622a |
share_readonly_path, sb.st_mode);
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
 * Scan [start, end) one unsigned long at a time and report every
 * non-zero word through DEBUG().
 */
static void check_bss(unsigned long *start, unsigned long *end)
{
	unsigned long *word = start;

	while (word < end) {
		if (*word != 0)
			DEBUG("Non-zero BSS data @ %p: %lx\n", word, *word);
		word++;
	}
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/**
|
|
Packit |
2d622a |
* get_shared_file_name - create a shared file name from program name,
|
|
Packit |
2d622a |
* segment number and current word size
|
|
Packit |
2d622a |
* @htlb_seg_info: pointer to program's segment data
|
|
Packit |
2d622a |
* @file_path: pointer to a PATH_MAX+1 array to store filename in
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* The file name created is *not* intended to be unique, except when
|
|
Packit |
2d622a |
* the name, gid or phdr number differ. The goal here is to have a
|
|
Packit |
2d622a |
* standard means of accessing particular segments of particular
|
|
Packit |
2d622a |
* executables.
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* returns:
|
|
Packit |
2d622a |
* -1, on failure
|
|
Packit |
2d622a |
* 0, on success
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static int get_shared_file_name(struct seg_info *htlb_seg_info, char *file_path)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int ret;
|
|
Packit |
2d622a |
char binary[PATH_MAX+1];
|
|
Packit |
2d622a |
char *binary2;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
memset(binary, 0, sizeof(binary));
|
|
Packit |
2d622a |
ret = readlink("/proc/self/exe", binary, PATH_MAX);
|
|
Packit |
2d622a |
if (ret < 0) {
|
|
Packit |
2d622a |
WARNING("shared_file: readlink() on /proc/self/exe "
|
|
Packit |
2d622a |
"failed: %s\n", strerror(errno));
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
binary2 = basename(binary);
|
|
Packit |
2d622a |
if (!binary2) {
|
|
Packit |
2d622a |
WARNING("shared_file: basename() on %s failed: %s\n",
|
|
Packit |
2d622a |
binary, strerror(errno));
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
assemble_path(file_path, "%s/%s_%zd_%d", share_readonly_path, binary2,
|
|
Packit |
2d622a |
sizeof(unsigned long) * 8, htlb_seg_info->index);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Find the .dynamic program header */
|
|
Packit |
ed3d6b |
static int find_dynamic(Elf_Dyn **dyntab, const ElfW(Addr) addr,
|
|
Packit |
ed3d6b |
const Elf_Phdr *phdr, int phnum)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int i = 1;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
while ((phdr[i].p_type != PT_DYNAMIC) && (i < phnum)) {
|
|
Packit |
2d622a |
++i;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
if (phdr[i].p_type == PT_DYNAMIC) {
|
|
Packit |
ed3d6b |
*dyntab = (Elf_Dyn *)(addr + phdr[i].p_vaddr);
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
} else {
|
|
Packit |
2d622a |
DEBUG("No dynamic segment found\n");
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Find the dynamic string and symbol tables */
|
|
Packit |
2d622a |
static int find_tables(Elf_Dyn *dyntab, Elf_Sym **symtab, char **strtab)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int i = 1;
|
|
Packit |
2d622a |
while ((dyntab[i].d_tag != DT_NULL)) {
|
|
Packit |
2d622a |
if (dyntab[i].d_tag == DT_SYMTAB)
|
|
Packit |
2d622a |
*symtab = (Elf_Sym *)dyntab[i].d_un.d_ptr;
|
|
Packit |
2d622a |
else if (dyntab[i].d_tag == DT_STRTAB)
|
|
Packit |
2d622a |
*strtab = (char *)dyntab[i].d_un.d_ptr;
|
|
Packit |
2d622a |
i++;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (!*symtab) {
|
|
Packit |
2d622a |
DEBUG("No symbol table found\n");
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
if (!*strtab) {
|
|
Packit |
2d622a |
DEBUG("No string table found\n");
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Find the number of symbol table entries */
|
|
Packit |
2d622a |
static int find_numsyms(Elf_Sym *symtab, char *strtab)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* WARNING - The symbol table size calculation does not follow the ELF
|
|
Packit |
2d622a |
* standard, but rather exploits an assumption we enforce in
|
|
Packit |
2d622a |
* our linker scripts that the string table follows
|
|
Packit |
2d622a |
* immediately after the symbol table. The linker scripts
|
|
Packit |
2d622a |
* must maintain this assumption or this code will break.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
if ((void *)strtab <= (void *)symtab) {
|
|
Packit |
2d622a |
DEBUG("Could not calculate dynamic symbol table size\n");
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
return ((void *)strtab - (void *)symtab) / sizeof(Elf_Sym);
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* To reduce the size of the extra copy window, we can eliminate certain
|
|
Packit |
2d622a |
* symbols based on information in the dynamic section. The following
|
|
Packit |
2d622a |
* characteristics apply to symbols which may require copying:
|
|
Packit |
2d622a |
* - Within the BSS
|
|
Packit |
2d622a |
* - Global or Weak binding
|
|
Packit |
2d622a |
* - Object type (variable)
|
|
Packit |
2d622a |
* - Non-zero size (zero size means the symbol is just a marker with no data)
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static inline int keep_symbol(char *strtab, Elf_Sym *s, void *start, void *end)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
if ((void *)s->st_value < start)
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
if ((void *)s->st_value > end)
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
if ((ELF_ST_BIND(s->st_info) != STB_GLOBAL) &&
|
|
Packit |
2d622a |
(ELF_ST_BIND(s->st_info) != STB_WEAK))
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
if (ELF_ST_TYPE(s->st_info) != STT_OBJECT)
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
if (s->st_size == 0)
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (__hugetlbfs_debug)
|
|
Packit |
2d622a |
DEBUG("symbol to copy at %p: %s\n", (void *)s->st_value,
|
|
Packit |
2d622a |
strtab + s->st_name);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
 * Default (weak) implementation: when the architecture does not provide
 * its own plt_extrasz(), no extra copying of the PLT is needed.
 */
ElfW(Word) __attribute__ ((weak)) plt_extrasz(ElfW(Dyn) *dyntab)
{
	const ElfW(Word) no_extra_bytes = 0;

	return no_extra_bytes;
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Subtle: Since libhugetlbfs depends on glibc, we allow it
|
|
Packit |
2d622a |
* it to be loaded before us. As part of its init functions, it
|
|
Packit |
2d622a |
* initializes stdin, stdout, and stderr in the bss. We need to
|
|
Packit |
2d622a |
* include these initialized variables in our copy.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
|
|
Packit |
ed3d6b |
static void get_extracopy(struct seg_info *seg, const ElfW(Addr) addr,
|
|
Packit |
ed3d6b |
const Elf_Phdr *phdr, int phnum)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
Elf_Dyn *dyntab; /* dynamic segment table */
|
|
Packit |
2d622a |
Elf_Sym *symtab = NULL; /* dynamic symbol table */
|
|
Packit |
2d622a |
Elf_Sym *sym; /* a symbol */
|
|
Packit |
2d622a |
char *strtab = NULL; /* string table for dynamic symbols */
|
|
Packit |
2d622a |
int ret, numsyms, found_sym = 0;
|
|
Packit |
2d622a |
void *start, *end, *end_orig;
|
|
Packit |
2d622a |
void *sym_end;
|
|
Packit |
2d622a |
void *plt_end;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
end_orig = seg->vaddr + seg->memsz;
|
|
Packit |
2d622a |
start = seg->vaddr + seg->filesz;
|
|
Packit |
2d622a |
if (seg->filesz == seg->memsz)
|
|
Packit |
2d622a |
return;
|
|
Packit |
2d622a |
if (!__hugetlb_opts.min_copy)
|
|
Packit |
2d622a |
goto bail2;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Find dynamic program header */
|
|
Packit |
ed3d6b |
ret = find_dynamic(&dyntab, addr, phdr, phnum);
|
|
Packit |
2d622a |
if (ret < 0)
|
|
Packit |
2d622a |
goto bail;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Find symbol and string tables */
|
|
Packit |
2d622a |
ret = find_tables(dyntab, &symtab, &strtab);
|
|
Packit |
2d622a |
if (ret < 0)
|
|
Packit |
2d622a |
goto bail;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
numsyms = find_numsyms(symtab, strtab);
|
|
Packit |
2d622a |
if (numsyms < 0)
|
|
Packit |
2d622a |
goto bail;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* We must ensure any returns done hereafter have sane start and end
|
|
Packit |
2d622a |
* values, as the criss-cross apple sauce algorithm is beginning
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
end = start;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
for (sym = symtab; sym < symtab + numsyms; sym++) {
|
|
Packit |
2d622a |
if (!keep_symbol(strtab, sym, start, end_orig))
|
|
Packit |
2d622a |
continue;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* These are the droids we are looking for */
|
|
Packit |
2d622a |
found_sym = 1;
|
|
Packit |
2d622a |
sym_end = (void *)(sym->st_value + sym->st_size);
|
|
Packit |
2d622a |
if (sym_end > end)
|
|
Packit |
2d622a |
end = sym_end;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Some platforms (PowerPC 64bit ELF) place their PLT beyond the filesz
|
|
Packit |
2d622a |
* part of the data segment. When this is the case, we must extend the
|
|
Packit |
2d622a |
* copy window to include this data which has been initialized by the
|
|
Packit |
2d622a |
* run-time linker.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
plt_end = start + plt_extrasz(dyntab);
|
|
Packit |
2d622a |
if (plt_end > end) {
|
|
Packit |
2d622a |
end = plt_end;
|
|
Packit |
2d622a |
found_sym = 1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (__hugetlbfs_debug)
|
|
Packit |
2d622a |
check_bss(end, end_orig);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (found_sym) {
|
|
Packit |
2d622a |
seg->extrasz = end - start;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* else no need to copy anything, so leave seg->extrasz as zero
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
return;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
bail:
|
|
Packit |
2d622a |
DEBUG("Unable to perform minimal copy\n");
|
|
Packit |
2d622a |
bail2:
|
|
Packit |
2d622a |
seg->extrasz = end_orig - start;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
#if defined(__powerpc64__) || \
|
|
Packit |
2d622a |
(defined(__powerpc__) && !defined(PPC_NO_SEGMENTS))
|
|
Packit |
2d622a |
#define SLICE_LOW_TOP (0x100000000UL)
|
|
Packit |
2d622a |
#define SLICE_LOW_SIZE (1UL << SLICE_LOW_SHIFT)
|
|
Packit |
2d622a |
#define SLICE_HIGH_SIZE (1UL << SLICE_HIGH_SHIFT)
|
|
Packit |
2d622a |
#endif
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Return the address of the start and end of the hugetlb slice
|
|
Packit |
2d622a |
* containing @addr. A slice is a range of addresses, start inclusive
|
|
Packit |
2d622a |
* and end exclusive.
|
|
Packit |
2d622a |
* Note, that since relinking is not supported on ia64, we can leave it
|
|
Packit |
2d622a |
* out here.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
/*
 * Return the start address (inclusive) of the hugetlb slice containing
 * @addr.
 *
 * Without slice support a slice is simply one huge page.  With slice
 * support the powerpc slice geometry is used.  On any other build the
 * huge-page alignment is used as a fallback, so that every path through
 * this function returns a value.
 */
static unsigned long hugetlb_slice_start(unsigned long addr)
{
	if (!arch_has_slice_support()) {
		return ALIGN_DOWN(addr, gethugepagesize());
	}

#if defined(__powerpc64__)
	if (addr < SLICE_LOW_TOP)
		return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
	else if (addr < SLICE_HIGH_SIZE)
		/* The first high slice begins where the low slices end */
		return SLICE_LOW_TOP;
	else
		return ALIGN_DOWN(addr, SLICE_HIGH_SIZE);
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
#else
	/* Slice support reported, but no slice geometry is compiled in */
	return ALIGN_DOWN(addr, gethugepagesize());
#endif
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
 * Return the last address of the hugetlb slice for @addr, computed as
 * ALIGN_UP(addr, slice size) - 1.
 *
 * Without slice support the slice size is the huge page size.  With
 * slice support the powerpc slice geometry is used.  On any other build
 * the huge-page alignment is used as a fallback, so that every path
 * through this function returns a value.
 *
 * NOTE(review): if ALIGN_UP() leaves an already aligned address
 * unchanged, the result for such an address is addr - 1; the existing
 * behaviour is kept because callers may rely on it.
 */
static unsigned long hugetlb_slice_end(unsigned long addr)
{
	if (!arch_has_slice_support()) {
		return ALIGN_UP(addr, gethugepagesize()) - 1;
	}

#if defined(__powerpc64__)
	if (addr < SLICE_LOW_TOP)
		return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
	else
		return ALIGN_UP(addr, SLICE_HIGH_SIZE) - 1;
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
#else
	/* Slice support reported, but no slice geometry is compiled in */
	return ALIGN_UP(addr, gethugepagesize()) - 1;
#endif
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Return the address one past hugetlb_slice_end(addr). */
static unsigned long hugetlb_next_slice_start(unsigned long addr)
{
	unsigned long slice_last = hugetlb_slice_end(addr);

	return slice_last + 1;
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Return the address immediately before hugetlb_slice_start(addr). */
static unsigned long hugetlb_prev_slice_end(unsigned long addr)
{
	unsigned long slice_first = hugetlb_slice_start(addr);

	return slice_first - 1;
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Store a copy of the given program header
|
|
Packit |
2d622a |
*/
|
|
Packit |
ed3d6b |
static int save_phdr(int table_idx, int phnum, const ElfW(Addr) addr,
|
|
Packit |
ed3d6b |
const ElfW(Phdr) *phdr)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int prot = 0;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (table_idx >= MAX_HTLB_SEGS) {
|
|
Packit |
2d622a |
WARNING("Executable has too many segments (max %d)\n",
|
|
Packit |
2d622a |
MAX_HTLB_SEGS);
|
|
Packit |
2d622a |
htlb_num_segs = 0;
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (phdr->p_flags & PF_R)
|
|
Packit |
2d622a |
prot |= PROT_READ;
|
|
Packit |
2d622a |
if (phdr->p_flags & PF_W)
|
|
Packit |
2d622a |
prot |= PROT_WRITE;
|
|
Packit |
2d622a |
if (phdr->p_flags & PF_X)
|
|
Packit |
2d622a |
prot |= PROT_EXEC;
|
|
Packit |
2d622a |
|
|
Packit |
ed3d6b |
htlb_seg_table[table_idx].vaddr = (void *)(addr + phdr->p_vaddr);
|
|
Packit |
2d622a |
htlb_seg_table[table_idx].filesz = phdr->p_filesz;
|
|
Packit |
2d622a |
htlb_seg_table[table_idx].memsz = phdr->p_memsz;
|
|
Packit |
2d622a |
htlb_seg_table[table_idx].prot = prot;
|
|
Packit |
2d622a |
htlb_seg_table[table_idx].index = phnum;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
INFO("Segment %d (phdr %d): %#0lx-%#0lx (filesz=%#0lx) "
|
|
Packit |
2d622a |
"(prot = %#0x)\n", table_idx, phnum,
|
|
Packit |
ed3d6b |
(unsigned long) addr + phdr->p_vaddr,
|
|
Packit |
ed3d6b |
(unsigned long) addr + phdr->p_vaddr + phdr->p_memsz,
|
|
Packit |
2d622a |
(unsigned long) phdr->p_filesz, (unsigned int) prot);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
static int verify_segment_layout(struct seg_layout *segs, int num_segs)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int i;
|
|
Packit |
2d622a |
long base_size = getpagesize();
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
for (i = 1; i < num_segs; i++) {
|
|
Packit |
2d622a |
unsigned long prev_end = segs[i - 1].end;
|
|
Packit |
2d622a |
unsigned long start = segs[i].start;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Do not worry about the boundary between segments that will
|
|
Packit |
2d622a |
* not be remapped.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
if (segs[i - 1].page_size == base_size &&
|
|
Packit |
2d622a |
segs[i].page_size == base_size)
|
|
Packit |
2d622a |
continue;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Make sure alignment hasn't caused segments to overlap */
|
|
Packit |
2d622a |
if (prev_end > start) {
|
|
Packit |
2d622a |
WARNING("Layout problem with segments %i and %i:\n\t"
|
|
Packit |
2d622a |
"Segments would overlap\n", i - 1, i);
|
|
Packit |
2d622a |
return 1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Make sure page size transitions occur on slice boundaries */
|
|
Packit |
2d622a |
if ((segs[i - 1].page_size != segs[i].page_size) &&
|
|
Packit |
2d622a |
hugetlb_slice_end(prev_end) >
|
|
Packit |
2d622a |
hugetlb_slice_start(start)) {
|
|
Packit |
2d622a |
WARNING("Layout problem with segments %i and %i:\n\t"
|
|
Packit |
2d622a |
"Only one page size per slice\n", i - 1, i);
|
|
Packit |
2d622a |
return 1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
static long segment_requested_page_size(const ElfW(Phdr) *phdr)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int writable = phdr->p_flags & PF_W;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Check if a page size was requested by the user */
|
|
Packit |
2d622a |
if (writable && hpage_writable_size)
|
|
Packit |
2d622a |
return hpage_writable_size;
|
|
Packit |
2d622a |
if (!writable && hpage_readonly_size)
|
|
Packit |
2d622a |
return hpage_readonly_size;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Check if this segment requests remapping by default */
|
|
Packit |
2d622a |
if (!hpage_readonly_size && !hpage_writable_size &&
|
|
Packit |
2d622a |
(phdr->p_flags & PF_LINUX_HUGETLB))
|
|
Packit |
2d622a |
return gethugepagesize();
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* No remapping selected, return the base page size */
|
|
Packit |
2d622a |
return getpagesize();
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
static
|
|
Packit |
2d622a |
int parse_elf_normal(struct dl_phdr_info *info, size_t size, void *data)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int i, num_segs;
|
|
Packit |
2d622a |
unsigned long page_size, seg_psize, start, end;
|
|
Packit |
2d622a |
struct seg_layout segments[MAX_SEGS];
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
page_size = getpagesize();
|
|
Packit |
2d622a |
num_segs = 0;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
for (i = 0; i < info->dlpi_phnum; i++) {
|
|
Packit |
2d622a |
if (info->dlpi_phdr[i].p_type != PT_LOAD)
|
|
Packit |
2d622a |
continue;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (i >= MAX_SEGS) {
|
|
Packit |
2d622a |
WARNING("Maximum number of PT_LOAD segments"
|
|
Packit |
2d622a |
"exceeded\n");
|
|
Packit |
2d622a |
return 1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
seg_psize = segment_requested_page_size(&info->dlpi_phdr[i]);
|
|
Packit |
2d622a |
if (seg_psize != page_size) {
|
|
Packit |
ed3d6b |
if (save_phdr(htlb_num_segs, i, info->dlpi_addr,
|
|
Packit |
ed3d6b |
&info->dlpi_phdr[i]))
|
|
Packit |
2d622a |
return 1;
|
|
Packit |
2d622a |
get_extracopy(&htlb_seg_table[htlb_num_segs],
|
|
Packit |
ed3d6b |
info->dlpi_addr, info->dlpi_phdr,
|
|
Packit |
ed3d6b |
info->dlpi_phnum);
|
|
Packit |
2d622a |
htlb_seg_table[htlb_num_segs].page_size = seg_psize;
|
|
Packit |
2d622a |
htlb_num_segs++;
|
|
Packit |
2d622a |
}
|
|
Packit |
ed3d6b |
start = ALIGN_DOWN(info->dlpi_addr +
|
|
Packit |
ed3d6b |
info->dlpi_phdr[i].p_vaddr, seg_psize);
|
|
Packit |
ed3d6b |
end = ALIGN(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr +
|
|
Packit |
ed3d6b |
info->dlpi_phdr[i].p_memsz, seg_psize);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
segments[num_segs].page_size = seg_psize;
|
|
Packit |
2d622a |
segments[num_segs].start = start;
|
|
Packit |
2d622a |
segments[num_segs].end = end;
|
|
Packit |
2d622a |
num_segs++;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
if (verify_segment_layout(segments, num_segs))
|
|
Packit |
2d622a |
htlb_num_segs = 0;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (__hugetlbfs_debug)
|
|
Packit |
2d622a |
check_memsz();
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Parse the phdrs of a normal program to attempt partial segment remapping
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static
|
|
Packit |
2d622a |
int parse_elf_partial(struct dl_phdr_info *info, size_t size, void *data)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
unsigned long vaddr, memsz, gap;
|
|
Packit |
2d622a |
unsigned long slice_end;
|
|
Packit |
2d622a |
int i;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* This should never actually be called more than once in an
|
|
Packit |
2d622a |
* iteration: we assume that dl_iterate_phdrs() always gives
|
|
Packit |
2d622a |
* us the main program's phdrs on the first iteration, and
|
|
Packit |
2d622a |
* always return 1 to cease iteration at that point. */
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
for (i = 0; i < info->dlpi_phnum; i++) {
|
|
Packit |
2d622a |
if (info->dlpi_phdr[i].p_type != PT_LOAD)
|
|
Packit |
2d622a |
continue;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Partial segment remapping only makes sense if the
|
|
Packit |
2d622a |
* memory size of the segment is larger than the
|
|
Packit |
2d622a |
* granularity at which hugepages can be used. This
|
|
Packit |
2d622a |
* mostly affects ppc, where the segment must be larger
|
|
Packit |
2d622a |
* than 256M. This guarantees that remapping the binary
|
|
Packit |
2d622a |
* in this forced way won't violate any contiguity
|
|
Packit |
2d622a |
* constraints.
|
|
Packit |
2d622a |
*/
|
|
Packit |
ed3d6b |
vaddr = hugetlb_next_slice_start(info->dlpi_addr +
|
|
Packit |
ed3d6b |
info->dlpi_phdr[i].p_vaddr);
|
|
Packit |
ed3d6b |
gap = vaddr - (info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
|
|
Packit |
2d622a |
slice_end = hugetlb_slice_end(vaddr);
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* we should stop remapping just before the slice
|
|
Packit |
2d622a |
* containing the end of the memsz portion (taking away
|
|
Packit |
2d622a |
* the gap of the memsz)
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
memsz = info->dlpi_phdr[i].p_memsz;
|
|
Packit |
2d622a |
if (memsz < gap) {
|
|
Packit |
2d622a |
INFO("Segment %d's unaligned memsz is too small: "
|
|
Packit |
2d622a |
"%#0lx < %#0lx\n",
|
|
Packit |
2d622a |
i, memsz, gap);
|
|
Packit |
2d622a |
continue;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
memsz -= gap;
|
|
Packit |
2d622a |
if (memsz < (slice_end - vaddr)) {
|
|
Packit |
2d622a |
INFO("Segment %d's aligned memsz is too small: "
|
|
Packit |
2d622a |
"%#0lx < %#0lx\n",
|
|
Packit |
2d622a |
i, memsz, slice_end - vaddr);
|
|
Packit |
2d622a |
continue;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
memsz = hugetlb_prev_slice_end(vaddr + memsz) - vaddr;
|
|
Packit |
2d622a |
|
|
Packit |
ed3d6b |
if (save_phdr(htlb_num_segs, i, info->dlpi_addr,
|
|
Packit |
ed3d6b |
&info->dlpi_phdr[i]))
|
|
Packit |
2d622a |
return 1;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* When remapping partial segments, we create a sub-segment
|
|
Packit |
2d622a |
* that is based on the original. For this reason, we must
|
|
Packit |
2d622a |
* make some changes to the phdr captured by save_phdr():
|
|
Packit |
2d622a |
* vaddr is aligned upwards to a slice boundary
|
|
Packit |
2d622a |
* memsz is aligned downwards to a slice boundary
|
|
Packit |
2d622a |
* filesz is set to memsz to force all memory to be copied
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
htlb_seg_table[htlb_num_segs].vaddr = (void *)vaddr;
|
|
Packit |
2d622a |
htlb_seg_table[htlb_num_segs].filesz = memsz;
|
|
Packit |
2d622a |
htlb_seg_table[htlb_num_segs].memsz = memsz;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
htlb_num_segs++;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
return 1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Verify that a range of memory is unoccupied and usable
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static void check_range_empty(void *addr, unsigned long len)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
void *p;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
p = mmap(addr, len, PROT_READ, MAP_PRIVATE|MAP_ANON, 0, 0);
|
|
Packit |
2d622a |
if (p != addr) {
|
|
Packit |
2d622a |
WARNING("Unable to verify address range %p - %p. Not empty?\n",
|
|
Packit |
2d622a |
addr, addr + len);
|
|
Packit |
2d622a |
if (__hugetlbfs_debug)
|
|
Packit |
2d622a |
dump_proc_pid_maps();
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
if (p != MAP_FAILED)
|
|
Packit |
2d622a |
munmap(p, len);
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Copy a program segment into a huge page. If possible, try to copy the
|
|
Packit |
2d622a |
* smallest amount of data possible, unless the user disables this
|
|
Packit |
2d622a |
* optimization via the HUGETLB_ELFMAP environment variable.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static int prepare_segment(struct seg_info *seg)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
void *start, *p, *end, *new_end;
|
|
Packit |
2d622a |
unsigned long size, offset;
|
|
Packit |
2d622a |
long page_size = getpagesize();
|
|
Packit |
2d622a |
long hpage_size;
|
|
Packit |
2d622a |
int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
hpage_size = seg->page_size;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* mmaps must begin at an address aligned to the page size. If the
|
|
Packit |
2d622a |
* vaddr of this segment is not hpage_size aligned, align it downward
|
|
Packit |
2d622a |
* and begin the mmap there. Note the offset so we can copy data to
|
|
Packit |
2d622a |
* the correct starting address within the temporary mmap.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
start = (void *) ALIGN_DOWN((unsigned long)seg->vaddr, hpage_size);
|
|
Packit |
2d622a |
offset = seg->vaddr - start;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Calculate the size of the temporary mapping we must create.
|
|
Packit |
2d622a |
* This includes the offset (described above) and the filesz and
|
|
Packit |
2d622a |
* extrasz portions of the segment (described below). We must align
|
|
Packit |
2d622a |
* this total to the huge page size so it will be valid for mmap.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
size = ALIGN(offset + seg->filesz + seg->extrasz, hpage_size);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* If the segment's start or end addresses have been adjusted to align
|
|
Packit |
2d622a |
* them to the hpage_size, check to make sure nothing is mapped in the
|
|
Packit |
2d622a |
* padding before and after the segment.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
end = (void *) ALIGN((unsigned long)seg->vaddr + seg->memsz, page_size);
|
|
Packit |
2d622a |
new_end = (void *) ALIGN((unsigned long)end, hpage_size);
|
|
Packit |
2d622a |
if (ALIGN_DOWN(offset, page_size))
|
|
Packit |
2d622a |
check_range_empty(start, ALIGN_DOWN(offset, page_size));
|
|
Packit |
2d622a |
if (end != new_end)
|
|
Packit |
2d622a |
check_range_empty(end, new_end - end);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Create the temporary huge page mmap */
|
|
Packit |
2d622a |
p = mmap(NULL, size, PROT_READ|PROT_WRITE,
|
|
Packit |
2d622a |
MAP_SHARED|mmap_reserve, seg->fd, 0);
|
|
Packit |
2d622a |
if (p == MAP_FAILED) {
|
|
Packit |
2d622a |
WARNING("Couldn't map hugepage segment to copy data: %s\n",
|
|
Packit |
2d622a |
strerror(errno));
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Minimizing the amount of data copied will maximize performance.
|
|
Packit |
2d622a |
* By definition, the filesz portion of the segment contains
|
|
Packit |
2d622a |
* initialized data and must be copied. If part of the memsz portion
|
|
Packit |
2d622a |
* is known to be initialized already, extrasz will be non-zero and
|
|
Packit |
2d622a |
* that many addtional bytes will be copied from the beginning of the
|
|
Packit |
2d622a |
* memsz region. The rest of the memsz is understood to be zeroes and
|
|
Packit |
2d622a |
* need not be copied.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
INFO("Mapped hugeseg at %p. Copying %#0lx bytes and %#0lx extra bytes"
|
|
Packit |
2d622a |
" from %p...", p, seg->filesz, seg->extrasz, seg->vaddr);
|
|
Packit |
2d622a |
memcpy(p + offset, seg->vaddr, seg->filesz + seg->extrasz);
|
|
Packit |
2d622a |
INFO_CONT("done\n");
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
munmap(p, size);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
 * [PPC] Prior to 2.6.22 (which added slices), our temporary hugepage
 * mappings are placed in the segment before the stack.  This 'taints' that
 * segment to be hugepage-only for the lifetime of the process, resulting
 * in a maximum stack size of 256MB.  If we instead create our hugepage
 * mappings in a child process, we can avoid this problem.
 *
 * This does not adversely affect non-PPC platforms so do it everywhere.
 *
 * Runs prepare_segment() for @htlb_seg_info in a forked child and waits
 * for it.
 *
 * returns:
 *   -1, if fork/waitpid fail, the child exits non-zero, or the child
 *       does not exit normally (e.g. it is killed by a signal)
 *   0, if the child exited with status 0
 */
static int fork_and_prepare_segment(struct seg_info *htlb_seg_info)
{
	pid_t pid;
	int ret, status;

	pid = fork();
	if (pid < 0) {
		WARNING("fork failed: %s\n", strerror(errno));
		return -1;
	}
	if (pid == 0) {
		/* Child: do the copy and report the outcome via exit status */
		ret = prepare_segment(htlb_seg_info);
		if (ret < 0) {
			WARNING("Failed to prepare segment\n");
			exit(1);
		}
		exit(0);
	}

	/* Parent: an interrupted wait is not a failure, just retry it */
	do {
		ret = waitpid(pid, &status, 0);
	} while (ret == -1 && errno == EINTR);
	if (ret == -1) {
		WARNING("waitpid failed: %s\n", strerror(errno));
		return -1;
	}

	/*
	 * WEXITSTATUS() is only meaningful after a normal exit.  A child
	 * killed by a signal would otherwise read as exit status 0 and be
	 * mistaken for success.
	 */
	if (!WIFEXITED(status)) {
		WARNING("Child preparing segment terminated abnormally\n");
		return -1;
	}
	if (WEXITSTATUS(status) != 0)
		return -1;

	INFO("Prepare succeeded\n");
	return 0;
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/**
|
|
Packit |
2d622a |
* find_or_prepare_shared_file - get one shareable file
|
|
Packit |
2d622a |
* @htlb_seg_info: pointer to program's segment data
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* This function either locates a hugetlbfs file already containing
|
|
Packit |
2d622a |
* data for a given program segment, or creates one if it doesn't
|
|
Packit |
2d622a |
* already exist.
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* We use the following algorithm to ensure that when processes race
|
|
Packit |
2d622a |
* to instantiate the hugepage file, we will never obtain an
|
|
Packit |
2d622a |
* incompletely prepared file or have multiple processes prepar
|
|
Packit |
2d622a |
* separate copies of the file.
|
|
Packit |
2d622a |
* - first open 'filename.tmp' with O_EXCL (this acts as a lockfile)
|
|
Packit |
2d622a |
* - second open 'filename' with O_RDONLY (even if the first open
|
|
Packit |
2d622a |
* succeeded).
|
|
Packit |
2d622a |
* Then:
|
|
Packit |
2d622a |
* - If both opens succeed, close the O_EXCL open, unlink
|
|
Packit |
2d622a |
* filename.tmp and use the O_RDONLY fd. (Somebody else has prepared
|
|
Packit |
2d622a |
* the file already)
|
|
Packit |
2d622a |
* - If only the O_RDONLY open suceeds, and the O_EXCL open
|
|
Packit |
2d622a |
* fails with EEXIST, just used the O_RDONLY fd. (Somebody else has
|
|
Packit |
2d622a |
* prepared the file already, but we raced with their rename()).
|
|
Packit |
2d622a |
* - If only the O_EXCL open suceeds, and the O_RDONLY fails with
|
|
Packit |
2d622a |
* ENOENT, prepare the the O_EXCL open, then rename() filename.tmp to
|
|
Packit |
2d622a |
* filename. (We're the first in, we have to prepare the file).
|
|
Packit |
2d622a |
* - If both opens fail, with EEXIST and ENOENT, respectively,
|
|
Packit |
2d622a |
* wait for a little while, then try again from the beginning
|
|
Packit |
2d622a |
* (Somebody else is preparing the file, but hasn't finished yet)
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* returns:
|
|
Packit |
2d622a |
* -1, on failure
|
|
Packit |
2d622a |
* 0, on success
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static int find_or_prepare_shared_file(struct seg_info *htlb_seg_info)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int fdx = -1, fds;
|
|
Packit |
2d622a |
int errnox, errnos;
|
|
Packit |
2d622a |
int ret;
|
|
Packit |
2d622a |
int i;
|
|
Packit |
2d622a |
char final_path[PATH_MAX+1];
|
|
Packit |
2d622a |
char tmp_path[PATH_MAX+1];
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
ret = get_shared_file_name(htlb_seg_info, final_path);
|
|
Packit |
2d622a |
if (ret < 0)
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
assemble_path(tmp_path, "%s.tmp", final_path);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
for (i = 0; i < SHARED_TIMEOUT; i++) {
|
|
Packit |
2d622a |
/* NB: mode is modified by umask */
|
|
Packit |
2d622a |
fdx = open(tmp_path, O_CREAT | O_EXCL | O_RDWR, 0666);
|
|
Packit |
2d622a |
errnox = errno;
|
|
Packit |
2d622a |
fds = open(final_path, O_RDONLY);
|
|
Packit |
2d622a |
errnos = errno;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (fds >= 0) {
|
|
Packit |
2d622a |
/* Got an already-prepared file -> use it */
|
|
Packit |
2d622a |
if (fdx > 0) {
|
|
Packit |
2d622a |
/* Also got an exclusive file -> clean up */
|
|
Packit |
2d622a |
ret = unlink(tmp_path);
|
|
Packit |
2d622a |
if (ret != 0)
|
|
Packit |
2d622a |
WARNING("shared_file: unable to clean "
|
|
Packit |
2d622a |
"up unneeded file %s: %s\n",
|
|
Packit |
2d622a |
tmp_path, strerror(errno));
|
|
Packit |
2d622a |
close(fdx);
|
|
Packit |
2d622a |
} else if (errnox != EEXIST) {
|
|
Packit |
2d622a |
WARNING("shared_file: Unexpected failure on exclusive"
|
|
Packit |
2d622a |
" open of %s: %s\n", tmp_path,
|
|
Packit |
2d622a |
strerror(errnox));
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
htlb_seg_info->fd = fds;
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (fdx >= 0) {
|
|
Packit |
2d622a |
/* It's our job to prepare */
|
|
Packit |
2d622a |
if (errnos != ENOENT)
|
|
Packit |
2d622a |
WARNING("shared_file: Unexpected failure on"
|
|
Packit |
2d622a |
" shared open of %s: %s\n", final_path,
|
|
Packit |
2d622a |
strerror(errnos));
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
htlb_seg_info->fd = fdx;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
INFO("Got unpopulated shared fd -- Preparing\n");
|
|
Packit |
2d622a |
ret = fork_and_prepare_segment(htlb_seg_info);
|
|
Packit |
2d622a |
if (ret < 0)
|
|
Packit |
2d622a |
goto fail;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
INFO("Prepare succeeded\n");
|
|
Packit |
2d622a |
/* move to permanent location */
|
|
Packit |
2d622a |
ret = rename(tmp_path, final_path);
|
|
Packit |
2d622a |
if (ret != 0) {
|
|
Packit |
2d622a |
WARNING("shared_file: unable to rename %s"
|
|
Packit |
2d622a |
" to %s: %s\n", tmp_path, final_path,
|
|
Packit |
2d622a |
strerror(errno));
|
|
Packit |
2d622a |
goto fail;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Both opens failed, somebody else is still preparing */
|
|
Packit |
2d622a |
/* Wait and try again */
|
|
Packit |
2d622a |
sleep(1);
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
fail:
|
|
Packit |
2d622a |
if (fdx > 0) {
|
|
Packit |
2d622a |
ret = unlink(tmp_path);
|
|
Packit |
2d622a |
if (ret != 0)
|
|
Packit |
2d622a |
WARNING("shared_file: Unable to clean up temp file %s "
|
|
Packit |
2d622a |
"on failure: %s\n", tmp_path, strerror(errno));
|
|
Packit |
2d622a |
close(fdx);
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/**
|
|
Packit |
2d622a |
* obtain_prepared_file - multiplex callers depending on if
|
|
Packit |
2d622a |
* sharing or not
|
|
Packit |
2d622a |
* @htlb_seg_info: pointer to program's segment data
|
|
Packit |
2d622a |
*
|
|
Packit |
2d622a |
* returns:
|
|
Packit |
2d622a |
* -1, on error
|
|
Packit |
2d622a |
* 0, on success
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static int obtain_prepared_file(struct seg_info *htlb_seg_info)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int fd = -1;
|
|
Packit |
2d622a |
int ret;
|
|
Packit |
2d622a |
long hpage_size = htlb_seg_info->page_size;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Share only read-only segments */
|
|
Packit |
2d622a |
if (__hugetlb_opts.sharing && !(htlb_seg_info->prot & PROT_WRITE)) {
|
|
Packit |
2d622a |
/* first, try to share */
|
|
Packit |
2d622a |
ret = find_or_prepare_shared_file(htlb_seg_info);
|
|
Packit |
2d622a |
if (ret == 0)
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
/* but, fall through to unlinked files, if sharing fails */
|
|
Packit |
2d622a |
WARNING("Falling back to unlinked files\n");
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
fd = hugetlbfs_unlinked_fd_for_size(hpage_size);
|
|
Packit |
2d622a |
if (fd < 0)
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
htlb_seg_info->fd = fd;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return fork_and_prepare_segment(htlb_seg_info);
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
static void remap_segments(struct seg_info *seg, int num)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int i;
|
|
Packit |
2d622a |
void *p;
|
|
Packit |
2d622a |
unsigned long start, offset, mapsize;
|
|
Packit |
2d622a |
long page_size = getpagesize();
|
|
Packit |
2d622a |
long hpage_size;
|
|
Packit |
2d622a |
int mmap_flags;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* XXX: The bogus call to mmap below forces ld.so to resolve the
|
|
Packit |
2d622a |
* mmap symbol before we unmap the plt in the data segment
|
|
Packit |
2d622a |
* below. This might only be needed in the case where sharing
|
|
Packit |
2d622a |
* is enabled and the hugetlbfs files have already been prepared
|
|
Packit |
2d622a |
* by another process.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
p = mmap(0, 0, 0, 0, 0, 0);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* This is the hairy bit, between unmap and remap we enter a
|
|
Packit |
2d622a |
* black hole. We can't call anything which uses static data
|
|
Packit |
2d622a |
* (ie. essentially any library function...)
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
for (i = 0; i < num; i++) {
|
|
Packit |
2d622a |
start = ALIGN_DOWN((unsigned long)seg[i].vaddr, page_size);
|
|
Packit |
2d622a |
offset = (unsigned long)(seg[i].vaddr - start);
|
|
Packit |
2d622a |
mapsize = ALIGN(offset + seg[i].memsz, page_size);
|
|
Packit |
2d622a |
munmap((void *) start, mapsize);
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Step 4. Rebuild the address space with hugetlb mappings */
|
|
Packit |
2d622a |
/* NB: we can't do the remap as hugepages within the main loop
|
|
Packit |
2d622a |
* because of PowerPC: we may need to unmap all the normal
|
|
Packit |
2d622a |
* segments before the MMU segment is ok for hugepages */
|
|
Packit |
2d622a |
for (i = 0; i < num; i++) {
|
|
Packit |
2d622a |
hpage_size = seg[i].page_size;
|
|
Packit |
2d622a |
start = ALIGN_DOWN((unsigned long)seg[i].vaddr, hpage_size);
|
|
Packit |
2d622a |
offset = (unsigned long)(seg[i].vaddr - start);
|
|
Packit |
2d622a |
mapsize = ALIGN(offset + seg[i].memsz, hpage_size);
|
|
Packit |
2d622a |
mmap_flags = MAP_PRIVATE|MAP_FIXED;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* If requested, make no reservations */
|
|
Packit |
2d622a |
if (__hugetlb_opts.no_reserve)
|
|
Packit |
2d622a |
mmap_flags |= MAP_NORESERVE;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* If this is a read-only mapping whose contents are
|
|
Packit |
2d622a |
* entirely contained within the file, then use MAP_NORESERVE.
|
|
Packit |
2d622a |
* The assumption is that the pages already exist in the
|
|
Packit |
2d622a |
* page cache for the hugetlbfs file since it was prepared
|
|
Packit |
2d622a |
* earlier and that mprotect() will not be called which would
|
|
Packit |
2d622a |
* require a COW
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
if (!(seg[i].prot & PROT_WRITE) &&
|
|
Packit |
2d622a |
seg[i].filesz == seg[i].memsz)
|
|
Packit |
2d622a |
mmap_flags |= MAP_NORESERVE;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
p = mmap((void *) start, mapsize, seg[i].prot,
|
|
Packit |
2d622a |
mmap_flags, seg[i].fd, 0);
|
|
Packit |
2d622a |
if (p == MAP_FAILED)
|
|
Packit |
2d622a |
unmapped_abort("Failed to map hugepage segment %u: "
|
|
Packit |
2d622a |
"%p-%p (errno=%u)\n", i, start,
|
|
Packit |
2d622a |
start + mapsize, errno);
|
|
Packit |
2d622a |
if (p != (void *) start)
|
|
Packit |
2d622a |
unmapped_abort("Mapped hugepage segment %u (%p-%p) at "
|
|
Packit |
2d622a |
"wrong address %p\n", i, seg[i].vaddr,
|
|
Packit |
2d622a |
seg[i].vaddr+mapsize, p);
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
/* The segments are all back at this point.
|
|
Packit |
2d622a |
* and it should be safe to reference static data
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
static int set_hpage_sizes(const char *env)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
char *pos;
|
|
Packit |
2d622a |
long size;
|
|
Packit |
2d622a |
char *key;
|
|
Packit |
2d622a |
char keys[5] = { "R\0" "W\0" "\0" };
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* For each key in R,W */
|
|
Packit |
2d622a |
for (key = keys; *key != '\0'; key += 2) {
|
|
Packit |
2d622a |
pos = strcasestr(env, key);
|
|
Packit |
2d622a |
if (!pos)
|
|
Packit |
2d622a |
continue;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (*(++pos) == '=') {
|
|
Packit |
2d622a |
size = parse_page_size(pos + 1);
|
|
Packit |
2d622a |
if (size == -1)
|
|
Packit |
2d622a |
return size;
|
|
Packit |
2d622a |
} else
|
|
Packit |
2d622a |
size = gethugepagesize();
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (size <= 0) {
|
|
Packit |
2d622a |
if (errno == ENOSYS)
|
|
Packit |
2d622a |
WARNING("Hugepages unavailable\n");
|
|
Packit |
2d622a |
else if (errno == EOVERFLOW)
|
|
Packit |
2d622a |
WARNING("Hugepage size too large\n");
|
|
Packit |
2d622a |
else
|
|
Packit |
2d622a |
WARNING("Hugepage size (%s)\n",
|
|
Packit |
2d622a |
strerror(errno));
|
|
Packit |
2d622a |
size = 0;
|
|
Packit |
2d622a |
} else if (!hugetlbfs_find_path_for_size(size)) {
|
|
Packit |
2d622a |
WARNING("Hugepage size %li unavailable", size);
|
|
Packit |
2d622a |
size = 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (*key == 'R')
|
|
Packit |
2d622a |
hpage_readonly_size = size;
|
|
Packit |
2d622a |
else
|
|
Packit |
2d622a |
hpage_writable_size = size;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
static int check_env(void)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
extern Elf_Ehdr __executable_start __attribute__((weak));
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (__hugetlb_opts.elfmap &&
|
|
Packit |
2d622a |
(strcasecmp(__hugetlb_opts.elfmap, "no") == 0)) {
|
|
Packit |
2d622a |
INFO("HUGETLB_ELFMAP=%s, not attempting to remap program "
|
|
Packit |
2d622a |
"segments\n", __hugetlb_opts.elfmap);
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
if (__hugetlb_opts.elfmap && set_hpage_sizes(__hugetlb_opts.elfmap)) {
|
|
Packit |
2d622a |
WARNING("Cannot set elfmap page sizes: %s", strerror(errno));
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (__hugetlb_opts.ld_preload &&
|
|
Packit |
2d622a |
strstr(__hugetlb_opts.ld_preload, "libhugetlbfs")) {
|
|
Packit |
2d622a |
if (__hugetlb_opts.force_elfmap) {
|
|
Packit |
2d622a |
force_remap = 1;
|
|
Packit |
2d622a |
INFO("HUGETLB_FORCE_ELFMAP=yes, "
|
|
Packit |
2d622a |
"enabling partial segment "
|
|
Packit |
2d622a |
"remapping for non-relinked "
|
|
Packit |
2d622a |
"binaries\n");
|
|
Packit |
2d622a |
INFO("Disabling filesz copy optimization\n");
|
|
Packit |
2d622a |
__hugetlb_opts.min_copy = false;
|
|
Packit |
2d622a |
} else {
|
|
Packit |
2d622a |
if (&__executable_start) {
|
|
Packit |
2d622a |
WARNING("LD_PRELOAD is incompatible with "
|
|
Packit |
2d622a |
"segment remapping\n");
|
|
Packit |
2d622a |
WARNING("Segment remapping has been "
|
|
Packit |
2d622a |
"DISABLED\n");
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (__hugetlb_opts.sharing == 2) {
|
|
Packit |
2d622a |
WARNING("HUGETLB_SHARE=%d, however sharing of writable\n"
|
|
Packit |
2d622a |
"segments has been deprecated and is now disabled\n",
|
|
Packit |
2d622a |
__hugetlb_opts.sharing);
|
|
Packit |
2d622a |
__hugetlb_opts.sharing = 0;
|
|
Packit |
2d622a |
} else {
|
|
Packit |
2d622a |
INFO("HUGETLB_SHARE=%d, sharing ", __hugetlb_opts.sharing);
|
|
Packit |
2d622a |
if (__hugetlb_opts.sharing == 1) {
|
|
Packit |
2d622a |
INFO_CONT("enabled for only read-only segments\n");
|
|
Packit |
2d622a |
} else {
|
|
Packit |
2d622a |
INFO_CONT("disabled\n");
|
|
Packit |
2d622a |
__hugetlb_opts.sharing = 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
INFO("HUGETLB_NO_RESERVE=%s, reservations %s\n",
|
|
Packit |
2d622a |
__hugetlb_opts.no_reserve ? "yes" : "no",
|
|
Packit |
2d622a |
__hugetlb_opts.no_reserve ? "disabled" : "enabled");
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* Parse an ELF header and record segment information for any segments
|
|
Packit |
2d622a |
* which contain hugetlb information.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
static int parse_elf()
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
if (force_remap)
|
|
Packit |
2d622a |
dl_iterate_phdr(parse_elf_partial, NULL);
|
|
Packit |
2d622a |
else
|
|
Packit |
2d622a |
dl_iterate_phdr(parse_elf_normal, NULL);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (htlb_num_segs == 0) {
|
|
Packit |
2d622a |
INFO("No segments were appropriate for remapping\n");
|
|
Packit |
2d622a |
return -1;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return 0;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
void hugetlbfs_setup_elflink(void)
|
|
Packit |
2d622a |
{
|
|
Packit |
2d622a |
int i, ret;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (check_env())
|
|
Packit |
2d622a |
return;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
if (parse_elf())
|
|
Packit |
2d622a |
return;
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
INFO("libhugetlbfs version: %s\n", VERSION);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Do we need to find a share directory */
|
|
Packit |
2d622a |
if (__hugetlb_opts.sharing) {
|
|
Packit |
2d622a |
/*
|
|
Packit |
2d622a |
* If HUGETLB_ELFMAP is undefined but a shareable segment has
|
|
Packit |
2d622a |
* PF_LINUX_HUGETLB set, segment remapping will occur using the
|
|
Packit |
2d622a |
* default huge page size.
|
|
Packit |
2d622a |
*/
|
|
Packit |
2d622a |
long page_size = hpage_readonly_size ?
|
|
Packit |
2d622a |
hpage_readonly_size : gethugepagesize();
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
ret = find_or_create_share_path(page_size);
|
|
Packit |
2d622a |
if (ret != 0) {
|
|
Packit |
2d622a |
WARNING("Segment remapping is disabled");
|
|
Packit |
2d622a |
return;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Step 1. Obtain hugepage files with our program data */
|
|
Packit |
2d622a |
for (i = 0; i < htlb_num_segs; i++) {
|
|
Packit |
2d622a |
ret = obtain_prepared_file(&htlb_seg_table[i]);
|
|
Packit |
2d622a |
if (ret < 0) {
|
|
Packit |
2d622a |
WARNING("Failed to setup hugetlbfs file for segment "
|
|
Packit |
2d622a |
"%d\n", i);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Close files we have already prepared */
|
|
Packit |
2d622a |
for (i--; i >= 0; i--)
|
|
Packit |
2d622a |
close(htlb_seg_table[i].fd);
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
return;
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
}
|
|
Packit |
2d622a |
|
|
Packit |
2d622a |
/* Step 3. Unmap the old segments, map in the new ones */
|
|
Packit |
2d622a |
remap_segments(htlb_seg_table, htlb_num_segs);
|
|
Packit |
2d622a |
}
|