From b439df6648d848b1459ed49247aaf0eddd8efd8c Mon Sep 17 00:00:00 2001 From: Packit Service Date: Dec 09 2020 19:20:15 +0000 Subject: libhugetlbfs-2.21 base --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7123f29 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.d +*~ +version.h +version +obj32 +obj64 +obj diff --git a/HOWTO b/HOWTO new file mode 100644 index 0000000..21eb34e --- /dev/null +++ b/HOWTO @@ -0,0 +1,766 @@ +libhugetlbfs HOWTO +================== + +Author: David Gibson , Adam Litke , and others +Last updated: December 07, 2011 +
+Introduction +============ + +In Linux(TM), access to hugepages is provided through a virtual file +system, "hugetlbfs". The libhugetlbfs library interface works with +hugetlbfs to provide more convenient specific application-level +services. In particular libhugetlbfs has three main functions: + + * library functions +libhugetlbfs provides functions that allow an application to +explicitly allocate and use hugepages more easily than they could by +directly accessing the hugetlbfs filesystem + + * hugepage malloc() +libhugetlbfs can be used to make an existing application use hugepages +for all its malloc() calls. This works on an existing (dynamically +linked) application binary without modification. + + * hugepage text/data/BSS +libhugetlbfs, in conjunction with included special linker scripts can +be used to make an application which will store its executable text, +its initialized data or BSS, or all of the above in hugepages. This +requires relinking an application, but does not require source-level +modifications. + +This HOWTO explains how to use the libhugetlbfs library. It is for +application developers or system administrators who wish to use any of +the above functions. + +The libhugetlbfs library is a focal point to simplify and standardise +the use of the kernel API. 
+ +Prerequisites +============= + +Hardware prerequisites +---------------------- + +You will need a CPU with some sort of hugepage support, which is +handled by your kernel. This covers recent x86, AMD64, 64-bit +PowerPC(R) (POWER4, PPC970 and later), and IBM System z CPUs. + +Currently, only x86, AMD64 and PowerPC are fully supported by +libhugetlbfs. IA64 and Sparc64 have a working malloc, and SH64 +should also but it has not been tested. IA64, Sparc64, and SH64 +do not support segment remapping at this time. IBM System z supports +malloc and also segment remapping with --hugetlbfs-align. + +Kernel prerequisites +-------------------- + +To use all the features of libhugetlbfs you will need a 2.6.16 or +later kernel. Many things will work with earlier kernels, but they +have important bugs and missing features. The later sections of the +HOWTO assume a 2.6.16 or later kernel. The kernel must also have +hugepages enabled, that is to say the CONFIG_HUGETLB_PAGE and +CONFIG_HUGETLBFS options must be switched on. + +To check if hugetlbfs is enabled, use one of the following methods: + + * (Preferred) Use "grep hugetlbfs /proc/filesystems" to see if + hugetlbfs is a supported file system. + * On kernels which support /proc/config.gz (for example SLES10 + kernels), you can search for the CONFIG_HUGETLB_PAGE and + CONFIG_HUGETLBFS options in /proc/config.gz + * Finally, attempt to mount hugetlbfs. If it works, the required + hugepage support is enabled. + +Any kernel which meets the above test (even old ones) should support +at least basic libhugetlbfs functions, although old kernels may have +serious bugs. + +The MAP_PRIVATE flag instructs the kernel to return a memory area that +is private to the requesting process. To use MAP_PRIVATE mappings, +libhugetlbfs's automatic malloc() (morecore) feature, or the hugepage +text, data, or BSS features, you will need a kernel with hugepage +Copy-on-Write (CoW) support. The 2.6.16 kernel has this. 
+ +PowerPC note: The malloc()/morecore features will generate warnings if +used on PowerPC chips with a kernel where hugepage mappings don't +respect the mmap() hint address (the "hint address" is the first +parameter to mmap(), when MAP_FIXED is not specified; the kernel is +not required to mmap() at this address, but should do so when +possible). 2.6.16 and later kernels do honor the hint address. +Hugepage malloc()/morecore should still work without this patch, but +the size of the hugepage heap will be limited (to around 256M for +32-bit and 1TB for 64-bit). + +The 2.6.27 kernel introduced support for multiple huge page sizes for +systems with the appropriate hardware support. Unless specifically +requested, libhugetlbfs will continue to use the default huge page size. + +Toolchain prerequisites +----------------------- + +The library uses a number of GNU specific features, so you will need to use +both gcc and GNU binutils. For PowerPC and AMD64 systems you will need a +"biarch" compiler, which can build both 32-bit and 64-bit binaries. To use +hugepage text and data segments, GNU binutils version 2.17 (or later) is +recommended. Older versions will work with restricted functionality. + +Configuration prerequisites +--------------------------- + +Direct access to hugepage pool has been deprecated in favor of the +hugeadm utility. This utility can be used for finding the available +hugepage pools and adjusting their minimum and maximum sizes depending +on kernel support. 
+ +To list all available hugepage pools and their current min and max values: + hugeadm --pool-list + +To set the 2MB pool minimum to 10 pages: + hugeadm --pool-pages-min 2MB:10 + +Note: that the max pool size will be adjusted to keep the same number of +overcommit pages available if the kernel support is available when min +pages are adjusted + +To add 15 pages to the maximum for 2MB pages: + hugeadm --pool-pages-max 2MB:+15 + +For more information see man 8 hugeadm + +The raw kernel interfaces (as described below) are still available. + +In kernels before 2.6.24, hugepages must be allocated at boot-time via +the hugepages= command-line parameter or at run-time via the +/proc/sys/vm/nr_hugepages sysctl. If memory is restricted on the system, +boot-time allocation is recommended. Hugepages so allocated will be in +the static hugepage pool. + +In kernels starting with 2.6.24, the hugepage pool can grow on-demand. +If this feature should be used, /proc/sys/vm/nr_overcommit_hugepages +should be set to the maximum size of the hugepage pool. No hugepages +need to be allocated via /proc/sys/vm/nr_hugepages or hugepages= in this +case. Hugepages so allocated will be in the dynamic hugepage pool. + +For the running of the libhugetlbfs testsuite (see below), allocating 25 +static hugepages is recommended. Due to memory restrictions, the number +of hugepages requested may not be allocated if the allocation is +attempted at run-time. Users should verify the actual number of +hugepages allocated by: + + hugeadm --pool-list + +or + + grep HugePages_Total /proc/meminfo + +With 25 hugepages allocated, most tests should succeed. However, with +smaller hugepages sizes, many more hugepages may be necessary. + +To use libhugetlbfs features, as well as to run the testsuite, hugetlbfs +must be mounted. Each hugetlbfs mount point is associated with a page +size. To choose the size, use the pagesize mount option. If this option +is omitted, the default huge page size will be used. 
+ +To mount the default huge page size: + + mkdir -p /mnt/hugetlbfs + mount -t hugetlbfs none /mnt/hugetlbfs + +To mount 64KB pages (assuming hardware support): + + mkdir -p /mnt/hugetlbfs-64K + mount -t hugetlbfs none -opagesize=64k /mnt/hugetlbfs-64K + +If hugepages should be available to non-root users, the permissions on +the mountpoint need to be set appropriately. + +Installation +============ + +1. Type "make" to build the library + +This will create "obj32" and/or "obj64" under the top level +libhugetlbfs directory, and build, respectively, 32-bit and 64-bit +shared and static versions (as applicable) of the library into each +directory. This will also build (but not run) the testsuite. + +On i386 systems, only the 32-bit library will be built. On PowerPC +and AMD64 systems, both 32-bit and 64-bit versions will be built (the +32-bit AMD64 version is identical to the i386 version). + +2. Run the testsuite with "make check" + +Running the testsuite is a good idea to ensure that the library is +working properly, and is quite quick (under 3 minutes on a 2GHz Apple +G5). "make func" will run just the functionality tests, rather +than stress tests (a subset of "make check") which is much quicker. +The testsuite contains tests both for the library's features and for +the underlying kernel hugepage functionality. + +NOTE: The testsuite must be run as the root user. + +WARNING: The testsuite contains testcases explicitly designed to test +for a number of hugepage related kernel bugs uncovered during the +library's development. Some of these testcases WILL CRASH HARD a +kernel without the relevant fixes. 2.6.16 contains all such fixes for +all testcases included as of this writing. + +3. (Optional) Install to system paths with "make install" + +This will install the library images to the system lib/lib32/lib64 as +appropriate, the helper utilities and the manual pages. By default +it will install under /usr/local. 
To put it somewhere else use +PREFIX=/path/to/install on the make command line. For example: + + make install PREFIX=/opt/hugetlbfs +Will install under /opt/hugetlbfs. + +"make install" will also install the linker scripts and wrapper for ld +used for hugepage text/data/BSS (see below for details). + +Alternatively, you can use the library from the directory in which it +was built, using the LD_LIBRARY_PATH environment variable. + +To only install library with linker scripts, the manual pages or the helper +utilities separately, use the install-libs, install-man and install-bin targets +respectively. This can be useful when you wish to install the utilities but +not override the distribution-supported version of libhugetlbfs for example. + +Usage +===== + +Using hugepages for malloc() (morecore) +--------------------------------------- + +This feature allows an existing (dynamically linked) binary executable +to use hugepages for all its malloc() calls. To run a program using +the automatic hugepage malloc() feature, you must set several +environment variables: + +1. Set LD_PRELOAD=libhugetlbfs.so + This tells the dynamic linker to load the libhugetlbfs shared + library, even though the program wasn't originally linked against it. + + Note: If the program is linked against libhugetlbfs, preloading the + library may lead to application crashes. You should skip this + step in that case. + +2. Set LD_LIBRARY_PATH to the directory containing libhugetlbfs.so + This is only necessary if you haven't installed libhugetlbfs.so to a + system default path. If you set LD_LIBRARY_PATH, make sure the + directory referenced contains the right version of the library + (32-bit or 64-bit) as appropriate to the binary you want to run. + +3. Set HUGETLB_MORECORE + This enables the hugepage malloc() feature, instructing libhugetlbfs + to override libc's normal morecore() function with a hugepage + version and use it for malloc(). 
From this point all malloc()s + should come from hugepage memory until it runs out. This option can + be specified in two ways: + + To use the default huge page size: + HUGETLB_MORECORE=yes + + To use a specific huge page size: + HUGETLB_MORECORE=<pagesize> + + To use Transparent Huge Pages (THP): + HUGETLB_MORECORE=thp + +Note: This option requires a kernel that supports Transparent Huge Pages + +Usually it's preferable to set these environment variables on the +command line of the program you wish to run, rather than using +"export", because you'll only want to enable the hugepage malloc() for +particular programs, not everything. + +Examples: + +If you've installed libhugetlbfs in the default place (under +/usr/local) which is in the system library search path use: + $ LD_PRELOAD=libhugetlbfs.so HUGETLB_MORECORE=yes + +If you have built libhugetlbfs in ~/libhugetlbfs and haven't installed +it yet, the following would work for a 64-bit program: + + $ LD_PRELOAD=libhugetlbfs.so LD_LIBRARY_PATH=~/libhugetlbfs/obj64 \ + HUGETLB_MORECORE=yes + +Under some circumstances, you might want to specify the address where +the hugepage heap is located. You can do this by setting the +HUGETLB_MORECORE_HEAPBASE environment variable to the heap address in +hexadecimal. NOTE: this will not work on PowerPC systems with old kernels +which don't respect the hugepage hint address; see Kernel Prerequisites +above. Also note that this option is ignored for THP morecore. + +By default, the hugepage heap begins at roughly the same place a +normal page heap would, rounded up by an amount determined by your +platform. For 32-bit PowerPC binaries the normal page heap address is +rounded-up to a multiple of 256MB (that is, putting it in the next MMU +segment); for 64-bit PowerPC binaries the address is rounded-up to a +multiple of 1TB. On all other platforms the address is rounded-up to +the size of a hugepage. 
+ +By default, the hugepage heap will be prefaulted by libhugetlbfs to +guarantee enough hugepages exist and are reserved for the application +(if this was not done, applications could receive a SIGKILL signal if +hugepages needed for the heap are used by another application before +they are faulted in). This leads to local-node allocations when no +memory policy is in place for hugepages. Therefore, it is recommended to +use + + $ numactl --interleave=all + +to regain some of the performance impact of local-node allocations on +large NUMA systems. This can still result in poor performance for those +applications which carefully place their threads on particular nodes +(such as by using OpenMP). In that case, thread-local allocation is +preferred so users should select a memory policy that corresponds to +the run-time behavior of the process' CPU usage. Users can specify +HUGETLB_NO_PREFAULT to prevent the prefaulting of hugepages and instead +rely on run-time faulting of hugepages. NOTE: specifying +HUGETLB_NO_PREFAULT on a system where hugepages are available to and +used by many processes can result in some applications receiving SIGKILL, +so its use is not recommended in high-availability or production +environments. + +By default, the hugepage heap does not shrink. To enable hugepage heap +shrinking, set HUGETLB_MORECORE_SHRINK=yes. NB: We have been seeing some +unexpected behavior from glibc's malloc when this is enabled. + +Using hugepage shared memory +---------------------------- + +Hugepages are used for shared memory segments if the SHM_HUGETLB flag is +set when calling shmget() and the pool is large enough. For hugepage-unaware +applications, libhugetlbfs overrides shmget and adds the SHM_HUGETLB if the +environment variable HUGETLB_SHM is set to "yes". The steps to use hugepages +with applications not linked to libhugetlbfs are similar to morecore except +for step 3. + +1. 
Set LD_PRELOAD=libhugetlbfs.so + This tells the dynamic linker to load the libhugetlbfs shared + library, even though the program wasn't originally linked against it. + + Note: If the program is linked against libhugetlbfs, preloading the + library may lead to application crashes. You should skip this + step in that case. + +2. Set LD_LIBRARY_PATH to the directory containing libhugetlbfs.so + This is only necessary if you haven't installed libhugetlbfs.so to a + system default path. If you set LD_LIBRARY_PATH, make sure the + directory referenced contains the right version of the library + (32-bit or 64-bit) as appropriate to the binary you want to run. + +3. Set HUGETLB_SHM=yes + The shmget() call is overridden whether the application is linked or the + libhugetlbfs library is preloaded. When this environment variable is set, + the SHM_HUGETLB flag is added to the call and the size parameter is aligned + to back the shared memory segment with huge pages. In the event hugepages + cannot be used, small pages will be used instead and a warning will be + printed to explain the failure. + + Note: It is not possible to select any huge page size other than the + system default for this option. If the kernel supports multiple + huge page sizes, the size used for shared memory can be changed by + altering the default huge page size via the default_hugepagesz + kernel boot parameter. + +Using hugepage text, data, or BSS +--------------------------------- + +To use the hugepage text, data, or BSS segments feature, you need to specially +link your application. How this is done depends on the version of GNU ld. To +support ld versions older than 2.17, libhugetlbfs provides custom linker +scripts that must be used to achieve the required binary layout. With version +2.17 or later, the system default linker scripts should be used. + +To link an application for hugepages, you should use the ld.hugetlbfs +script included with libhugetlbfs in place of your normal linker. 
Without any +special options this will simply invoke GNU ld with the same parameters. When +it is invoked with options detailed in the following sections, ld.hugetlbfs +will call the system linker with all of the options necessary to link for +hugepages. If a custom linker script is required, it will also be selected. + +If you installed ld.hugetlbfs using "make install", or if you run it +from the place where you built libhugetlbfs, it should automatically +be able to find the libhugetlbfs linker scripts. Otherwise you may +need to explicitly instruct it where to find the scripts with the +option: + --hugetlbfs-script-path=/path/to/scripts +(The linker scripts are in the ldscripts/ subdirectory of the +libhugetlbfs source tree). + + Linking the application with binutils-2.17 or later: + ---------------------------------------------------- + +This method will use the system default linker scripts. Only one linker option +is required to prepare the application for hugepages: + + --hugetlbfs-align + +will instruct ld.hugetlbfs to call GNU ld with two options that increase the +alignment of the resulting binary. For reference, the options passed to ld are: + + -z common-page-size= and + -z max-page-size= + + Linking the application with binutils-2.16 or older: + ---------------------------------------------------- + +To link a program with a custom linker script, one of the following linker +options should be specified: + + --hugetlbfs-link=B + +will link the application to store BSS data (only) into hugepages + + --hugetlbfs-link=BDT + +will link the application to store text, initialized data and BSS data +into hugepages. + +These are the only two available options when using custom linker scripts. + + A note about the custom libhugetlbfs linker scripts: + ---------------------------------------------------- + +Linker scripts are usually distributed with GNU binutils and they may contain a +partial implementation of new linker features. 
As binutils evolves, the linker +scripts supplied with previous versions become obsolete and are upgraded. + +Libhugetlbfs distributes one set of linker scripts that must work across +several Linux distributions and binutils versions. This has worked well for +some time but binutils-2.17 (including some late 2.16 builds) have made changes +that are impossible to accommodate without breaking the libhugetlbfs linker +scripts for older versions of binutils. This is why the linker scripts (and +the --hugetlbfs-link ld.hugetlbfs option) have been deprecated for binutils >= +2.17 configurations. + +If you are using a late 2.16 binutils version (such as 2.16.91) and are +experiencing problems with huge page text, data, and bss, you can check +binutils for the incompatibility with the following command: + + ld --verbose | grep SPECIAL + +If any matches are returned, then the libhugetlbfs linker scripts may not work +correctly. In this case you should upgrade to binutils >= 2.17 and use the +--hugetlbfs-align linking method. + + Linking via gcc: + ---------------- + +In many cases it's normal to link an application by invoking gcc, +which will then invoke the linker with appropriate options, rather +than invoking ld directly. In such cases it's usually best to +convince gcc to invoke the ld.hugetlbfs script instead of the system +linker, rather than modifying your build procedure to invoke the +ld.hugetlbfs directly; the compilers may often add special libraries +or other linker options which can be fiddly to reproduce by hand. +To make this easier, 'make install' will install ld.hugetlbfs into +$PREFIX/share/libhugetlbfs and create an 'ld' symlink to it. + +Then with gcc, you invoke it as a linker with two options: + + -B $PREFIX/share/libhugetlbfs + +This option tells gcc to look in a non-standard location for the +linker, thus finding our script rather than the normal linker. This +can optionally be set in the CFLAGS environment variable. 
+ + -Wl,--hugetlbfs-align +OR -Wl,--hugetlbfs-link=B +OR -Wl,--hugetlbfs-link=BDT + +This option instructs gcc to pass the option after the comma down to the +linker, thus invoking the special behaviour of the ld.hugetlbfs script. This +can optionally be set in the LDFLAGS environment variable. + +If you use a compiler other than gcc, you will need to consult its +documentation to see how to convince it to invoke ld.hugetlbfs in +place of the system linker. + + Running the application: + ------------------------ + +The specially-linked application needs the libhugetlbfs library, so +you might need to set the LD_LIBRARY_PATH environment variable so the +application can locate libhugetlbfs.so. Depending on the method used to link +the application, the HUGETLB_ELFMAP environment variable can be used to control +how hugepages will be used. + + When using --hugetlbfs-link: + ---------------------------- + +The custom linker script determines which segments may be remapped into +hugepages and this remapping will occur by default. The following setting will +disable remapping entirely: + + HUGETLB_ELFMAP=no + + When using --hugetlbfs-align: + ----------------------------- + +This method of linking an application permits greater flexibility at runtime. +Using HUGETLB_ELFMAP, it is possible to control which program segments are +placed in hugepages. 
The following four settings will cause the indicated +segments to be placed in hugepages: + + HUGETLB_ELFMAP=R Read-only segments (text) + HUGETLB_ELFMAP=W Writable segments (data/BSS) + HUGETLB_ELFMAP=RW All segments (text/data/BSS) + HUGETLB_ELFMAP=no No segments + +It is possible to select specific huge page sizes for read-only and writable +segments by using the following advanced syntax: + + HUGETLB_ELFMAP=[R[=<pagesize>]:[W[=<pagesize>]] + +For example: + + Place read-only segments into 64k pages and writable into 16M pages + HUGETLB_ELFMAP=R=64k:W=16M + + Use the default for read-only segments, 1G pages for writable segments + HUGETLB_ELFMAP=R:W=1G + + Use 16M pages for writable segments only + HUGETLB_ELFMAP=W=16M + + Default remapping behavior: + --------------------------- + +If --hugetlbfs-link was used to link an application, the chosen remapping mode +is saved in the binary and becomes the default behavior. Setting +HUGETLB_ELFMAP=no will disable all remapping and is the only way to modify the +default behavior. + +For applications linked with --hugetlbfs-align, the default behavior is to not +remap any segments into huge pages. To set or display the default remapping +mode for a binary, the included hugeedit command can be used: + +hugeedit [options] target-executable + options: + --text,--data Remap the specified segment into huge pages by default + --disable Do not remap any segments by default + +When target-executable is the only argument, hugeedit will display the default +remapping mode without making any modifications. + +When a binary is remapped according to its default remapping policy, the +system default huge page size will be used. 
+ + Environment variables: + ---------------------- + +There are a number of private environment variables which can affect +libhugetlbfs: + HUGETLB_DEFAULT_PAGE_SIZE + Override the system default huge page size for all uses + except hugetlb-backed shared memory + + HUGETLB_RESTRICT_EXE + By default, libhugetlbfs will act on any program that it + is loaded with, either via LD_PRELOAD or by explicitly + linking with -lhugetlbfs. + + There are situations in which it is desirable to restrict + libhugetlbfs' actions to specific programs. For example, + some ISV applications are wrapped in a series of scripts + that invoke bash, python, and/or perl. It is more + convenient to set the environment variables related + to libhugetlbfs before invoking the wrapper scripts, + yet this has the unintended and undesirable consequence + of causing the script interpreters to use and consume + hugepages. There is no obvious benefit to causing the + script interpreters to use hugepages, and there is a + clear disadvantage: fewer hugepages are available to + the actual application. + + To address this scenario, set HUGETLB_RESTRICT_EXE to a + colon-separated list of programs to which the other + libhugetlbfs environment variables should apply. (If + not set, libhugetlbfs will attempt to apply the requested + actions to all programs.) For example, + + HUGETLB_RESTRICT_EXE="hpcc:long_hpcc" + + will restrict libhugetlbfs' actions to programs named + /home/fred/hpcc and /bench/long_hpcc but not /usr/hpcc_no. 
+ + HUGETLB_ELFMAP + Control or disable segment remapping (see above) + + HUGETLB_MINIMAL_COPY + If equal to "no", the entire segment will be copied; + otherwise, only the necessary parts will be, which can + be much more efficient (default) + + HUGETLB_FORCE_ELFMAP + Explained in "Partial segment remapping" + + HUGETLB_MORECORE + HUGETLB_MORECORE_HEAPBASE + HUGETLB_NO_PREFAULT + Explained in "Using hugepages for malloc() + (morecore)" + + HUGETLB_VERBOSE + Specify the verbosity level of debugging output from 1 + to 99 (default is 1) + HUGETLB_PATH + Specify the path to the hugetlbfs mount point + HUGETLB_SHARE + Explained in "Sharing remapped segments" + HUGETLB_DEBUG + Set to 1 if an application segfaults. Gives very detailed output + and runs extra diagnostics. + + Sharing remapped segments: + -------------------------- + +By default, libhugetlbfs uses anonymous, unlinked hugetlbfs files +to store remapped program segment data. This means that if the same +program is started multiple times using hugepage segments, multiple +huge pages will be used to store the same program data. + +To reduce this wastage, libhugetlbfs can be instructed to allow +sharing segments between multiple invocations of a program. To do +this, the HUGETLB_SHARE variable must be set for all the +processes in question. This variable has two possible values: + anything but 1: the default, indicates no segments should be shared + 1: indicates that read-only segments (i.e. the program text, +in most cases) should be shared, read-write segments (data and bss) +will not be shared. + +If the HUGETLB_MINIMAL_COPY variable is set for any program using +shared segments, it must be set to the same value for all invocations +of that program. + +Segment sharing is implemented by creating persistent files in a +hugetlbfs containing the necessary segment data. 
By default, these +files are stored in a subdirectory of the first located hugetlbfs +filesystem, named 'elflink-uid-XXX' where XXX is the uid of the +process using sharing. This directory must be owned by the uid in +question, and have mode 0700. If it doesn't exist, libhugetlbfs will +create it automatically. This means that (by default) separate +invocations of the same program by different users will not share huge +pages. + +The location for storing the hugetlbfs page files can be changed by +setting the HUGETLB_SHARE_PATH environment variable. If set, this +variable must contain the path of an accessible, already created +directory located in a hugetlbfs filesystem. The owner and mode of +this directory are not checked, so this method can be used to allow +processes of multiple uids to share huge pages. IMPORTANT SECURITY +NOTE: any process sharing hugepages can insert arbitrary executable +code into any other process sharing hugepages in the same directory. +Therefore, when using HUGETLB_SHARE_PATH, the directory created *must* +allow access only to a set of uids who are mutually trusted. + +The files created in hugetlbfs for sharing are persistent, and must be +manually deleted to free the hugepages in question. Future versions +of libhugetlbfs should include tools and scripts to automate this +cleanup. + + Partial segment remapping + ------------------------- + +libhugetlbfs has limited support for remapping a normal, non-relinked +binary's data, text and BSS into hugepages. To enable this feature, +HUGETLB_FORCE_ELFMAP must be set to "yes". + +Partial segment remapping is not guaranteed to work. 
Most importantly, a +binary's segments must be large enough even when not relinked by +libhugetlbfs: + + architecture address minimum segment size + ------------ ------- -------------------- + i386, x86_64 all hugepage size + ppc32 all 256M + ppc64 0-4G 256M + ppc64 4G-1T 1020G + ppc64 1T+ 1T + +The raw size, though, is not sufficient to indicate if the code will +succeed, due to alignment. Since the binary is not relinked, however, +this is relatively straightforward to 'test and see'. + +NOTE: You must use LD_PRELOAD to load libhugetlbfs.so when using +partial remapping. + + +Examples +======== + +Example 1: Application Developer +--------------------------------- + +To have a program use hugepages, complete the following steps: + +1. Make sure you are working with kernel 2.6.16 or greater. + +2. Modify the build procedure so your application is linked against +libhugetlbfs. + +For the remapping, you link against the library with the appropriate +linker script (if necessary or desired). Linking against the library +should result in transparent usage of hugepages. + +Example 2: End Users and System Administrators +----------------------------------------------- + +To have an application use libhugetlbfs, complete the following steps: + +1. Make sure you are using kernel 2.6.16. + +2. Make sure the library is in the path, which you can set with the +LD_LIBRARY_PATH environment variable. You might need to set other +environment variables, including LD_PRELOAD as described above. + + +Troubleshooting +=============== + +The library has a certain amount of debugging code built in, which can +be controlled with the environment variable HUGETLB_VERBOSE. By +default the debug level is "1" which means the library will only print +relatively serious error messages. Setting HUGETLB_VERBOSE=2 or +higher will enable more debug messages (at present 2 is the highest +debug level, but that may change). 
Setting HUGETLB_VERBOSE=0 will +silence the library completely, even in the case of errors - the only +exception is in cases where the library has to abort(), which can +happen if something goes wrong in the middle of unmapping and +remapping segments for the text/data/bss feature. + +If an application fails to run, set the environment variable HUGETLB_DEBUG +to 1. This causes additional diagnostics to be run. This information should +be included when sending bug reports to the libhugetlbfs team. + +Specific Scenarios: +------------------- + +ISSUE: When using the --hugetlbfs-align or -zmax-page-size link options, the + linker complains about truncated relocations and the build fails. + +TRY: Compile the program with the --relax linker option. Either add + -Wl,--relax to CFLAGS or --relax to LDFLAGS. + +ISSUE: When using the xB linker script with a 32 bit binary on an x86 host with + NX support enabled, the binary segfaults. + +TRY: Recompiling with the --hugetlbfs-align options and use the new relinking + method or booting your kernel with noexec32=off. + + +Trademarks +========== + +This work represents the view of the author and does not necessarily +represent the view of IBM. + +PowerPC is a registered trademark of International Business Machines +Corporation in the United States, other countries, or both. Linux is +a trademark of Linus Torvalds in the United States, other countries, +or both. diff --git a/LGPL-2.1 b/LGPL-2.1 new file mode 100644 index 0000000..2d2d780 --- /dev/null +++ b/LGPL-2.1 @@ -0,0 +1,510 @@ + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. 
It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations +below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. 
And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. 
It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it +becomes a de-facto standard. To achieve this, non-free programs must +be allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. 
This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control +compilation and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. 
You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) 
+ +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at least + three years, to give the same user the materials specified in + Subsection 6a, above, for a charge no more than the cost of + performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply, and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License +may add an explicit geographical distribution limitation excluding those +countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms +of the ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
+It is safest to attach them to the start of each source file to most +effectively convey the exclusion of warranty; and each file should +have at least the "copyright" line and a pointer to where the full +notice is found. + + + + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or +your school, if any, to sign a "copyright disclaimer" for the library, +if necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James + Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+ + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..51e41f0 --- /dev/null +++ b/Makefile @@ -0,0 +1,471 @@ +PREFIX ?= /usr/local +EXEDIR ?= /bin + +LIBOBJS = hugeutils.o version.o init.o morecore.o debug.o alloc.o shm.o kernel-features.o +LIBPUOBJS = init_privutils.o debug.o hugeutils.o kernel-features.o +INSTALL_OBJ_LIBS = libhugetlbfs.so libhugetlbfs.a libhugetlbfs_privutils.so +BIN_OBJ_DIR=obj +INSTALL_BIN = hugectl hugeedit hugeadm pagesize +INSTALL_HELPER = huge_page_setup_helper.py +INSTALL_PERLMOD = DataCollect.pm OpCollect.pm PerfCollect.pm Report.pm +INSTALL_HEADERS = hugetlbfs.h +INSTALL_MAN1 = ld.hugetlbfs.1 pagesize.1 +INSTALL_MAN3 = get_huge_pages.3 get_hugepage_region.3 gethugepagesize.3 \ + gethugepagesizes.3 getpagesizes.3 hugetlbfs_find_path.3 \ + hugetlbfs_test_path.3 hugetlbfs_unlinked_fd.3 +INSTALL_MAN7 = libhugetlbfs.7 +INSTALL_MAN8 = hugectl.8 hugeedit.8 hugeadm.8 cpupcstat.8 +LDSCRIPT_TYPES = B BDT +LDSCRIPT_DIST_ELF = elf32ppclinux elf64ppc elf_i386 elf_x86_64 +INSTALL_OBJSCRIPT = ld.hugetlbfs +VERSION=version.h +SOURCE = $(shell find . -maxdepth 1 ! 
-name version.h -a -name '*.[h]') +SOURCE += *.c *.lds Makefile +NODEPTARGETS= + +INSTALL = install + +LDFLAGS += -ldl +CFLAGS ?= -O2 -g +CFLAGS += -Wall -fPIC +CPPFLAGS += -D__LIBHUGETLBFS__ + +ARCH ?= $(shell uname -m | sed -e s/i.86/i386/) +CC ?= gcc + +CUSTOM_LDSCRIPTS = yes + +ifeq ($(ARCH),ppc64) +CC64 = $(CC) -m64 +ELF64 = elf64ppc +TMPLIB64 = lib64 +TMPLIB32 = lib +ifneq ($(BUILDTYPE),NATIVEONLY) +CC32 = $(CC) -m32 +ELF32 = elf32ppclinux +endif +else +ifeq ($(ARCH),ppc64le) +CC64 = $(CC) -m64 +ELF64 = elf64lppc +TMPLIB64 = lib64 +CUSTOM_LDSCRIPTS = no +else +ifeq ($(ARCH),ppc) +CC32 = $(CC) -m32 +ELF32 = elf32ppclinux +TMPLIB32 = lib +CPPFLAGS += -DPPC_NO_SEGMENTS +else +ifneq (,$(findstring arm,$(ARCH))) +CC32 = $(CC) +TMPLIB32 = lib +ELF32 += armelf_linux_eabi +CUSTOM_LDSCRIPTS = no +else +ifneq (,$(findstring aarch64,$(ARCH))) +CC64 = $(CC) +ELF64 = aarch64elf +TMPLIB64 = lib64 +CUSTOM_LDSCRIPTS = no +else +ifneq (,$(filter i386 i486 i586 i686,$(ARCH))) +CC32 = $(CC) +ELF32 = elf_i386 +TMPLIB32 = lib +else +ifeq ($(ARCH),x86_64) +CC64 = $(CC) -m64 +ELF64 = elf_x86_64 +TMPLIB64 = lib64 +TMPLIB32 = lib +ifneq ($(BUILDTYPE),NATIVEONLY) +CC32 = $(CC) -m32 +ELF32 = elf_i386 +endif +else +ifeq ($(ARCH),ia64) +CC64 = $(CC) +TMPLIB64 = lib64 +CFLAGS += -DNO_ELFLINK +else +ifeq ($(ARCH),sparc64) +CC64 = $(CC) -m64 +TMPLIB64 = lib64 +CFLAGS += -DNO_ELFLINK +else +ifeq ($(ARCH),s390x) +CC64 = $(CC) -m64 +ELF64 = elf64_s390 +TMPLIB64 = lib64 +TMPLIB32 = lib +CUSTOM_LDSCRIPTS = no +ifneq ($(BUILDTYPE),NATIVEONLY) +CC32 = $(CC) -m31 +ELF32 = elf_s390 +endif +else +ifeq ($(ARCH),s390) +CC32 = $(CC) -m31 +ELF32 = elf_s390 +TMPLIB32 = lib +CUSTOM_LDSCRIPTS = no +else +$(error "Unrecognized architecture ($(ARCH))") +endif +endif +endif +endif +endif +endif +endif +endif +endif +endif +endif + +ifdef CC32 +OBJDIRS += obj32 +endif +ifdef CC64 +OBJDIRS += obj64 +endif + +ifdef CC64 +CCBIN = $(CC64) +else +CCBIN = $(CC32) +endif + +ifdef ELF32 +LIBOBJS32 = obj32/elflink.o 
obj32/sys-$(ELF32).o +endif +ifdef ELF64 +LIBOBJS64 = obj64/elflink.o obj64/sys-$(ELF64).o +endif +ifeq ($(ELF32),elf32ppclinux) +LIBOBJS32 += obj32/$(ELF32).o +endif +ifeq ($(ELF64),elf64ppc) +LIBOBJS64 += obj64/$(ELF64).o +endif +ifeq ($(ELF64),elf64lppc) +LIBOBJS64 += obj64/$(ELF64).o +endif +LIBOBJS32 += $(LIBOBJS:%=obj32/%) +LIBOBJS64 += $(LIBOBJS:%=obj64/%) + +ifeq ($(LIB32),) +LIB32 = $(TMPLIB32) +endif + +ifdef TMPLIB64 +ifeq ($(LIB64),) +LIB64 = $(TMPLIB64) +endif +endif + +ifeq ($(CUSTOM_LDSCRIPTS),yes) +TEST_LDSCRIPTS = -l +endif + +# If TMPLIB64 is set, then sure we are not resolving LIB32 and LIB64 to the +# same place +ifdef TMPLIB64 + +REALLIB32 = $(realpath $(PREFIX)/$(LIB32)) +REALLIB64 = $(realpath $(PREFIX)/$(LIB64)) +ifneq ($(realpath $(PREFIX)),) +ifeq ($(REALLIB32),$(REALLIB64)) +$(error LIB32 ($(PREFIX)/$(LIB32) to $(REALLIB32)) and LIB64 ($(PREFIX)/$(LIB64) to $(REALLIB64)) are resolving to the same place. Manually specify LIB32 and LIB64. e.g. make PREFIX=$(PREFIX) LIB32=lib32 LIB64=lib64) +endif +endif + +endif + +HEADERDIR = $(PREFIX)/include +LIBDIR32 = $(PREFIX)/$(LIB32) +LIBDIR64 = $(PREFIX)/$(LIB64) +LDSCRIPTDIR = $(PREFIX)/share/libhugetlbfs/ldscripts +BINDIR = $(PREFIX)/share/libhugetlbfs +EXEDIR = $(PREFIX)/bin +DOCDIR = $(PREFIX)/share/doc/libhugetlbfs +MANDIR1 = $(PREFIX)/share/man/man1 +MANDIR3 = $(PREFIX)/share/man/man3 +MANDIR7 = $(PREFIX)/share/man/man7 +MANDIR8 = $(PREFIX)/share/man/man8 + +ifdef LIB32 +LIBPATHS += -DLIB32='"$(LIB32)"' -DLIBDIR32='"$(LIBDIR32)"' +endif +ifdef LIB64 +LIBPATHS += -DLIB64='"$(LIB64)"' -DLIBDIR64='"$(LIBDIR64)"' +endif + +EXTRA_DIST = \ + README \ + HOWTO \ + LGPL-2.1 + +INSTALL_LDSCRIPTS = $(foreach type,$(LDSCRIPT_TYPES),$(LDSCRIPT_DIST_ELF:%=%.x$(type))) + +ifdef V +VECHO = : +else +VECHO = echo " " +ARFLAGS = rc +.SILENT: +endif + +DEPFILES = $(LIBOBJS:%.o=%.d) + +export ARCH +export OBJDIRS +export CC32 +export CC64 +export ELF32 +export ELF64 +export LIBDIR32 +export LIBDIR64 +export 
CUSTOM_LDSCRIPTS + +all: libs tests tools + +.PHONY: tests libs + +libs: $(foreach file,$(INSTALL_OBJ_LIBS),$(OBJDIRS:%=%/$(file))) $(BIN_OBJ_DIR)/libhugetlbfs_privutils.a + +tests: libs # Force make to build the library first +tests: tests/all + +tests/%: libs + $(MAKE) -C tests $* + +tools: $(foreach file,$(INSTALL_BIN),$(BIN_OBJ_DIR)/$(file)) + +check: all + cd tests; ./run_tests.py $(TEST_LDSCRIPTS) + +checkv: all + cd tests; ./run_tests.py -vV $(TEST_LDSCRIPTS) + +func: all + cd tests; ./run_tests.py -t func $(TEST_LDSCRIPTS) + +funcv: all + cd tests; ./run_tests.py -t func -vV $(TEST_LDSCRIPTS) + +stress: all + cd tests; ./run_tests.py -t stress + +stressv: all + cd tests; ./run_tests.py -t stress -vV + +# Don't want to remake objects just 'cos the directory timestamp changes +$(OBJDIRS): %: + @mkdir -p $@ + +# +$(VERSION): always + @$(VECHO) VERSION + ./localversion version $(SOURCE) +always: +# + +snapshot: $(VERSION) + +.SECONDARY: + +obj32/%.o: %.c + @$(VECHO) CC32 $@ + @mkdir -p obj32 + $(CC32) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +obj64/%.o: %.c + @$(VECHO) CC64 $@ + @mkdir -p obj64 + $(CC64) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +obj32/%.o: %.S + @$(VECHO) AS32 $@ + @mkdir -p obj32 + $(CC32) $(CPPFLAGS) -o $@ -c $< + +obj64/%.o: %.S + @$(VECHO) AS64 $@ + @mkdir -p obj64 + $(CC64) $(CPPFLAGS) -o $@ -c $< + +obj32/libhugetlbfs.a: $(LIBOBJS32) + @$(VECHO) AR32 $@ + $(AR) $(ARFLAGS) $@ $^ + +obj64/libhugetlbfs.a: $(LIBOBJS64) + @$(VECHO) AR64 $@ + $(AR) $(ARFLAGS) $@ $^ + +obj32/libhugetlbfs.so: $(LIBOBJS32) + @$(VECHO) LD32 "(shared)" $@ + $(CC32) $(LDFLAGS) -Wl,--version-script=version.lds -Wl,-soname,$(notdir $@) -shared -o $@ $^ $(LDLIBS) + +obj64/libhugetlbfs.so: $(LIBOBJS64) + @$(VECHO) LD64 "(shared)" $@ + $(CC64) $(LDFLAGS) -Wl,--version-script=version.lds -Wl,-soname,$(notdir $@) -shared -o $@ $^ $(LDLIBS) + +#obj32/libhugetlbfs_privutils.a: $(LIBPUOBJS:%=obj32/%) +# @$(VECHO) AR32 $@ +# $(AR) $(ARFLAGS) $@ $^ +# +#obj64/libhugetlbfs_privutils.a: 
$(LIBPUOBJS:%=obj64/%) +# @$(VECHO) AR64 $@ +# $(AR) $(ARFLAGS) $@ $^ + +$(BIN_OBJ_DIR)/libhugetlbfs_privutils.a: $(LIBPUOBJS:%=$(BIN_OBJ_DIR)/%) + @$(VECHO) ARHOST $@ + $(AR) $(ARFLAGS) $@ $^ + +obj32/libhugetlbfs_privutils.so: $(LIBPUOBJS:%=obj32/%) + @$(VECHO) LD32 "(shared)" $@ + $(CC32) $(LDFLAGS) -Wl,--version-script=privutils.lds -Wl,-soname,$(notdir $@) -shared -o $@ $^ $(LDLIBS) + +obj64/libhugetlbfs_privutils.so: $(LIBPUOBJS:%=obj64/%) + @$(VECHO) LD64 "(shared)" $@ + $(CC64) $(LDFLAGS) -Wl,--version-script=privutils.lds -Wl,-soname,$(notdir $@) -shared -o $@ $^ $(LDLIBS) + +obj32/%.i: %.c + @$(VECHO) CPP $@ + $(CC32) $(CPPFLAGS) -E $< > $@ + +obj64/%.i: %.c + @$(VECHO) CPP $@ + $(CC64) $(CPPFLAGS) -E $< > $@ + +obj32/%.s: %.c + @$(VECHO) CC32 -S $@ + $(CC32) $(CPPFLAGS) $(CFLAGS) -o $@ -S $< + +obj64/%.s: %.c + @$(VECHO) CC64 -S $@ + $(CC64) $(CPPFLAGS) $(CFLAGS) -o $@ -S $< + +$(BIN_OBJ_DIR)/%.o: %.c + @$(VECHO) CCHOST $@ + @mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) $(LIBPATHS) -o $@ -c $< + +$(BIN_OBJ_DIR)/hugectl: $(BIN_OBJ_DIR)/hugectl.o + @$(VECHO) LDHOST $@ + mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +$(BIN_OBJ_DIR)/hugeedit: $(BIN_OBJ_DIR)/hugeedit.o + @$(VECHO) LDHOST $@ + mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) $(LIBPATHS) -o $@ $^ $(LDFLAGS) + +HUGEADM_OBJ=hugeadm.o libhugetlbfs_privutils.a +$(BIN_OBJ_DIR)/hugeadm: $(foreach file,$(HUGEADM_OBJ),$(BIN_OBJ_DIR)/$(file)) + @$(VECHO) LDHOST $@ + mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) $(LIBPATHS) -o $@ $^ $(LDFLAGS) + +PAGESIZE_OBJ=pagesize.o libhugetlbfs_privutils.a +$(BIN_OBJ_DIR)/pagesize: $(foreach file,$(PAGESIZE_OBJ),$(BIN_OBJ_DIR)/$(file)) + @$(VECHO) LDHOST $@ + mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) $(LIBPATHS) -o $@ $^ $(LDFLAGS) + +clean: + @$(VECHO) CLEAN + rm -f *~ *.o *.so *.a *.d *.i core a.out $(VERSION) + rm -rf obj* + rm -f ldscripts/*~ + rm -f libhugetlbfs-sock + $(MAKE) -C 
tests clean + +%.d: %.c $(VERSION) + @$(CC) $(CPPFLAGS) -MM -MT "$(foreach DIR,$(OBJDIRS),$(DIR)/$*.o) $@" $< > $@ + +# Workaround: Don't build dependencies for certain targets +# When the include below is executed, make will use the %.d target above to +# generate missing files. For certain targets (clean, version.h, etc) we don't +# need or want these dependency files, so don't include them in this case. +ifeq (,$(findstring <$(MAKECMDGOALS)>,$(NODEPTARGETS))) +-include $(DEPFILES) +endif + +obj32/install: + @$(VECHO) INSTALL-LIB32 $(LIBDIR32) + $(INSTALL) -d $(DESTDIR)$(LIBDIR32) + $(INSTALL) $(INSTALL_OBJ_LIBS:%=obj32/%) $(DESTDIR)$(LIBDIR32) + +obj64/install: + @$(VECHO) INSTALL-LIB64 $(LIBDIR64) + $(INSTALL) -d $(DESTDIR)$(LIBDIR64) + $(INSTALL) $(INSTALL_OBJ_LIBS:%=obj64/%) $(DESTDIR)$(LIBDIR64) + +objscript.%: % + @$(VECHO) OBJSCRIPT $* + sed "s!### SET DEFAULT LDSCRIPT PATH HERE ###!HUGETLB_LDSCRIPT_PATH=$(LDSCRIPTDIR)!;s!### SET CUSTOM_LDSCRIPTS HERE ###!CUSTOM_LDSCRIPTS=\"$(CUSTOM_LDSCRIPTS)\"!" 
< $< > $@ + +install-libs: libs $(OBJDIRS:%=%/install) $(INSTALL_OBJSCRIPT:%=objscript.%) + $(INSTALL) -d $(DESTDIR)$(HEADERDIR) + $(INSTALL) -d $(DESTDIR)$(LDSCRIPTDIR) + $(INSTALL) -d $(DESTDIR)$(BINDIR) + $(INSTALL) -m 644 -t $(DESTDIR)$(HEADERDIR) $(INSTALL_HEADERS) + $(INSTALL) -m 644 $(INSTALL_LDSCRIPTS:%=ldscripts/%) $(DESTDIR)$(LDSCRIPTDIR) + for x in $(INSTALL_OBJSCRIPT); do \ + $(INSTALL) -m 755 objscript.$$x $(DESTDIR)$(BINDIR)/$$x; done + cd $(DESTDIR)$(BINDIR) && ln -sf ld.hugetlbfs ld + +install-man: + @$(VECHO) INSTALL_MAN $(DESTDIR)manX + $(INSTALL) -d $(DESTDIR)$(MANDIR1) + $(INSTALL) -d $(DESTDIR)$(MANDIR3) + $(INSTALL) -d $(DESTDIR)$(MANDIR7) + $(INSTALL) -d $(DESTDIR)$(MANDIR8) + for x in $(INSTALL_MAN1); do \ + $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR1); \ + gzip -f $(DESTDIR)$(MANDIR1)/$$x; \ + done + for x in $(INSTALL_MAN3); do \ + $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR3); \ + gzip -f $(DESTDIR)$(MANDIR3)/$$x; \ + done + rm -f $(DESTDIR)$(MANDIR3)/free_huge_pages.3.gz + rm -f $(DESTDIR)$(MANDIR3)/free_hugepage_region.3.gz + rm -f $(DESTDIR)$(MANDIR3)/hugetlbfs_unlinked_fd_for_size.3.gz + rm -f $(DESTDIR)$(MANDIR3)/hugetlbfs_find_path_for_size.3.gz + ln -s get_huge_pages.3.gz $(DESTDIR)$(MANDIR3)/free_huge_pages.3.gz + ln -s get_hugepage_region.3.gz $(DESTDIR)$(MANDIR3)/free_hugepage_region.3.gz + ln -s hugetlbfs_unlinked_fd.3.gz $(DESTDIR)$(MANDIR3)/hugetlbfs_unlinked_fd_for_size.3.gz + ln -s hugetlbfs_find_path.3.gz $(DESTDIR)$(MANDIR3)/hugetlbfs_find_path_for_size.3.gz + for x in $(INSTALL_MAN7); do \ + $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR7); \ + gzip -f $(DESTDIR)$(MANDIR7)/$$x; \ + done + for x in $(INSTALL_MAN8); do \ + $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR8); \ + gzip -f $(DESTDIR)$(MANDIR8)/$$x; \ + done + +install-bin: + @$(VECHO) INSTALL_BIN $(DESTDIR)$(EXEDIR) + $(INSTALL) -d $(DESTDIR)$(EXEDIR) + for x in $(INSTALL_BIN); do \ + $(INSTALL) -m 755 $(BIN_OBJ_DIR)/$$x $(DESTDIR)$(EXEDIR); done + +install: 
install-libs install-bin install-man + +install-helper: + @$(VECHO) INSTALL_HELPER $(DESTDIR)$(EXEDIR) + $(INSTALL) -d $(DESTDIR)$(EXEDIR) + for x in $(INSTALL_HELPER); do \ + $(INSTALL) -m 755 $$x $(DESTDIR)$(EXEDIR); done + +install-docs: + $(INSTALL) -d $(DESTDIR)$(DOCDIR) + for x in $(EXTRA_DIST); do $(INSTALL) -m 755 $$x $(DESTDIR)$(DOCDIR)/$$x; done + +install-tests: tests install # Force make to build tests and install the library first + ${MAKE} -C tests install DESTDIR=$(DESTDIR) OBJDIRS="$(OBJDIRS)" LIB32=$(LIB32) LIB64=$(LIB64) diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..28713a9 --- /dev/null +++ b/NEWS @@ -0,0 +1,465 @@ +libhugetlbfs 2.21 "The Bloor Annex" +====================================================================== +New Features +* Suport for 512M huge pages on aarch64 + +Bug Fixes +* The noexec stack markers are set directly +* We no longer lie to glibc about shrinking the heap by less than HPAGE_SIZE + +Test Suite +* No existent tests are no longer marked Killed By Signal +* Disable malloc per-thread cache for heap shrinking tests + +libhugetlbfs 2.20 "View Across the Charles" +====================================================================== +Bug Fixes +* Fix keyword collisions and warnings from GCC 5 +* hugeadm output is parsed for default size in huge_page_setup_helper + +Test Suite +* fallocate tests + +libhugetlbfs 2.19 "Missing Longfellow Lanes" +====================================================================== +Bug Fixes +* Fixed hugeadm behavior when mtab is a symlink +* fix plt_extrasz() on ppc64le +* ARM: fix page size and text offset setup +* Fix hugeadm handling of negative pool resize amounts + +Test Suite +* remove duplicate mremap-* tests + +libhugetlbfs 2.18 "Infinite Corridor" +====================================================================== +New Features +* Add support for ppc64le +* ARM 64 big endian support + +Bug Fixes +* Reverted 3bdd9924 which broke building on PPC64 +* Fix parsing of 
kernel version string +* ARM: mark direct_syscall as a FUNC + +Test Suite +* mmap of huge page files with misaligned offset success treated as PASS + +libhugetlbfs 2.17 "Fowl and Fetus" +====================================================================== +New Features +* PPC segement alignment restrictions can be disabled +* Added Aarch64 support + +Bug Fixes +* Allow compiler overrides for 64 and 32 bit builds +* hugeadm now handles /etc/mtab being a simlink properly + +Test Suite +* Add corrupt-by-cow-opt test +* Add noresv-preserve-resv-page test +* Add noresv-regarded-as-resv test + +libhugetlbfs 2.16 "Joe E. Parker" +====================================================================== +New Features +* ARM Support +* s390x Dynamic TASK_SIZE support + +Bug Fixes +* find_mounts() now properly NULL terminates mount point names + +Test Suite +* ARM Support +* mremap-fixed-huge-near-normal no longer calls munmap with 0 length + +libhugetlbfs 2.15 "Minature Panda" +====================================================================== +New Features +* Add variable in Makefile to disable using deprecated linker scripts +* s390 support + +Bug Fixes +* Disable Unable to verify address range warning when offset < page_size +* Remove sscanf in library setup to avoid heap allocation before _morecore + override +* Revert heap exhaustion patch +* hugectl no longer clips LD_LIBRARY_PATH variable + + +Test Suite +* Fix mremap-expand-slice-collision expanding anon shm +* mremap-expand-slice-collision now asks kernel for availble slices +* Make 4GB boundary tests only fail when allocation fails on free slice +* Link 4GB test cases with --static so libraries are not placed in the way + +libhugetlbfs 2.14 "The White Album" +====================================================================== +New Features +* Updated man pages +* Added basic events for core_i7 to oprofile_map_events + +Bug Fixes +* Fix clean on failure code to avoid closing stdout + +Test Suite +* Fixed 
readahead, malloc, and hugepagesizes tests +* Avoid core dumps on stack_grow_into_huge test + +libhugetlbfs 2.13 "Insert Clever Title Here" +====================================================================== +New Features +* hugeadm can now be used to control Transparent Huge Page tunables +* New morecore mode to better support THP + +Bug Fixes +* Check permissions on hugetlbfs mount point before marking it as available + +Test Suite +* Fix shm tests to use random address instead of fixed, old address failed + on ARM + +libhugetlbfs 2.12 "Serrano" +====================================================================== +New Features +* libhugetlbfs usage can now be restricted to certain binary names +* lihugetlbfs now supports static linking +* hugeadm uses more human readable directory names for mount points + +Bug Fixes +* hugeadm would segfault if specified user was not in passwd, failure in + getpwuid() is now checked + +Test Suite +* Added missing tests to driver script +* Added tests for static linking + +libhugetlbfs 2.11 "Ghost Pepper" +====================================================================== +New Features +* cpupcstat reports time servicing tlb misses when requested +* When supported by the kernel and glibc, MAP_HUGETLB is used + for the heap and to back memory returned by get_huge_pages. + These features can now be used without mounting hugetlbfs + +Bug Fixes +* tlbmiss_cost.sh supresses oprofile errors +* numerous fixes to setup_helper.py +* Corrected usage of hugetlbfs_test_feature return value +* find_mounts now correctly ignores non-hugetlbfs mount points +* When prefaulting pages for get_huge_pages readv was using the fd + for the mapping, this caused the prefault to fail on older libc. 
+ Now /dev/zero is used for all prefaulting + +libhugetlbfs 2.10 "Another Hottie" +====================================================================== +Bug Fixes +* hugeadm now handles pool size deltas properly +* Makefile uses ?= to assign PREFIX and EXEDIR to allow for easier build + modification + +libhugetlbfs 2.9 "Something Spicy" +====================================================================== +New Features +* Add --no-reseve to hugectl to request mmap'd pages are not reserved + for kernels newer than 2.6.34 +* POWER6 now supported by TLB miss cost estimator +* Add --obey-numa-mempol to hugeadm to request static pool pages are + allocated following the process NUMA memory policy + +Test Suite +* Fix gethugepagesizes test case + +libhugetlbfs 2.8 "Scotch Bonnet" +====================================================================== +New Features +* Add switch to let administrator limit new mount points by size or inodes +* cpupcstat now caches the value returned by tlmiss_cost.sh to avoid + rerunning the script + +Bug Fixes +* errno values are saved in get_huge_pages +* tlbmiss_cost.sh patches calibrator to fix round naming collision +* Fixed ALIGN_UP macro for aligning huge page segments +* Fix --create-mounts switch in hugeadm +* Library and helpers are all linked with -z noexecstack + +Test Suite +* run_tests.py detects valid word sizes + +libhugetlbfs 2.7 "Adobo" +====================================================================== +New Features +* When specifying huge page pool sizes with hugeadm, memory sizes can + be used as well as the number of huge pages +* DEFAULT is now a valid huge page pool for resizing, it will adjust + the pool for the default huge page size +* tlbmiss_cost.sh in the contrib/ sub directory will estimate the cost + in CPU cycles of a TLB miss on the arch where it is run +* Add python script which automates huge page pool setup with minimal + input required from user + +Bug Fixes +* The --dry-run switch in hugeadm is 
now obeyed +* hugeadm now uses unsigned long long for page resizes to avoid + overflow errors +* --set-recommended-shmmax no longer overflows if the number of + available huge pages is bigger than the address space + +Test Suite +* Updated linkhuge_nofd to override proper functions when testing +* run_tests.py can now monitor the pool sizes between tests to help + identify accounting errors +* Add test for mremap bug on architectures with holes in address space + +libhugetlbfs 2.6 "Adovada" +====================================================================== +New Features +* cpupcstat now supports data collection using the perf tool as well as + oprofile +* --explain reports if min_free_kbytes is too small +* add --set-min_free_kbytes to hugeadm + +Bug Fixes +* Admin utils (hugeadm, hugectl, etc) are now built as 64 bit binaries + if possible to support adminstration of larger huge page sizes + +Test Suite +* Suppress ld.hugetlbfs warnings during test suite build +* Make SPECIAL keyword test cross-compile safe +* Test script sets proper rlimits for mlock and stack_grow_into_huge + tests +* Ensure that all elflink tests are run with both HUGETLB_SHARE=0 and + HUGETLB_SHARE=1 + +libhugetlbfs 2.5 "Self Titled" +====================================================================== +New Features +* added --add-ramdisk-swap option to hugeadm to use ramdisks as + temporary swap space for diskless systems +* added --persist option to hugeadm to be used with either --add-*-swap + option. 
Makes swap added stay until removed or the machine is rebooted +* added cpupcstat script which uses oprofile to monitor tlb miss rate + of a target program + +Bug Fixes +* --add-temp-swap option now takes an optional integer argument that is + the size in number of hugepages to make the swap space + +libhugetlbfs 2.4 "Birdseye" +====================================================================== +New Features +* added --add-temp-swap option to hugeadm to add a swap file for a pool + resize +* added --[enable|disable]-zone-moveable options to hugeadm to control + /proc/sys/vm/hugepages_treat_as_movable + +Bug Fixes +* Fix pool-pages-max processing by using the proper array for its + requests +* Move private reservation check out of morecore setup + +Test Suite +* Added regression tests for leaking reserve count due to madvise and + fadvise and readahead +* Add test for mixed permissions on shm segments +* Wrap tests that can hang the machine to fail is kernel is too old +* Add -f option to run_tests.py to force running of tests that can hang + the machine + +libhugetlbfs 2.3 "NuMex Sunglo" +====================================================================== +New Features +* added --force-preload option to hugectl for backing segments with + 64kb pages on ppc64 when app was not linked with libhugetlbfs +* added --explain swtich to hugadm to give a quick overview of the + system wrt huge pages +* hugeadm warns if min pool size is being adjusted without sufficient + swap space configured +* added --hard switch to ask hugeadm to try multiple times to resize + a huge page pool +* added --create-*-mounts switches to create mount points for hugetlbfs + usable by specific users, groups, or globally + +Bug Fixes +* hugeadm will no longer mount a directory multiple times +* hugeadm adds all new mount points to /etc/mtab + +libhugetlbfs 2.2 "Barkers Extra Hot" +====================================================================== +New Features +* Refactored environment 
variable parsing to read once and store values +* Add --list-mounts and --list-all-mounts options to hugeadm +* Rework test suite to run for all avaialbe page sizes +* Add --create-mounts for root only, --create-user-mounts, + --create-group-mounts, and --create-global-mounts options to hugeadm +* Add --share-text option to hugectl + +Test Suite Fixes +* Added wrapper to shm-fork and shm-getraw tests that makes runs on + hpage sizes different from default expected failures +* Reworked shmoverride tests to handle new env parsing + +libhugetlbfs 2.1 "NM 64" +====================================================================== +New Features +* Multiple page size support +* Add a more user friendly allocator that handles cache coloring +* Add pagesize utility to display supported page sizes +* Add hugeadm utility for managing hugepage pools +* Add hugectl utility for running programs using hugepages +* Add hugeedit utility for marking segments in aligned binaries for + huge page backing by default +* Deprecated linker linker scripts +* gethugepagesize() and getpagesizes() API added to programatically + discover supported hugepages +* Manual pages for all API functions and utilities +* Allow automatic backing of shared memory segments with hugepages +* huge page regions will no longer prefault for kernels >= 2.6.27 + improving mmap() performance and NUMA layout + +Bug Fixes +* Add missing segment to interp section in linker scripts +* Fix free_hugepage_region to handle segments that fell back to small + pages +* Fix install when lib32 and lib64 resolve to the same place +* Install header files so APIs can be used +* Fix installation paths to make life easier for package maintainers +* Do not export internal symbols unnecessarily +* Prefault regions allocated by direct allocation API on kernels older + than 2.6.27 +* Gracefully fallback to using base pages for text/data when the + hugepage pool is too small +* Fix handling of HUGETLB_SHARE_PATH environment variable +* 
Relax remapping requirements + +Test suite Fixes +* Added expected failure support +* gethugepagesizes override for getting meminfo +* Increase debug output for tests that fail +* Summarise pass and failure counts + +libhugetlbfs 2.0 "Sandia Red" +====================================================================== + +New Features +* New scriptless relinking for binutils >= 2.17 +* Added direct allocator API for huge pages + +Bug Fixes +* /proc/mounts is parsed line at a time to handle file larger than 4kb +* Read-only mappings use MAP_NORESERVE + +Test suite fixes +* Add test for /proc/mounts file larger than 4kb +* Fix quota test with private reservations +* Output strerror on failure +* linkhuge tests are skipped when known to be broken + +libhugetlbfs 1.3 "Big Jim" +====================================================================== + +New features +* Add HUGETLB_NO_PREFAULT to control prefaulting of huge pages via mlock +* Add "[hostname:pid]" to output messages +* Setup for handling larger huge page sizes e.g. 16G huge pages +* Update for new upstream sysctl +* Add support for hugetlbfs_morecore to shrink the heap + +Bug fixes +* Disable heap shrinking by default to avoid bug in glibc malloc +* Skip elflink calls in setup_libhugetlbfs for IA64 and sparc64 +* Replace gethugepagesize with check_hugepagesize for error checking +* Make morecore argument a long to handle larger page sizes + +Test suite fixes +* Check uid/gid in tests where it matters +* tests: verify there are enough huge pages +* Change tests to read /proc/meminfo +* tests: verify that huge page size isn't too big for the test + +libhugetlbfs 1.2 "Mango Habanero" +====================================================================== + +New features + +* Partial segment remapping. This allows non-relinked binaries to try + to take advantage of libhugetlbfs' segment remapping code. Large + segments are required, especially on Power. 
This feature is useful + for estimating huge page performance, however full relinking will + still perform better. +* Add extra debugging for binaries that may run out of address space. +* Log library version when HUGETLB_VERBOSE is enabled. +* Beginning support for ia64 and sparc64. +* New test to check handling of misaligned mmap() parameters. + +Bug fixes + +* Fix EH_FRAME segment. Fixes some C++ applications. +* Rework PLT detection to work better on Power. +* Add per-target-arch syscall stubs to the library. These provide + reliable error messages from elflink.c if they occur while the + program segments are unmapped. +* Add proper SONAME to shared libs. +* Makefile respects CFLAGS/LDFLAGS/CPPFLAGS environment variables. +* Make mlock() failure non-fatal. + +Test suite fixes + +* Fix zero_filesize_segment test. +* Fix the icache-hygeine testcase for i386 and x86_64. +* Fix SEGVs in task-size-overrun test. +* Enable stack_grow_into_huge test, previously skipped. +* Fix test_addr_huge() for anon pages. + +libhugetlbfs 1.1 "Carribbean Jerk" +====================================================================== + +This large release brings several performance improvements + +Security + +* Remove the sharing daemon completely and rework the segment sharing + code. Permissions on the hugetlbfs mountpoint are used to enforce + segment sharing. + +Bug fixes + +* Sharing of writable segments is no longer possible, due to address + space randomization on x86_64 (although similar issues could happen on + any architecture). +* minimal_copy detection should work better in this release. + +Trivial but notable changes + +* Testcase updates + +libhugetlbfs 1.0.1 "Spicy Garlic" +====================================================================== + +This small maintenance release brings a security fix, a few minor bug +fixes, plus some documentation and error message updates. 
+ +Security + +* A section on security has been added to the README file +* The hugetlbd daemon socket has been moved from /tmp to /var/run. + This will require the daemon to be run as root, which was previously + just a recommendation. + +Bug fixes + +* Reduce reserved huge pages needed for application start-up +* PPC linker script fixes + +Trivial but notable changes + +* Always install linker scripts for all targets +* Error message updates +* Add documentation on HUGETLB_DEBUG +* Testcase updates + +libhugetlbfs 1.0 +====================================================================== + +* First stable release diff --git a/README b/README new file mode 100644 index 0000000..268a66f --- /dev/null +++ b/README @@ -0,0 +1,78 @@ +03 December 2015 -- Yet another mailing list move + +librelist seems to be dead or at least broken. I have recieved several +emails directly saying that patches were posted, but they never got +responses and the archives always seem empty. So we are moving the list +again. This from here on out we will be using + +libhugetlbfs@googlegroups.com + +as our mailing list. Please send patches to this list rather than +creating a pull request. + +03 June 2015 -- libhugetlbfs to find new home + +As part of the fall out from the recent hijacking various "unmaintained" +projects, I no longer wish to host this (or any other) project at +sourceforge. Effective today, the new official home for libhugetlbfs +code is + +https://github.com/libhugetlbfs/libhugetlbfs + +The doubling of the name is unfortunate, but I wanted the repo to belong +to the org of the same name so there it is. + +Please do not submit pull requests, they will be closed with a redirect +to the mailing list (see below) at best, or ignored completely at worst. + +Tarballs of specific releases can still be downloaded using the github +Download ZIP button from the appropriate tag (or branch). 
The mailing +list will now hosted at librelists and can be found at + +libhugetlbfs@librelist.com + +For libhugetlbfs usage, see the HOWTO, for what has changed see NEWS, +and for how to work with the project see SubmittingCode + +10/03/2006 -- libhugetlbfs-1.0 Released + +After roughly one year in development, version 1.0 of libhugetlbfs is here. +It can be downloaded from SourceForge or the OzLabs mirror: + + http://sourceforge.net/project/showfiles.php?group_id=156936 + http://libhugetlbfs.ozlabs.org/snapshots/ + +After a series of preview releases, we have tested a huge array of the +supported usage scenarios using benchmarks and real HPC applications. +Usability and reliability have greatly improved. But... due to the +incredible diversity of applications that exist, there is bound to be a few +that will not work correctly. + +If using libhugetlbfs makes your application slower: + + * Play around with the different combinations of hugetlb malloc and the + two different supported link types to see which combination works best. + + * Keep in mind that huge pages are a niche performance tweak and are not + suitable for every type of application. They are specifically known to + hurt performance in certain situations. + +If you experience problems: + + * You've already read the HOWTO document, but read through it again. It + is full of hints, notes, warnings, and caveats that we have found over + time. This is the best starting point for a quick resolution to your + issue. + + * Make sure you have enough huge pages allocated. Even if you think you + have enough, try increasing it to a number you know you will not use. + + * Set HUGETLB_VERBOSE=99 and HUGETLB_DEBUG=yes. These options increase + the verbosity of the library and enable extra checking to help diagnose + the problem. 
+ +If the above steps do not help, send as much information about the problem +(including all libhugetlbfs debug output) to +libhugetlbfs@googlegroups.com and we'll help out as much as we +can. We will probably ask you to collect things like: straces, +/proc/pid/maps and gdb back traces. diff --git a/SubmittingCode b/SubmittingCode new file mode 100644 index 0000000..b6c03b8 --- /dev/null +++ b/SubmittingCode @@ -0,0 +1,31 @@ +The libhugetlbfs project uses a number of code submission rules from the +Linux kernel. For instance, we follow the coding style document very +closely. Please see: + +http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/CodingStyle?id=HEAD + +For a very indepth look at coding style. + +Also like the Linux kernel, we transact in patches over a mailing list. +The libhugetlbfs mailing is at: + +libhugetlbfs@googlegroups.com + +You do not need to be subscribed to send mail to that list. + +I know that Github provides a nifty pull request tool, please do not use +it. At best your request will be closed with reference to this document +or the mailing list, and at worst it will simply be ignored. + +Libhugetlbfs does not require Copyright assignment to any particular +organization. You (or the organization paying your wages) keep the +copyright for code you write. We do require a Signed-off-by: line at +the end of the changelog. For more on the how and why of this please +see: + +http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/SubmittingPatches?id=HEAD + +And finally, if you have any questions send them to the list. It is low +traffic and it might take you some time to get a response, but messages +sent there will be read eventually. 
+ diff --git a/alloc.c b/alloc.c new file mode 100644 index 0000000..bce9464 --- /dev/null +++ b/alloc.c @@ -0,0 +1,337 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * alloc.c - Simple allocator of regions backed by hugepages + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hugetlbfs.h" +#include "libhugetlbfs_internal.h" + +/* Allocate base pages if huge page allocation fails */ +static void *fallback_base_pages(size_t len, ghp_t flags) +{ + int fd; + void *buf; + INFO("get_huge_pages: Falling back to base pages\n"); + + /* + * Map /dev/zero instead of MAP_ANONYMOUS avoid VMA mergings. Freeing + * pages depends on /proc/pid/maps to find lengths of allocations. + * This is a bit lazy and if found to be costly due to either the + * extra open() or virtual address space usage, we could track active + * mappings in a lock-protected list instead. 
+ */ + fd = open("/dev/zero", O_RDWR); + if (fd == -1) { + ERROR("get_huge_pages: Failed to open /dev/zero for fallback"); + return NULL; + } + + buf = mmap(NULL, len, + PROT_READ|PROT_WRITE, + MAP_PRIVATE, + fd, 0); + if (buf == MAP_FAILED) { + WARNING("Base page fallback failed: %s\n", strerror(errno)); + buf = NULL; + } + close(fd); + + return buf; +} + +/** + * get_huge_pages - Allocate an amount of memory backed by huge pages + * len: Size of the region to allocate, must be hugepage-aligned + * flags: Flags specifying the behaviour of the function + * + * This function allocates a region of memory that is backed by huge pages + * and hugepage-aligned. This is not a suitable drop-in for malloc() but a + * a malloc library could use this function to create a new fixed-size heap + * similar in principal to what morecore does for glibc malloc. + */ +void *get_huge_pages(size_t len, ghp_t flags) +{ + void *buf; + int buf_fd = -1; + int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0; + int mmap_hugetlb = 0; + int ret; + + /* Catch an altogether-too easy typo */ + if (flags & GHR_MASK) + ERROR("Improper use of GHR_* in get_huge_pages()\n"); + +#ifdef MAP_HUGETLB + mmap_hugetlb = MAP_HUGETLB; +#endif + + if (__hugetlb_opts.map_hugetlb && + gethugepagesize() == kernel_default_hugepage_size()) { + /* Because we can use MAP_HUGETLB, we simply mmap the region */ + buf = mmap(NULL, len, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS|mmap_hugetlb|mmap_reserve, + 0, 0); + } else { + /* Create a file descriptor for the new region */ + buf_fd = hugetlbfs_unlinked_fd(); + if (buf_fd < 0) { + WARNING("Couldn't open hugetlbfs file for %zd-sized buffer\n", + len); + return NULL; + } + + /* Map the requested region */ + buf = mmap(NULL, len, PROT_READ|PROT_WRITE, + MAP_PRIVATE|mmap_reserve, buf_fd, 0); + } + + if (buf == MAP_FAILED) { + if (buf_fd >= 0) + close(buf_fd); + + WARNING("get_huge_pages: New region mapping failed (flags: 0x%lX): %s\n", + flags, 
strerror(errno)); + return NULL; + } + + /* Fault the region to ensure accesses succeed */ + ret = hugetlbfs_prefault(buf, len); + if (ret != 0) { + munmap(buf, len); + if (buf_fd >= 0) + close(buf_fd); + + WARNING("get_huge_pages: Prefaulting failed (flags: 0x%lX): %s\n", + flags, strerror(ret)); + return NULL; + } + + /* Close the file so we do not have to track the descriptor */ + if (buf_fd >= 0 && close(buf_fd) != 0) { + WARNING("Failed to close new buffer fd: %s\n", strerror(errno)); + munmap(buf, len); + return NULL; + } + + /* woo, new buffer of shiny */ + return buf; +} + +#define MAPS_BUF_SZ 4096 +static void __free_huge_pages(void *ptr, int aligned) +{ + FILE *fd; + char line[MAPS_BUF_SZ]; + unsigned long start = 0, end = 0; + unsigned long palign = 0, hpalign = 0; + unsigned long hpalign_end = 0; + + /* + * /proc/self/maps is used to determine the length of the original + * allocation. As mappings are based on different files, we can + * assume that maps will not merge. If the hugepages were truly + * anonymous, this assumption would be broken. 
 */
	fd = fopen("/proc/self/maps", "r");
	if (!fd) {
		ERROR("Failed to open /proc/self/maps\n");
		return;
	}

	/*
	 * An unaligned address allocated by get_hugepage_region()
	 * could be either page or hugepage aligned
	 */
	if (!aligned) {
		palign = ALIGN_DOWN((unsigned long)ptr, getpagesize());
		hpalign = ALIGN_DOWN((unsigned long)ptr, gethugepagesize());
	}

	/* Parse /proc/maps for address ranges line by line */
	while (!feof(fd)) {
		char *bufptr;
		char *saveptr = NULL;

		/* Read a line of input */
		if (fgets(line, MAPS_BUF_SZ, fd) == NULL)
			break;

		/* Parse the line to get the start and end of each mapping */
		bufptr = strtok_r(line, " ", &saveptr);
		bufptr = strtok_r(bufptr, "-", &saveptr);
		start = strtoull(bufptr, NULL, 16);
		bufptr = strtok_r(NULL, "-", &saveptr);

		/* If the correct mapping is found, remove it */
		if (start == (unsigned long)ptr) {
			end = strtoull(bufptr, NULL, 16);
			munmap(ptr, end - start);
			break;
		}

		/* If the passed address is aligned, just move along */
		if (aligned)
			continue;

		/*
		 * If an address is hpage-aligned, record it but keep looking.
		 * We might find a page-aligned or exact address later
		 */
		if (start == hpalign) {
			hpalign_end = strtoull(bufptr, NULL, 16);
			continue;
		}

		/* If an address is page-aligned, free it */
		if (start == palign) {
			end = strtoull(bufptr, NULL, 16);
			munmap((void *)start, end - start);
			break;
		}

	}

	/*
	 * If no exact or page-aligned address was found, check for a
	 * hpage-aligned address. If found, free it, otherwise warn that
	 * the ptr pointed nowhere
	 */
	if (end == 0) {
		if (hpalign_end == 0)
			ERROR("hugepages_free using invalid or double free\n");
		else
			munmap((void *)hpalign, hpalign_end - hpalign);
	}

	fclose(fd);
}

/**
 * free_huge_pages - Free a region allocated that was backed by large pages
 * ptr - The pointer to the buffer returned by get_huge_pages()
 *
 * This function finds a region to free based on the contents of
 * /proc/pid/maps. The assumption is made that the ptr is the start of
 * a hugepage region allocated with get_huge_pages. No checking is made
 * that the pointer is to a hugepage backed region.
 */
void free_huge_pages(void *ptr)
{
	__free_huge_pages(ptr, 1);
}

/*
 * Offset the buffer using bytes wasted due to alignment to avoid using the
 * same cache lines for the start of every buffer returned by
 * get_huge_pages(). A small effort is made to select a random cacheline
 * rather than sequential lines to give decent behaviour on average.
 */
void *cachecolor(void *buf, size_t len, size_t color_bytes)
{
	static long cacheline_size = 0;
	static int linemod = 0;
	char *bytebuf = (char *)buf;
	int numlines;
	int line = 0;

	/* Lookup our cacheline size once */
	if (cacheline_size == 0) {
		/*
		 * NOTE(review): sysconf(_SC_LEVEL2_CACHE_LINESIZE) can return
		 * 0 or -1 on systems that do not report an L2 line size,
		 * which would make the division below divide by zero or
		 * a negative value -- TODO confirm and guard
		 */
		cacheline_size = sysconf(_SC_LEVEL2_CACHE_LINESIZE);
		linemod = time(NULL);
	}

	numlines = color_bytes / cacheline_size;
	DEBUG("%d lines of cacheline size %ld due to %zd wastage\n",
		numlines, cacheline_size, color_bytes);
	if (numlines) {
		line = linemod % numlines;
		bytebuf += cacheline_size * line;

		/* Pseudo-ish random line selection */
		linemod += len % numlines;
	}
	DEBUG("Using line offset %d from start\n", line);

	return bytebuf;
}

/**
 * get_hugepage_region - Allocate an amount of memory backed by huge pages
 *
 * len: Size of the region to allocate
 * flags: Flags specifying the behaviour of the function
 *
 * This function allocates a region of memory backed by huge pages. Care should
 * be taken when using this function as a drop-in replacement for malloc() as
 * memory can be wasted if the length is not hugepage-aligned. This function
 * is more relaxed than get_huge_pages() in that it allows fallback to small
 * pages when requested.
 */
void *get_hugepage_region(size_t len, ghr_t flags)
{
	size_t aligned_len, wastage;
	void *buf;

	/* Catch an altogether-too easy typo */
	if (flags & GHP_MASK)
		ERROR("Improper use of GHP_* in get_hugepage_region()\n");

	/* Align the len parameter to a hugepage boundary and allocate */
	aligned_len = ALIGN(len, gethugepagesize());
	buf = get_huge_pages(aligned_len, GHP_DEFAULT);
	if (buf == NULL && (flags & GHR_FALLBACK)) {
		aligned_len = ALIGN(len, getpagesize());
		buf = fallback_base_pages(len, flags);
	}

	/* Calculate wastage for coloring */
	wastage = aligned_len - len;
	if (wastage != 0 && !(flags & GHR_COLOR))
		DEBUG("get_hugepage_region: Wasted %zd bytes due to alignment\n",
			wastage);

	/*
	 * Only colour if requested.
	 * NOTE(review): if the fallback also failed, buf is NULL here and
	 * cachecolor() would offset a NULL pointer -- TODO confirm callers
	 * never hit this with GHR_COLOR set
	 */
	if (flags & GHR_COLOR)
		buf = cachecolor(buf, len, wastage);

	return buf;
}

/**
 * free_hugepage_region - Free a region allocated by get_hugepage_region
 * ptr - The pointer to the buffer returned by get_hugepage_region
 *
 * This function finds a region to free based on the contents of
 * /proc/pid/maps. The assumption is made that the ptr is the start of
 * a hugepage region allocated with get_hugepage_region. No checking is made
 * that the pointer is to a hugepage backed region.
 */
void free_hugepage_region(void *ptr)
{
	__free_huge_pages(ptr, 0);
}
diff --git a/debug.c b/debug.c
new file mode 100644
index 0000000..6bc7b76
--- /dev/null
+++ b/debug.c
@@ -0,0 +1,50 @@
/*
 * libhugetlbfs - Easy use of Linux hugepages
 * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* NOTE(review): the <...> targets of these includes were lost in
 * extraction of this patch -- restore from the upstream debug.c */
#include
#include
#include
#include
#include

#include "hugetlbfs.h"

#include "libhugetlbfs_internal.h"

/* Global debug/verbosity state shared across the library */
int __hugetlbfs_verbose = VERBOSITY_DEFAULT;
bool __hugetlbfs_debug = false;
bool __hugetlbfs_prefault = true;
char __hugetlbfs_hostname[64];

static int initialized;

/* One-time initialization: cache the hostname for log prefixes */
static void __hugetlbfs_init_debug(void)
{
	if (initialized)
		return;

	gethostname(__hugetlbfs_hostname, sizeof(__hugetlbfs_hostname)-1);

	initialized = 1;
}

void hugetlbfs_setup_debug(void)
{
	__hugetlbfs_init_debug();
}
diff --git a/elf32ppclinux.c b/elf32ppclinux.c
new file mode 100644
index 0000000..24adaf1
--- /dev/null
+++ b/elf32ppclinux.c
@@ -0,0 +1,54 @@
/*
 * libhugetlbfs - Easy use of Linux hugepages
 * Copyright (C) 2008 Adam Litke, IBM Corporation.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* NOTE(review): include targets lost in extraction (elf.h, link.h) */
#include
#include

#include "libhugetlbfs_internal.h"

/*
 * The powerpc 32-bit ELF ABI defines the location and size of the plt as
 * follows (see the ELF ABI and powerpc32 supplement for details):
 *
 * Location: (data segment p_vaddr) + (data segment p_filesz)
 * Size: (dynamic symbol table DT_PLTRELSZ entry) + 72
 *
 * plt entries have likely been initialized when the libhugetlbfs remapping
 * code runs, we must copy these entries when preparing the data segment. Tell
 * the arch-independent code how many bytes to copy.
 */
ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab)
{
	int i;
	ElfW(Word) pltrelsz = 0;

	/* Find the needed information in the dynamic section */
	for (i = 0; dyntab[i].d_tag != DT_NULL; i++)
		if (dyntab[i].d_tag == DT_PLTRELSZ)
			pltrelsz = dyntab[i].d_un.d_val;

	/* pltrelsz indicates the size of all plt entries used to cache
	 * symbol lookups, but does not include the reserved entry at PLT[0].
	 * 72 bytes is the ABI-defined size of a plt entry.
	 */
	if (pltrelsz)
		return pltrelsz + 72;
	else
		return 0;
}
diff --git a/elf64lppc.c b/elf64lppc.c
new file mode 120000
index 0000000..1f9eba6
--- /dev/null
+++ b/elf64lppc.c
@@ -0,0 +1 @@
+elf64ppc.c
\ No newline at end of file
diff --git a/elf64ppc.c b/elf64ppc.c
new file mode 100644
index 0000000..8c86fca
--- /dev/null
+++ b/elf64ppc.c
@@ -0,0 +1,54 @@
/*
 * libhugetlbfs - Easy use of Linux hugepages
 * Copyright (C) 2008 Adam Litke, IBM Corporation.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* NOTE(review): include targets lost in extraction (elf.h, link.h) */
#include
#include

#include "libhugetlbfs_internal.h"

/*
 * The powerpc 64-bit ELF ABI defines the location and size of the plt as
 * follows (see the ELF ABI and powerpc64 supplement for details):
 *
 * Location: (data segment p_vaddr) + (data segment p_filesz)
 * Size: (dynamic symbol table DT_PLTRELSZ entry) + 24
 *
 * plt entries have likely been initialized when the libhugetlbfs remapping
 * code runs, we must copy these entries when preparing the data segment. Tell
 * the arch-independent code how many bytes to copy.
 */
ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab)
{
	int i;
	ElfW(Word) pltrelsz = 0;

	/* Find the needed information in the dynamic section */
	for (i = 0; dyntab[i].d_tag != DT_NULL; i++)
		if (dyntab[i].d_tag == DT_PLTRELSZ)
			pltrelsz = dyntab[i].d_un.d_val;

	/* pltrelsz indicates the size of all plt entries used to cache
	 * symbol lookups, but does not include the reserved entry at PLT[0].
	 * 24 bytes is the ABI-defined size of a plt entry.
	 */
	if (pltrelsz)
		return pltrelsz + 24;
	else
		return 0;
}
diff --git a/elflink.c b/elflink.c
new file mode 100644
index 0000000..ffc84dd
--- /dev/null
+++ b/elflink.c
@@ -0,0 +1,1333 @@
/*
 * libhugetlbfs - Easy use of Linux hugepages
 * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define _GNU_SOURCE

/* NOTE(review): the <...> targets of these 19 includes were lost in
 * extraction of this patch -- restore from the upstream elflink.c */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "version.h"
#include "hugetlbfs.h"
#include "libhugetlbfs_internal.h"

/* Select the native ELF word size so the same code serves 32/64-bit */
#ifdef __LP64__
#define Elf_Ehdr	Elf64_Ehdr
#define Elf_Phdr	Elf64_Phdr
#define Elf_Dyn		Elf64_Dyn
#define Elf_Sym		Elf64_Sym
#define ELF_ST_BIND(x)	ELF64_ST_BIND(x)
#define ELF_ST_TYPE(x)	ELF64_ST_TYPE(x)
#else
#define Elf_Ehdr	Elf32_Ehdr
#define Elf_Phdr	Elf32_Phdr
#define Elf_Dyn		Elf32_Dyn
#define Elf_Sym		Elf32_Sym
#define ELF_ST_BIND(x)	ELF64_ST_BIND(x)
#define ELF_ST_TYPE(x)	ELF64_ST_TYPE(x)
#endif

/*
 * SHARED_TIMEOUT is used by find_or_prepare_shared_file for when it
 * should timeout while waiting for other users to finish preparing
 * the file it wants. The value is the number of tries before giving
 * up with a 1 second wait between tries
 */
#define SHARED_TIMEOUT 10

/* This function prints an error message to stderr, then aborts. It
 * is safe to call, even if the executable segments are presently
 * unmapped.
 *
 * Arguments are printf() like, but at present supports only %u and %p
 * with no modifiers
 *
 * FIXME: This works in practice, but I suspect it
 * is not guaranteed safe: the library functions we call could in
 * theory call other functions via the PLT which will blow up. */
static void write_err(const char *start, int len)
{
	direct_syscall(__NR_write, 2 /*stderr*/, start, len);
}
static void sys_abort(void)
{
	pid_t pid = direct_syscall(__NR_getpid);

	direct_syscall(__NR_kill, pid, SIGABRT);
}
/* Write val to stderr in the given base using only direct syscalls */
static void write_err_base(unsigned long val, int base)
{
	const char digit[] = "0123456789abcdef";
	char str1[sizeof(val)*8];
	char str2[sizeof(val)*8];
	int len = 0;
	int i;

	str1[0] = '0';
	while (val) {
		str1[len++] = digit[val % base];
		val /= base;
	}

	if (len == 0)
		len = 1;

	/* Reverse digits */
	for (i = 0; i < len; i++)
		str2[i] = str1[len-i-1];

	write_err(str2, len);
}

static void unmapped_abort(const char *fmt, ...)
{
	const char *p, *q;
	int done = 0;
	unsigned long val;
	va_list ap;

	/* World's worst printf()... */
	va_start(ap, fmt);
	p = q = fmt;
	while (!done) {
		switch (*p) {
		case '\0':
			write_err(q, p-q);
			done = 1;
			break;

		case '%':
			write_err(q, p-q);
			p++;
			switch (*p) {
			case 'u':
				val = va_arg(ap, unsigned);
				write_err_base(val, 10);
				p++;
				break;
			case 'p':
				val = (unsigned long)va_arg(ap, void *);
				write_err_base(val, 16);
				p++;
				break;
			}
			q = p;
			break;
		default:
			p++;
		}
	}

	va_end(ap);

	sys_abort();
}

/* The directory to use for sharing readonly segments */
static char share_readonly_path[PATH_MAX+1];

#define MAX_HTLB_SEGS	3
#define MAX_SEGS	10

/* Description of one program segment selected for hugepage remapping */
struct seg_info {
	void *vaddr;
	unsigned long filesz, memsz, extrasz;
	int prot;
	int fd;
	int index;
	long page_size;
};

struct seg_layout {
	unsigned long start, end;
	long page_size;
};

static struct seg_info htlb_seg_table[MAX_HTLB_SEGS];
static int htlb_num_segs;
static unsigned long force_remap; /* =0 */
static long hpage_readonly_size, hpage_writable_size;

/**
 * assemble_path - handy wrapper around snprintf() for building paths
 * @dst: buffer of size PATH_MAX+1 to assemble string into
 * @fmt: format string for path
 * @...: printf() style parameters for path
 *
 * assemble_path() builds a path in the target buffer (which must have
 * PATH_MAX+1 available bytes), similar to sprintf(). However, if the
 * assembled path would exceed PATH_MAX characters in length,
 * assemble_path() prints an error and abort()s, so there is no need
 * to check the return value and backout.
 */
static void assemble_path(char *dst, const char *fmt, ...)
{
	va_list ap;
	int len;

	va_start(ap, fmt);
	len = vsnprintf(dst, PATH_MAX+1, fmt, ap);
	va_end(ap);

	if (len < 0) {
		ERROR("vsnprintf() error\n");
		abort();
	}

	if (len > PATH_MAX) {
		ERROR("Overflow assembling path\n");
		abort();
	}
}

/* Log the total and largest memsz of the segments selected for remapping */
static void check_memsz()
{
	int i;
	unsigned long memsz_total = 0, memsz_max = 0;
	if (htlb_num_segs == 0)
		return;
	/*
	 * rough heuristic to see if we'll run out of address
	 * space
	 */
	for (i = 0; i < htlb_num_segs; i++) {
		memsz_total += htlb_seg_table[i].memsz;
		if (htlb_seg_table[i].memsz > memsz_max)
			memsz_max = htlb_seg_table[i].memsz;
	}
	/* avoid overflow checking by using two checks */
	DEBUG("Total memsz = %#0lx, memsz of largest segment = %#0lx\n",
			memsz_total, memsz_max);
}

/**
 * find_or_create_share_path - obtain a directory to store the shared
 * hugetlbfs files
 *
 * Checks environment and filesystem to locate a suitable directory
 * for shared hugetlbfs files, creating a new directory if necessary.
 * The determined path is stored in global variable share_readonly_path.
 *
 * returns:
 *  -1, on error
 *  0, on success
 */
static int find_or_create_share_path(long page_size)
{
	const char *base_path;
	struct stat sb;
	int ret;

	/* If no remaping is planned for the read-only segments we are done */
	if (!page_size)
		return 0;

	if (__hugetlb_opts.share_path) {
		/* Given an explicit path */
		if (hugetlbfs_test_path(__hugetlb_opts.share_path) != 1) {
			WARNING("HUGETLB_SHARE_PATH %s is not on a hugetlbfs"
				" filesystem\n", __hugetlb_opts.share_path);
			return -1;
		}

		/* Make sure the page size matches */
		if (page_size !=
			hugetlbfs_test_pagesize(__hugetlb_opts.share_path)) {
			WARNING("HUGETLB_SHARE_PATH %s is not valid for a %li "
				"kB page size\n", __hugetlb_opts.share_path,
				page_size / 1024);
			return -1;
		}
		assemble_path(share_readonly_path, "%s",
			__hugetlb_opts.share_path);
		return 0;
	}

	base_path = hugetlbfs_find_path_for_size(page_size);
	if (!base_path)
		return -1;

	assemble_path(share_readonly_path, "%s/elflink-uid-%d",
		base_path, getuid());

	ret = mkdir(share_readonly_path, 0700);
	if ((ret != 0) && (errno != EEXIST)) {
		WARNING("Error creating share directory %s\n",
			share_readonly_path);
		return -1;
	}

	/* Check the share directory is sane */
	ret = lstat(share_readonly_path, &sb);
	if (ret != 0) {
		WARNING("Couldn't stat() %s: %s\n", share_readonly_path,
			strerror(errno));
		return -1;
	}

	if (!S_ISDIR(sb.st_mode)) {
		WARNING("%s is not a directory\n", share_readonly_path);
		return -1;
	}

	if (sb.st_uid != getuid()) {
		WARNING("%s has wrong owner (uid=%d instead of %d)\n",
			share_readonly_path, sb.st_uid, getuid());
		return -1;
	}

	if (sb.st_mode & (S_IWGRP | S_IWOTH)) {
		WARNING("%s has bad permissions 0%03o\n",
			share_readonly_path, sb.st_mode);
		return -1;
	}

	return 0;
}

/*
 * Look for non-zero BSS data inside a range and print out any matches
 */

static void check_bss(unsigned long *start, unsigned long *end)
{
	unsigned long *addr;

	for (addr = start; addr < end; addr++) {
		if (*addr != 0)
			DEBUG("Non-zero BSS data @ %p: %lx\n", addr, *addr);
	}
}

/**
 * get_shared_file_name - create a shared file name from program name,
 * segment number and current word size
 * @htlb_seg_info: pointer to program's segment data
 * @file_path: pointer to a PATH_MAX+1 array to store filename in
 *
 * The file name created is *not* intended to be unique, except when
 * the name, gid or phdr number differ. The goal here is to have a
 * standard means of accessing particular segments of particular
 * executables.
 *
 * returns:
 *  -1, on failure
 *  0, on success
 */
static int get_shared_file_name(struct seg_info *htlb_seg_info, char *file_path)
{
	int ret;
	char binary[PATH_MAX+1];
	char *binary2;

	memset(binary, 0, sizeof(binary));
	ret = readlink("/proc/self/exe", binary, PATH_MAX);
	if (ret < 0) {
		WARNING("shared_file: readlink() on /proc/self/exe "
			"failed: %s\n", strerror(errno));
		return -1;
	}

	binary2 = basename(binary);
	if (!binary2) {
		WARNING("shared_file: basename() on %s failed: %s\n",
			binary, strerror(errno));
		return -1;
	}

	assemble_path(file_path, "%s/%s_%zd_%d", share_readonly_path, binary2,
			sizeof(unsigned long) * 8, htlb_seg_info->index);

	return 0;
}

/* Find the .dynamic program header */
static int find_dynamic(Elf_Dyn **dyntab, const Elf_Phdr *phdr, int phnum)
{
	int i = 1;

	while ((phdr[i].p_type != PT_DYNAMIC) && (i < phnum)) {
		++i;
	}
	if (phdr[i].p_type == PT_DYNAMIC) {
		*dyntab = (Elf_Dyn *)phdr[i].p_vaddr;
		return 0;
	} else {
		DEBUG("No dynamic segment found\n");
		return -1;
	}
}

/* Find the dynamic string and symbol tables */
static int find_tables(Elf_Dyn *dyntab, Elf_Sym **symtab, char **strtab)
{
	int i = 1;
	while ((dyntab[i].d_tag != DT_NULL)) {
		if (dyntab[i].d_tag == DT_SYMTAB)
			*symtab = (Elf_Sym *)dyntab[i].d_un.d_ptr;
		else if (dyntab[i].d_tag == DT_STRTAB)
			*strtab = (char *)dyntab[i].d_un.d_ptr;
		i++;
	}

	if (!*symtab) {
		DEBUG("No symbol table found\n");
		return -1;
	}
	if (!*strtab) {
		DEBUG("No string table found\n");
		return -1;
	}
	return 0;
}

/* Find the number of symbol table entries */
static int find_numsyms(Elf_Sym *symtab, char *strtab)
{
	/*
	 * WARNING - The symbol table size calculation does not follow the ELF
	 *           standard, but rather exploits an assumption we enforce in
	 *           our linker scripts that the string table follows
	 *           immediately after the symbol table. The linker scripts
	 *           must maintain this assumption or this code will break.
	 */
	if ((void *)strtab <= (void *)symtab) {
		DEBUG("Could not calculate dynamic symbol table size\n");
		return -1;
	}
	return ((void *)strtab - (void *)symtab) / sizeof(Elf_Sym);
}

/*
 * To reduce the size of the extra copy window, we can eliminate certain
 * symbols based on information in the dynamic section. The following
 * characteristics apply to symbols which may require copying:
 * - Within the BSS
 * - Global or Weak binding
 * - Object type (variable)
 * - Non-zero size (zero size means the symbol is just a marker with no data)
 */
static inline int keep_symbol(char *strtab, Elf_Sym *s, void *start, void *end)
{
	if ((void *)s->st_value < start)
		return 0;
	if ((void *)s->st_value > end)
		return 0;
	if ((ELF_ST_BIND(s->st_info) != STB_GLOBAL) &&
	    (ELF_ST_BIND(s->st_info) != STB_WEAK))
		return 0;
	if (ELF_ST_TYPE(s->st_info) != STT_OBJECT)
		return 0;
	if (s->st_size == 0)
		return 0;

	if (__hugetlbfs_debug)
		DEBUG("symbol to copy at %p: %s\n", (void *)s->st_value,
			strtab + s->st_name);

	return 1;
}

/* If unspecified by the architecture, no extra copying of the plt is needed */
ElfW(Word) __attribute__ ((weak)) plt_extrasz(ElfW(Dyn) *dyntab)
{
	return 0;
}

/*
 * Subtle: Since libhugetlbfs depends on glibc, we allow it
 * it to be loaded before us. As part of its init functions, it
 * initializes stdin, stdout, and stderr in the bss. We need to
 * include these initialized variables in our copy.
 */

/* Compute seg->extrasz: how many bytes beyond filesz (into the BSS) must
 * be copied because they were already initialized by the run-time linker
 * or glibc before we remap the segment */
static void get_extracopy(struct seg_info *seg, const Elf_Phdr *phdr, int phnum)
{
	Elf_Dyn *dyntab;		/* dynamic segment table */
	Elf_Sym *symtab = NULL;		/* dynamic symbol table */
	Elf_Sym *sym;			/* a symbol */
	char *strtab = NULL;		/* string table for dynamic symbols */
	int ret, numsyms, found_sym = 0;
	void *start, *end, *end_orig;
	void *sym_end;
	void *plt_end;

	end_orig = seg->vaddr + seg->memsz;
	start = seg->vaddr + seg->filesz;
	if (seg->filesz == seg->memsz)
		return;
	if (!__hugetlb_opts.min_copy)
		goto bail2;

	/* Find dynamic program header */
	ret = find_dynamic(&dyntab, phdr, phnum);
	if (ret < 0)
		goto bail;

	/* Find symbol and string tables */
	ret = find_tables(dyntab, &symtab, &strtab);
	if (ret < 0)
		goto bail;

	numsyms = find_numsyms(symtab, strtab);
	if (numsyms < 0)
		goto bail;

	/*
	 * We must ensure any returns done hereafter have sane start and end
	 * values, as the criss-cross apple sauce algorithm is beginning
	 */
	end = start;

	for (sym = symtab; sym < symtab + numsyms; sym++) {
		if (!keep_symbol(strtab, sym, start, end_orig))
			continue;

		/* These are the droids we are looking for */
		found_sym = 1;
		sym_end = (void *)(sym->st_value + sym->st_size);
		if (sym_end > end)
			end = sym_end;
	}

	/*
	 * Some platforms (PowerPC 64bit ELF) place their PLT beyond the filesz
	 * part of the data segment. When this is the case, we must extend the
	 * copy window to include this data which has been initialized by the
	 * run-time linker.
	 */
	plt_end = start + plt_extrasz(dyntab);
	if (plt_end > end) {
		end = plt_end;
		found_sym = 1;
	}

	if (__hugetlbfs_debug)
		check_bss(end, end_orig);

	if (found_sym) {
		seg->extrasz = end - start;
	}
	/*
	 * else no need to copy anything, so leave seg->extrasz as zero
	 */
	return;

bail:
	DEBUG("Unable to perform minimal copy\n");
bail2:
	/* Fallback: copy the entire BSS portion of the segment */
	seg->extrasz = end_orig - start;
}

#if defined(__powerpc64__) || \
	(defined(__powerpc__) && !defined(PPC_NO_SEGMENTS))
#define SLICE_LOW_TOP		(0x100000000UL)
#define SLICE_LOW_SIZE		(1UL << SLICE_LOW_SHIFT)
#define SLICE_HIGH_SIZE		(1UL << SLICE_HIGH_SHIFT)
#endif

/*
 * Return the address of the start and end of the hugetlb slice
 * containing @addr. A slice is a range of addresses, start inclusive
 * and end exclusive.
 * Note, that since relinking is not supported on ia64, we can leave it
 * out here.
 */
static unsigned long hugetlb_slice_start(unsigned long addr)
{
#if defined(__powerpc64__)
	if (addr < SLICE_LOW_TOP)
		return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
	else if (addr < SLICE_HIGH_SIZE)
		return SLICE_LOW_TOP;
	else
		return ALIGN_DOWN(addr, SLICE_HIGH_SIZE);
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
#else
	return ALIGN_DOWN(addr, gethugepagesize());
#endif
}

static unsigned long hugetlb_slice_end(unsigned long addr)
{
#if defined(__powerpc64__)
	if (addr < SLICE_LOW_TOP)
		return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
	else
		return ALIGN_UP(addr, SLICE_HIGH_SIZE) - 1;
#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)
	return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
#else
	return ALIGN_UP(addr, gethugepagesize()) - 1;
#endif
}

static unsigned long hugetlb_next_slice_start(unsigned long addr)
{
	return hugetlb_slice_end(addr) + 1;
}

static unsigned long hugetlb_prev_slice_end(unsigned long addr)
{
	return hugetlb_slice_start(addr) - 1;
}

/*
 * Store a copy of the given program header
*/ +static int save_phdr(int table_idx, int phnum, const ElfW(Phdr) *phdr) +{ + int prot = 0; + + if (table_idx >= MAX_HTLB_SEGS) { + WARNING("Executable has too many segments (max %d)\n", + MAX_HTLB_SEGS); + htlb_num_segs = 0; + return -1; + } + + if (phdr->p_flags & PF_R) + prot |= PROT_READ; + if (phdr->p_flags & PF_W) + prot |= PROT_WRITE; + if (phdr->p_flags & PF_X) + prot |= PROT_EXEC; + + htlb_seg_table[table_idx].vaddr = (void *) phdr->p_vaddr; + htlb_seg_table[table_idx].filesz = phdr->p_filesz; + htlb_seg_table[table_idx].memsz = phdr->p_memsz; + htlb_seg_table[table_idx].prot = prot; + htlb_seg_table[table_idx].index = phnum; + + INFO("Segment %d (phdr %d): %#0lx-%#0lx (filesz=%#0lx) " + "(prot = %#0x)\n", table_idx, phnum, + (unsigned long) phdr->p_vaddr, + (unsigned long) phdr->p_vaddr + phdr->p_memsz, + (unsigned long) phdr->p_filesz, (unsigned int) prot); + + return 0; +} + +static int verify_segment_layout(struct seg_layout *segs, int num_segs) +{ + int i; + long base_size = getpagesize(); + + for (i = 1; i < num_segs; i++) { + unsigned long prev_end = segs[i - 1].end; + unsigned long start = segs[i].start; + + /* + * Do not worry about the boundary between segments that will + * not be remapped. 
+ */ + if (segs[i - 1].page_size == base_size && + segs[i].page_size == base_size) + continue; + + /* Make sure alignment hasn't caused segments to overlap */ + if (prev_end > start) { + WARNING("Layout problem with segments %i and %i:\n\t" + "Segments would overlap\n", i - 1, i); + return 1; + } + + /* Make sure page size transitions occur on slice boundaries */ + if ((segs[i - 1].page_size != segs[i].page_size) && + hugetlb_slice_end(prev_end) > + hugetlb_slice_start(start)) { + WARNING("Layout problem with segments %i and %i:\n\t" + "Only one page size per slice\n", i - 1, i); + return 1; + } + } + return 0; +} + +static long segment_requested_page_size(const ElfW(Phdr) *phdr) +{ + int writable = phdr->p_flags & PF_W; + + /* Check if a page size was requested by the user */ + if (writable && hpage_writable_size) + return hpage_writable_size; + if (!writable && hpage_readonly_size) + return hpage_readonly_size; + + /* Check if this segment requests remapping by default */ + if (!hpage_readonly_size && !hpage_writable_size && + (phdr->p_flags & PF_LINUX_HUGETLB)) + return gethugepagesize(); + + /* No remapping selected, return the base page size */ + return getpagesize(); +} + +static +int parse_elf_normal(struct dl_phdr_info *info, size_t size, void *data) +{ + int i, num_segs; + unsigned long page_size, seg_psize, start, end; + struct seg_layout segments[MAX_SEGS]; + + page_size = getpagesize(); + num_segs = 0; + + for (i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type != PT_LOAD) + continue; + + if (i >= MAX_SEGS) { + WARNING("Maximum number of PT_LOAD segments" + "exceeded\n"); + return 1; + } + + seg_psize = segment_requested_page_size(&info->dlpi_phdr[i]); + if (seg_psize != page_size) { + if (save_phdr(htlb_num_segs, i, &info->dlpi_phdr[i])) + return 1; + get_extracopy(&htlb_seg_table[htlb_num_segs], + &info->dlpi_phdr[0], info->dlpi_phnum); + htlb_seg_table[htlb_num_segs].page_size = seg_psize; + htlb_num_segs++; + } + start = 
ALIGN_DOWN(info->dlpi_phdr[i].p_vaddr, seg_psize); + end = ALIGN(info->dlpi_phdr[i].p_vaddr + + info->dlpi_phdr[i].p_memsz, seg_psize); + + segments[num_segs].page_size = seg_psize; + segments[num_segs].start = start; + segments[num_segs].end = end; + num_segs++; + } + if (verify_segment_layout(segments, num_segs)) + htlb_num_segs = 0; + + if (__hugetlbfs_debug) + check_memsz(); + + return 1; +} + +/* + * Parse the phdrs of a normal program to attempt partial segment remapping + */ +static +int parse_elf_partial(struct dl_phdr_info *info, size_t size, void *data) +{ + unsigned long vaddr, memsz, gap; + unsigned long slice_end; + int i; + + /* This should never actually be called more than once in an + * iteration: we assume that dl_iterate_phdrs() always gives + * us the main program's phdrs on the first iteration, and + * always return 1 to cease iteration at that point. */ + + for (i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type != PT_LOAD) + continue; + + /* + * Partial segment remapping only makes sense if the + * memory size of the segment is larger than the + * granularity at which hugepages can be used. This + * mostly affects ppc, where the segment must be larger + * than 256M. This guarantees that remapping the binary + * in this forced way won't violate any contiguity + * constraints. 
		 */
		vaddr = hugetlb_next_slice_start(info->dlpi_phdr[i].p_vaddr);
		gap = vaddr - info->dlpi_phdr[i].p_vaddr;
		slice_end = hugetlb_slice_end(vaddr);
		/*
		 * we should stop remapping just before the slice
		 * containing the end of the memsz portion (taking away
		 * the gap of the memsz)
		 */
		memsz = info->dlpi_phdr[i].p_memsz;
		if (memsz < gap) {
			INFO("Segment %d's unaligned memsz is too small: "
					"%#0lx < %#0lx\n",
					i, memsz, gap);
			continue;
		}
		memsz -= gap;
		if (memsz < (slice_end - vaddr)) {
			INFO("Segment %d's aligned memsz is too small: "
					"%#0lx < %#0lx\n",
					i, memsz, slice_end - vaddr);
			continue;
		}
		memsz = hugetlb_prev_slice_end(vaddr + memsz) - vaddr;

		if (save_phdr(htlb_num_segs, i, &info->dlpi_phdr[i]))
			return 1;

		/*
		 * When remapping partial segments, we create a sub-segment
		 * that is based on the original. For this reason, we must
		 * make some changes to the phdr captured by save_phdr():
		 *	vaddr is aligned upwards to a slice boundary
		 *	memsz is aligned downwards to a slice boundary
		 *	filesz is set to memsz to force all memory to be copied
		 */
		htlb_seg_table[htlb_num_segs].vaddr = (void *)vaddr;
		htlb_seg_table[htlb_num_segs].filesz = memsz;
		htlb_seg_table[htlb_num_segs].memsz = memsz;

		htlb_num_segs++;
	}
	return 1;
}

/*
 * Verify that a range of memory is unoccupied and usable
 */
static void check_range_empty(void *addr, unsigned long len)
{
	void *p;

	/* Probe with a fixed-address mapping: success at exactly addr
	 * means nothing was mapped there */
	p = mmap(addr, len, PROT_READ, MAP_PRIVATE|MAP_ANON, 0, 0);
	if (p != addr) {
		WARNING("Unable to verify address range %p - %p. Not empty?\n",
			addr, addr + len);
		if (__hugetlbfs_debug)
			dump_proc_pid_maps();
	}
	if (p != MAP_FAILED)
		munmap(p, len);
}

/*
 * Copy a program segment into a huge page. If possible, try to copy the
 * smallest amount of data possible, unless the user disables this
 * optimization via the HUGETLB_ELFMAP environment variable.
+ */ +static int prepare_segment(struct seg_info *seg) +{ + void *start, *p, *end, *new_end; + unsigned long size, offset; + long page_size = getpagesize(); + long hpage_size; + int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0; + + hpage_size = seg->page_size; + + /* + * mmaps must begin at an address aligned to the page size. If the + * vaddr of this segment is not hpage_size aligned, align it downward + * and begin the mmap there. Note the offset so we can copy data to + * the correct starting address within the temporary mmap. + */ + start = (void *) ALIGN_DOWN((unsigned long)seg->vaddr, hpage_size); + offset = seg->vaddr - start; + + /* + * Calculate the size of the temporary mapping we must create. + * This includes the offset (described above) and the filesz and + * extrasz portions of the segment (described below). We must align + * this total to the huge page size so it will be valid for mmap. + */ + size = ALIGN(offset + seg->filesz + seg->extrasz, hpage_size); + + /* + * If the segment's start or end addresses have been adjusted to align + * them to the hpage_size, check to make sure nothing is mapped in the + * padding before and after the segment. + */ + end = (void *) ALIGN((unsigned long)seg->vaddr + seg->memsz, page_size); + new_end = (void *) ALIGN((unsigned long)end, hpage_size); + if (ALIGN_DOWN(offset, page_size)) + check_range_empty(start, ALIGN_DOWN(offset, page_size)); + if (end != new_end) + check_range_empty(end, new_end - end); + + /* Create the temporary huge page mmap */ + p = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_SHARED|mmap_reserve, seg->fd, 0); + if (p == MAP_FAILED) { + WARNING("Couldn't map hugepage segment to copy data: %s\n", + strerror(errno)); + return -1; + } + + /* + * Minimizing the amount of data copied will maximize performance. + * By definition, the filesz portion of the segment contains + * initialized data and must be copied. 
If part of the memsz portion + * is known to be initialized already, extrasz will be non-zero and + * that many additional bytes will be copied from the beginning of the + * memsz region. The rest of the memsz is understood to be zeroes and + * need not be copied. + */ + INFO("Mapped hugeseg at %p. Copying %#0lx bytes and %#0lx extra bytes" + " from %p...", p, seg->filesz, seg->extrasz, seg->vaddr); + memcpy(p + offset, seg->vaddr, seg->filesz + seg->extrasz); + INFO_CONT("done\n"); + + munmap(p, size); + + return 0; +} + +/* + * [PPC] Prior to 2.6.22 (which added slices), our temporary hugepage + * mappings are placed in the segment before the stack. This 'taints' that + * segment to be hugepage-only for the lifetime of the process, resulting + * in a maximum stack size of 256MB. If we instead create our hugepage + * mappings in a child process, we can avoid this problem. + * + * This does not adversely affect non-PPC platforms so do it everywhere. + */ +static int fork_and_prepare_segment(struct seg_info *htlb_seg_info) +{ + int pid, ret, status; + + if ((pid = fork()) < 0) { + WARNING("fork failed"); + return -1; + } + if (pid == 0) { + ret = prepare_segment(htlb_seg_info); + if (ret < 0) { + WARNING("Failed to prepare segment\n"); + exit(1); + } + else + exit(0); + } + ret = waitpid(pid, &status, 0); + if (ret == -1) { + WARNING("waitpid failed"); + return -1; + } + + if (WEXITSTATUS(status) != 0) + return -1; + + INFO("Prepare succeeded\n"); + return 0; +} + +/** + * find_or_prepare_shared_file - get one shareable file + * @htlb_seg_info: pointer to program's segment data + * + * This function either locates a hugetlbfs file already containing + * data for a given program segment, or creates one if it doesn't + * already exist. + * + * We use the following algorithm to ensure that when processes race + * to instantiate the hugepage file, we will never obtain an + * incompletely prepared file or have multiple processes prepare + * separate copies of the file. 
+ * - first open 'filename.tmp' with O_EXCL (this acts as a lockfile) + * - second open 'filename' with O_RDONLY (even if the first open + * succeeded). + * Then: + * - If both opens succeed, close the O_EXCL open, unlink + * filename.tmp and use the O_RDONLY fd. (Somebody else has prepared + * the file already) + * - If only the O_RDONLY open succeeds, and the O_EXCL open + * fails with EEXIST, just use the O_RDONLY fd. (Somebody else has + * prepared the file already, but we raced with their rename()). + * - If only the O_EXCL open succeeds, and the O_RDONLY fails with + * ENOENT, prepare the file via the O_EXCL open, then rename() filename.tmp to + * filename. (We're the first in, we have to prepare the file). + * - If both opens fail, with EEXIST and ENOENT, respectively, + * wait for a little while, then try again from the beginning + * (Somebody else is preparing the file, but hasn't finished yet) + * + * returns: + * -1, on failure + * 0, on success + */ +static int find_or_prepare_shared_file(struct seg_info *htlb_seg_info) +{ + int fdx = -1, fds; + int errnox, errnos; + int ret; + int i; + char final_path[PATH_MAX+1]; + char tmp_path[PATH_MAX+1]; + + ret = get_shared_file_name(htlb_seg_info, final_path); + if (ret < 0) + return -1; + assemble_path(tmp_path, "%s.tmp", final_path); + + for (i = 0; i < SHARED_TIMEOUT; i++) { + /* NB: mode is modified by umask */ + fdx = open(tmp_path, O_CREAT | O_EXCL | O_RDWR, 0666); + errnox = errno; + fds = open(final_path, O_RDONLY); + errnos = errno; + + if (fds >= 0) { + /* Got an already-prepared file -> use it */ + if (fdx > 0) { + /* Also got an exclusive file -> clean up */ + ret = unlink(tmp_path); + if (ret != 0) + WARNING("shared_file: unable to clean " + "up unneeded file %s: %s\n", + tmp_path, strerror(errno)); + close(fdx); + } else if (errnox != EEXIST) { + WARNING("shared_file: Unexpected failure on exclusive" + " open of %s: %s\n", tmp_path, + strerror(errnox)); + } + htlb_seg_info->fd = fds; + return 0; + } + + if
(fdx >= 0) { + /* It's our job to prepare */ + if (errnos != ENOENT) + WARNING("shared_file: Unexpected failure on" + " shared open of %s: %s\n", final_path, + strerror(errnos)); + + htlb_seg_info->fd = fdx; + + INFO("Got unpopulated shared fd -- Preparing\n"); + ret = fork_and_prepare_segment(htlb_seg_info); + if (ret < 0) + goto fail; + + INFO("Prepare succeeded\n"); + /* move to permanent location */ + ret = rename(tmp_path, final_path); + if (ret != 0) { + WARNING("shared_file: unable to rename %s" + " to %s: %s\n", tmp_path, final_path, + strerror(errno)); + goto fail; + } + + return 0; + } + + /* Both opens failed, somebody else is still preparing */ + /* Wait and try again */ + sleep(1); + } + + fail: + if (fdx > 0) { + ret = unlink(tmp_path); + if (ret != 0) + WARNING("shared_file: Unable to clean up temp file %s " + "on failure: %s\n", tmp_path, strerror(errno)); + close(fdx); + } + + return -1; +} + +/** + * obtain_prepared_file - multiplex callers depending on if + * sharing or not + * @htlb_seg_info: pointer to program's segment data + * + * returns: + * -1, on error + * 0, on success + */ +static int obtain_prepared_file(struct seg_info *htlb_seg_info) +{ + int fd = -1; + int ret; + long hpage_size = htlb_seg_info->page_size; + + /* Share only read-only segments */ + if (__hugetlb_opts.sharing && !(htlb_seg_info->prot & PROT_WRITE)) { + /* first, try to share */ + ret = find_or_prepare_shared_file(htlb_seg_info); + if (ret == 0) + return 0; + /* but, fall through to unlinked files, if sharing fails */ + WARNING("Falling back to unlinked files\n"); + } + fd = hugetlbfs_unlinked_fd_for_size(hpage_size); + if (fd < 0) + return -1; + htlb_seg_info->fd = fd; + + return fork_and_prepare_segment(htlb_seg_info); +} + +static void remap_segments(struct seg_info *seg, int num) +{ + int i; + void *p; + unsigned long start, offset, mapsize; + long page_size = getpagesize(); + long hpage_size; + int mmap_flags; + + /* + * XXX: The bogus call to mmap below forces 
ld.so to resolve the + * mmap symbol before we unmap the plt in the data segment + * below. This might only be needed in the case where sharing + * is enabled and the hugetlbfs files have already been prepared + * by another process. + */ + p = mmap(0, 0, 0, 0, 0, 0); + + /* This is the hairy bit, between unmap and remap we enter a + * black hole. We can't call anything which uses static data + * (ie. essentially any library function...) + */ + for (i = 0; i < num; i++) { + start = ALIGN_DOWN((unsigned long)seg[i].vaddr, page_size); + offset = (unsigned long)(seg[i].vaddr - start); + mapsize = ALIGN(offset + seg[i].memsz, page_size); + munmap((void *) start, mapsize); + } + + /* Step 4. Rebuild the address space with hugetlb mappings */ + /* NB: we can't do the remap as hugepages within the main loop + * because of PowerPC: we may need to unmap all the normal + * segments before the MMU segment is ok for hugepages */ + for (i = 0; i < num; i++) { + hpage_size = seg[i].page_size; + start = ALIGN_DOWN((unsigned long)seg[i].vaddr, hpage_size); + offset = (unsigned long)(seg[i].vaddr - start); + mapsize = ALIGN(offset + seg[i].memsz, hpage_size); + mmap_flags = MAP_PRIVATE|MAP_FIXED; + + /* If requested, make no reservations */ + if (__hugetlb_opts.no_reserve) + mmap_flags |= MAP_NORESERVE; + + /* + * If this is a read-only mapping whose contents are + * entirely contained within the file, then use MAP_NORESERVE. 
+ * The assumption is that the pages already exist in the + * page cache for the hugetlbfs file since it was prepared + * earlier and that mprotect() will not be called which would + * require a COW + */ + if (!(seg[i].prot & PROT_WRITE) && + seg[i].filesz == seg[i].memsz) + mmap_flags |= MAP_NORESERVE; + + p = mmap((void *) start, mapsize, seg[i].prot, + mmap_flags, seg[i].fd, 0); + if (p == MAP_FAILED) + unmapped_abort("Failed to map hugepage segment %u: " + "%p-%p (errno=%u)\n", i, start, + start + mapsize, errno); + if (p != (void *) start) + unmapped_abort("Mapped hugepage segment %u (%p-%p) at " + "wrong address %p\n", i, seg[i].vaddr, + seg[i].vaddr+mapsize, p); + } + /* The segments are all back at this point. + * and it should be safe to reference static data + */ +} + +static int set_hpage_sizes(const char *env) +{ + char *pos; + long size; + char *key; + char keys[5] = { "R\0" "W\0" "\0" }; + + /* For each key in R,W */ + for (key = keys; *key != '\0'; key += 2) { + pos = strcasestr(env, key); + if (!pos) + continue; + + if (*(++pos) == '=') { + size = parse_page_size(pos + 1); + if (size == -1) + return size; + } else + size = gethugepagesize(); + + if (size <= 0) { + if (errno == ENOSYS) + WARNING("Hugepages unavailable\n"); + else if (errno == EOVERFLOW) + WARNING("Hugepage size too large\n"); + else + WARNING("Hugepage size (%s)\n", + strerror(errno)); + size = 0; + } else if (!hugetlbfs_find_path_for_size(size)) { + WARNING("Hugepage size %li unavailable", size); + size = 0; + } + + if (*key == 'R') + hpage_readonly_size = size; + else + hpage_writable_size = size; + } + return 0; +} + +static int check_env(void) +{ + extern Elf_Ehdr __executable_start __attribute__((weak)); + + if (__hugetlb_opts.elfmap && + (strcasecmp(__hugetlb_opts.elfmap, "no") == 0)) { + INFO("HUGETLB_ELFMAP=%s, not attempting to remap program " + "segments\n", __hugetlb_opts.elfmap); + return -1; + } + if (__hugetlb_opts.elfmap && set_hpage_sizes(__hugetlb_opts.elfmap)) { + 
WARNING("Cannot set elfmap page sizes: %s", strerror(errno)); + return -1; + } + + if (__hugetlb_opts.ld_preload && + strstr(__hugetlb_opts.ld_preload, "libhugetlbfs")) { + if (__hugetlb_opts.force_elfmap) { + force_remap = 1; + INFO("HUGETLB_FORCE_ELFMAP=yes, " + "enabling partial segment " + "remapping for non-relinked " + "binaries\n"); + INFO("Disabling filesz copy optimization\n"); + __hugetlb_opts.min_copy = false; + } else { + if (&__executable_start) { + WARNING("LD_PRELOAD is incompatible with " + "segment remapping\n"); + WARNING("Segment remapping has been " + "DISABLED\n"); + return -1; + } + } + } + + if (__hugetlb_opts.sharing == 2) { + WARNING("HUGETLB_SHARE=%d, however sharing of writable\n" + "segments has been deprecated and is now disabled\n", + __hugetlb_opts.sharing); + __hugetlb_opts.sharing = 0; + } else { + INFO("HUGETLB_SHARE=%d, sharing ", __hugetlb_opts.sharing); + if (__hugetlb_opts.sharing == 1) { + INFO_CONT("enabled for only read-only segments\n"); + } else { + INFO_CONT("disabled\n"); + __hugetlb_opts.sharing = 0; + } + } + + INFO("HUGETLB_NO_RESERVE=%s, reservations %s\n", + __hugetlb_opts.no_reserve ? "yes" : "no", + __hugetlb_opts.no_reserve ? "disabled" : "enabled"); + + return 0; +} + +/* + * Parse an ELF header and record segment information for any segments + * which contain hugetlb information. 
+ */ +static int parse_elf() +{ + if (force_remap) + dl_iterate_phdr(parse_elf_partial, NULL); + else + dl_iterate_phdr(parse_elf_normal, NULL); + + if (htlb_num_segs == 0) { + INFO("No segments were appropriate for remapping\n"); + return -1; + } + + return 0; +} + +void hugetlbfs_setup_elflink(void) +{ + int i, ret; + + if (check_env()) + return; + + if (parse_elf()) + return; + + INFO("libhugetlbfs version: %s\n", VERSION); + + /* Do we need to find a share directory */ + if (__hugetlb_opts.sharing) { + /* + * If HUGETLB_ELFMAP is undefined but a shareable segment has + * PF_LINUX_HUGETLB set, segment remapping will occur using the + * default huge page size. + */ + long page_size = hpage_readonly_size ? + hpage_readonly_size : gethugepagesize(); + + ret = find_or_create_share_path(page_size); + if (ret != 0) { + WARNING("Segment remapping is disabled"); + return; + } + } + + /* Step 1. Obtain hugepage files with our program data */ + for (i = 0; i < htlb_num_segs; i++) { + ret = obtain_prepared_file(&htlb_seg_table[i]); + if (ret < 0) { + WARNING("Failed to setup hugetlbfs file for segment " + "%d\n", i); + + /* Close files we have already prepared */ + for (i--; i >= 0; i--) + close(htlb_seg_table[i].fd); + + return; + } + } + + /* Step 3. Unmap the old segments, map in the new ones */ + remap_segments(htlb_seg_table, htlb_num_segs); +} diff --git a/huge_page_setup_helper.py b/huge_page_setup_helper.py new file mode 100755 index 0000000..43c9916 --- /dev/null +++ b/huge_page_setup_helper.py @@ -0,0 +1,343 @@ +#!/usr/bin/python + +# +# Tool to set up Linux large page support with minimal effort +# +# by Jarod Wilson +# (c) Red Hat, Inc., 2009 +# +# Requires hugeadm from libhugetlbfs 2.7 (or backported support) +# +import os + +debug = False + +# must be executed under the root to operate +if os.geteuid() != 0: + print "You must be root to setup hugepages!" 
+ os._exit(1) + +# config files we need access to +sysctlConf = "/etc/sysctl.conf" +if not os.access(sysctlConf, os.W_OK): + print "Cannot access %s" % sysctlConf + if debug == False: + os._exit(1) + +# This file will be created if it doesn't exist +limitsConf = "/etc/security/limits.d/hugepages.conf" + + +# Figure out what we've got in the way of memory +memTotal = 0 +hugePageSize = 0 +hugePages = 0 + +hugeadmexplain = os.popen("/usr/bin/hugeadm --explain 2>/dev/null").readlines() + +for line in hugeadmexplain: + if line.startswith("Total System Memory:"): + memTotal = int(line.split()[3]) + break + +if memTotal == 0: + print "Your version of libhugetlbfs' hugeadm utility is too old!" + os._exit(1) + + +# Pick the default huge page size and see how many pages are allocated +poolList = os.popen("/usr/bin/hugeadm --pool-list").readlines() +for line in poolList: + if '*' in line: + hugePageSize = int(line.split()[0]) + hugePages = int(line.split()[2]) + break + +if hugePageSize == 0: + print "Aborting, cannot determine system huge page size!" 
+ os._exit(1) + +# Get initial sysctl settings +shmmax = 0 +hugeGID = 0 + +for line in hugeadmexplain: + if line.startswith("A /proc/sys/kernel/shmmax value of"): + shmmax = int(line.split()[4]) + break + +for line in hugeadmexplain: + if line.strip().startswith("vm.hugetlb_shm_group = "): + hugeGID = int(line.split()[2]) + break + + +# translate group into textual version +hugeGIDName = "null" +groupNames = os.popen("/usr/bin/getent group").readlines() +for line in groupNames: + curGID = int(line.split(":")[2]) + if curGID == hugeGID: + hugeGIDName = line.split(":")[0] + break + + +# dump system config as we see it before we start tweaking it +print "Current configuration:" +print " * Total System Memory......: %6d MB" % memTotal +print " * Shared Mem Max Mapping...: %6d MB" % (shmmax / (1024 * 1024)) +print " * System Huge Page Size....: %6d MB" % (hugePageSize / (1024 * 1024)) +print " * Number of Huge Pages.....: %6d" % hugePages +print " * Total size of Huge Pages.: %6d MB" % (hugePages * hugePageSize / (1024 * 1024)) +print " * Remaining System Memory..: %6d MB" % (memTotal - (hugePages * hugePageSize / (1024 * 1024))) +print " * Huge Page User Group.....: %s (%d)" % (hugeGIDName, hugeGID) +print + + +# ask how memory they want to allocate for huge pages +userIn = None +while not userIn: + try: + userIn = raw_input("How much memory would you like to allocate for huge pages? 
" + "(input in MB, unless postfixed with GB): ") + if userIn[-2:] == "GB": + userHugePageReqMB = int(userIn[0:-2]) * 1024 + elif userIn[-1:] == "G": + userHugePageReqMB = int(userIn[0:-1]) * 1024 + elif userIn[-2:] == "MB": + userHugePageReqMB = int(userIn[0:-2]) + elif userIn[-1:] == "M": + userHugePageReqMB = int(userIn[0:-1]) + else: + userHugePageReqMB = int(userIn) + # As a sanity safeguard, require at least 128M not be allocated to huge pages + if userHugePageReqMB > (memTotal - 128): + userIn = None + print "Refusing to allocate %d, you must leave at least 128MB for the system" % userHugePageReqMB + elif userHugePageReqMB < (hugePageSize / (1024 * 1024)): + userIn = None + print "Sorry, allocation must be at least a page's worth!" + else: + break + except ValueError: + userIn = None + print "Input must be an integer, please try again!" +userHugePageReqKB = userHugePageReqMB * 1024 +userHugePagesReq = userHugePageReqKB / (hugePageSize / 1024) +print "Okay, we'll try to allocate %d MB for huge pages..." % userHugePageReqMB +print + + +# some basic user input validation +badchars = list(' \\\'":;~`!$^&*(){}[]?/><,') +inputIsValid = False +# ask for the name of the group allowed access to huge pages +while inputIsValid == False: + foundbad = False + userGroupReq = raw_input("What group should have access to the huge pages?" + "(The group will be created, if need be) [hugepages]: ") + if userGroupReq is '': + userGroupReq = 'hugepages' + if userGroupReq[0].isdigit() or userGroupReq[0] == "-": + foundbad = True + print "Group names cannot start with a number or dash, please try again!" + for char in badchars: + if char in userGroupReq: + foundbad = True + print "Illegal characters in group name, please try again!" + break + if len(userGroupReq) > 16: + foundbad = True + print "Group names can't be more than 16 characaters, please try again!" 
+ if foundbad == False: + inputIsValid = True +print "Okay, we'll give group %s access to the huge pages" % userGroupReq + + +# see if group already exists, use it if it does, if not, create it +userGIDReq = -1 +for line in groupNames: + curGroupName = line.split(":")[0] + if curGroupName == userGroupReq: + userGIDReq = int(line.split(":")[2]) + break + +if userGIDReq > -1: + print "Group %s (gid %d) already exists, we'll use it" % (userGroupReq, userGIDReq) +else: + if debug == False: + os.popen("/usr/sbin/groupadd %s" % userGroupReq) + else: + print "/usr/sbin/groupadd %s" % userGroupReq + groupNames = os.popen("/usr/bin/getent group %s" % userGroupReq).readlines() + for line in groupNames: + curGroupName = line.split(":")[0] + if curGroupName == userGroupReq: + userGIDReq = int(line.split(":")[2]) + break + print "Created group %s (gid %d) for huge page use" % (userGroupReq, userGIDReq) +print + + +# basic user input validation, take 2 +# space is valid in this case, wasn't in the prior incarnation +badchars = list('\\\'":;~`!$^&*(){}[]?/><,') +inputIsValid = False +# ask for user(s) that should be in the huge page access group +while inputIsValid == False: + foundbad = False + userUsersReq = raw_input("What user(s) should have access to the huge pages (space-delimited list, users created as needed)? ") + for char in badchars: + if char in userUsersReq: + foundbad = True + print "Illegal characters in user name(s) or invalid list format, please try again!" + break + for n in userUsersReq.split(): + if len(n) > 32: + foundbad = True + print "User names can't be more than 32 characaters, please try again!" + break + if n[0] == "-": + foundbad = True + print "User names cannot start with a dash, please try again!" 
+ break + if foundbad == False: + inputIsValid = True +# see if user(s) already exist(s) +curUserList = os.popen("/usr/bin/getent passwd").readlines() +hugePageUserList = userUsersReq.split() +for hugeUser in hugePageUserList: + userExists = False + for line in curUserList: + curUser = line.split(":")[0] + if curUser == hugeUser: + print "Adding user %s to huge page group" % hugeUser + userExists = True + if debug == False: + os.popen("/usr/sbin/usermod -a -G %s %s" % (userGroupReq, hugeUser)) + else: + print "/usr/sbin/usermod -a -G %s %s" % (userGroupReq, hugeUser) + if userExists == True: + break + if userExists == False: + print "Creating user %s with membership in huge page group" % hugeUser + if debug == False: + if hugeUser == userGroupReq: + os.popen("/usr/sbin/useradd %s -g %s" % (hugeUser, userGroupReq)) + else: + os.popen("/usr/sbin/useradd %s -G %s" % (hugeUser, userGroupReq)) + else: + print "/usr/sbin/useradd %s -G %s" % (hugeUser, userGroupReq) +print + + +# set values for the current running environment +if debug == False: + os.popen("/usr/bin/hugeadm --pool-pages-min DEFAULT:%sM" % userHugePageReqMB) + os.popen("/usr/bin/hugeadm --pool-pages-max DEFAULT:%sM" % userHugePageReqMB) + os.popen("/usr/bin/hugeadm --set-shm-group %d" % userGIDReq) + os.popen("/usr/bin/hugeadm --set-recommended-shmmax") +else: + print "/usr/bin/hugeadm --pool-pages-min DEFAULT:%sM" % userHugePageReqMB + print "/usr/bin/hugeadm --pool-pages-max DEFAULT:%sM" % userHugePageReqMB + print "/usr/bin/hugeadm --set-shm-group %d" % userGIDReq + print "/usr/bin/hugeadm --set-recommended-shmmax" + print + +# figure out what that shmmax value we just set was +hugeadmexplain = os.popen("/usr/bin/hugeadm --explain 2>/dev/null").readlines() +for line in hugeadmexplain: + if line.strip().startswith("kernel.shmmax = "): + shmmax = int(line.split()[2]) + break + +# write out sysctl config changes to persist across reboot +if debug == False: + sysctlConfLines = "# sysctl configuration\n" + 
if os.access(sysctlConf, os.W_OK): + try: + sysctlConfLines = open(sysctlConf).readlines() + os.rename(sysctlConf, sysctlConf + ".backup") + print("Saved original %s as %s.backup" % (sysctlConf, sysctlConf)) + except: + pass + + fd = open(sysctlConf, "w") + for line in sysctlConfLines: + if line.startswith("kernel.shmmax"): + continue + elif line.startswith("vm.nr_hugepages"): + continue + elif line.startswith("vm.hugetlb_shm_group"): + continue + else: + fd.write(line); + + fd.write("kernel.shmmax = %d\n" % shmmax) + fd.write("vm.nr_hugepages = %d\n" % userHugePagesReq) + fd.write("vm.hugetlb_shm_group = %d\n" % userGIDReq) + fd.close() + +else: + print "Add to %s:" % sysctlConf + print "kernel.shmmax = %d" % shmmax + print "vm.nr_hugepages = %d" % userHugePagesReq + print "vm.hugetlb_shm_group = %d" % userGIDReq + print + + +# write out limits.conf changes to persist across reboot +if debug == False: + limitsConfLines = "# Huge page access configuration\n" + if os.access(limitsConf, os.W_OK): + try: + limitsConfLines = open(limitsConf).readlines() + os.rename(limitsConf, limitsConf + ".backup") + print("Saved original %s as %s.backup" % (limitsConf, limitsConf)) + except: + pass + + fd = open(limitsConf, "w") + for line in limitsConfLines: + cfgExist = False + for hugeUser in hugePageUserList: + try: + if line.split()[0] == hugeUser: + cfgExist = True + except IndexError: + # hit either white or comment line, it is safe not to take + # any action and continue. 
+ pass + if cfgExist == True: + continue + else: + fd.write(line) + + for hugeUser in hugePageUserList: + fd.write("%s soft memlock %d\n" % (hugeUser, userHugePageReqKB)) + fd.write("%s hard memlock %d\n" % (hugeUser, userHugePageReqKB)) + fd.close() + +else: + print "Add to %s:" % limitsConf + for hugeUser in hugePageUserList: + print "%s soft memlock %d" % (hugeUser, userHugePageReqKB) + print "%s hard memlock %d" % (hugeUser, userHugePageReqKB) + + +# dump the final configuration of things now that we're done tweaking +print +print "Final configuration:" +print " * Total System Memory......: %6d MB" % memTotal +if debug == False: + print " * Shared Mem Max Mapping...: %6d MB" % (shmmax / (1024 * 1024)) +else: + # This should be what we *would* have set it to, had we actually run hugeadm --set-recommended-shmmax + print " * Shared Mem Max Mapping...: %6d MB" % (userHugePagesReq * hugePageSize / (1024 * 1024)) +print " * System Huge Page Size....: %6d MB" % (hugePageSize / (1024 * 1024)) +print " * Available Huge Pages.....: %6d" % userHugePagesReq +print " * Total size of Huge Pages.: %6d MB" % (userHugePagesReq * hugePageSize / (1024 * 1024)) +print " * Remaining System Memory..: %6d MB" % (memTotal - userHugePageReqMB) +print " * Huge Page User Group.....: %s (%d)" % (userGroupReq, userGIDReq) +print + diff --git a/hugeadm.c b/hugeadm.c new file mode 100644 index 0000000..fe4211d --- /dev/null +++ b/hugeadm.c @@ -0,0 +1,1714 @@ +/*************************************************************************** + * User front end for using huge pages Copyright (C) 2008, IBM * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the Lesser GNU General Public License as * + * published by the Free Software Foundation; either version 2.1 of the * + * License, or at your option) any later version. 
* + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the Lesser GNU General Public * + * License along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +/* + * hugeadm is designed to make an administrators life simpler, to automate + * and simplify basic system configuration as it relates to hugepages. It + * is designed to help with pool and mount configuration. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define _GNU_SOURCE /* for getopt_long */ +#include +#include + +#define KB (1024) +#define MB (1024*KB) +#define GB (1024*MB) + +#define REPORT_UTIL "hugeadm" +#define REPORT(level, prefix, format, ...) 
\ + do { \ + if (verbose_level >= level) \ + fprintf(stderr, "hugeadm:" prefix ": " format, \ + ##__VA_ARGS__); \ + } while (0); + +#include "libhugetlbfs_internal.h" +#include "hugetlbfs.h" + +extern int optind; +extern char *optarg; + +#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text) +#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text) + +#define MOUNT_DIR "/var/lib/hugetlbfs" +#define OPT_MAX 4096 + +#define PROCMOUNTS "/proc/mounts" +#define PROCHUGEPAGES_MOVABLE "/proc/sys/vm/hugepages_treat_as_movable" +#define PROCMINFREEKBYTES "/proc/sys/vm/min_free_kbytes" +#define PROCSHMMAX "/proc/sys/kernel/shmmax" +#define PROCHUGETLBGROUP "/proc/sys/vm/hugetlb_shm_group" +#define PROCZONEINFO "/proc/zoneinfo" +#define FS_NAME "hugetlbfs" +#define MIN_COL 20 +#define MAX_SIZE_MNTENT (64 + PATH_MAX + 32 + 128 + 2 * sizeof(int)) +#define FORMAT_LEN 20 + +#define MEM_TOTAL "MemTotal:" +#define SWAP_FREE "SwapFree:" +#define SWAP_TOTAL "SwapTotal:" + +#define ALWAYS "always" +#define MADVISE "madvise" +#define NEVER "never" +#define TRANS_ENABLE "/sys/kernel/mm/transparent_hugepage/enabled" +#define KHUGE_SCAN_PAGES "/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan" +#define KHUGE_SCAN_SLEEP "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs" +#define KHUGE_ALLOC_SLEEP "/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs" + +void print_usage() +{ + fprintf(stderr, "hugeadm [options]\n"); + fprintf(stderr, "options:\n"); + + OPTION("--list-all-mounts", "List all current hugetlbfs mount points"); + OPTION("--pool-list", "List all pools"); + OPTION("--hard", "specified with --pool-pages-min to make"); + CONT("multiple attempts at adjusting the pool size to the"); + CONT("specified count on failure"); + OPTION("--pool-pages-min :[+|-]>", ""); + CONT("Adjust pool 'size' lower bound"); + OPTION("--obey-mempolicy", "Obey the NUMA memory policy when"); + CONT("adjusting the pool 'size' lower bound"); + 
OPTION("--thp-always", "Enable transparent huge pages always"); + OPTION("--thp-madvise", "Enable transparent huge pages with madvise"); + OPTION("--thp-never", "Disable transparent huge pages"); + OPTION("--thp-khugepaged-pages ", "Number of pages that khugepaged"); + CONT("should scan on each pass"); + OPTION("--thp-khugepaged-scan-sleep ", "Time in ms to sleep between"); + CONT("khugepaged passes"); + OPTION("--thp-khugepages-alloc-sleep ", "Time in ms for khugepaged"); + CONT("to wait if there was a huge page allocation failure"); + OPTION("--pool-pages-max :[+|-]>", ""); + CONT("Adjust pool 'size' upper bound"); + OPTION("--set-recommended-min_free_kbytes", ""); + CONT("Sets min_free_kbytes to a recommended value to improve availability of"); + CONT("huge pages at runtime"); + OPTION("--set-recommended-shmmax", "Sets shmmax to a recommended value to"); + CONT("maximise the size possible for shared memory pools"); + OPTION("--set-shm-group ", "Sets hugetlb_shm_group to the"); + CONT("specified group, which has permission to use hugetlb shared memory pools"); + OPTION("--add-temp-swap[=count]", "Specified with --pool-pages-min to create"); + CONT("temporary swap space for the duration of the pool resize. Default swap"); + CONT("size is 5 huge pages. Optional arg sets size to 'count' huge pages"); + OPTION("--add-ramdisk-swap", "Specified with --pool-pages-min to create"); + CONT("swap space on ramdisks. 
By default, swap is removed after the resize."); + OPTION("--persist", "Specified with --add-temp-swap or --add-ramdisk-swap"); + CONT("options to make swap space persist after the resize."); + OPTION("--enable-zone-movable", "Use ZONE_MOVABLE for huge pages"); + OPTION("--disable-zone-movable", "Do not use ZONE_MOVABLE for huge pages"); + OPTION("--create-mounts", "Creates a mount point for each available"); + CONT("huge page size on this system under /var/lib/hugetlbfs"); + OPTION("--create-user-mounts ", ""); + CONT("Creates a mount point for each available huge"); + CONT("page size under /var/lib/hugetlbfs/"); + CONT("usable by user "); + OPTION("--create-group-mounts ", ""); + CONT("Creates a mount point for each available huge"); + CONT("page size under /var/lib/hugetlbfs/"); + CONT("usable by group "); + OPTION("--create-global-mounts", ""); + CONT("Creates a mount point for each available huge"); + CONT("page size under /var/lib/hugetlbfs/global"); + CONT("usable by anyone"); + + OPTION("--max-size >", "Limit the filesystem size of a new mount point"); + OPTION("--max-inodes ", "Limit the number of inodes on a new mount point"); + + OPTION("--page-sizes", "Display page sizes that a configured pool"); + OPTION("--page-sizes-all", + "Display page sizes support by the hardware"); + OPTION("--dry-run", "Print the equivalent shell commands for what"); + CONT("the specified options would have done without"); + CONT("taking any action"); + + OPTION("--explain", "Gives a overview of the status of the system"); + CONT("with respect to huge page availability"); + + OPTION("--verbose , -v", "Increases/sets tracing levels"); + OPTION("--help, -h", "Prints this message"); +} + +int opt_dry_run = 0; +int opt_hard = 0; +int opt_movable = -1; +int opt_set_recommended_minfreekbytes = 0; +int opt_set_recommended_shmmax = 0; +int opt_set_hugetlb_shm_group = 0; +int opt_temp_swap = 0; +int opt_ramdisk_swap = 0; +int opt_swap_persist = 0; +int opt_obey_mempolicy = 0; +unsigned 
long opt_limit_mount_size = 0; +int opt_limit_mount_inodes = 0; +int verbose_level = VERBOSITY_DEFAULT; +char ramdisk_list[PATH_MAX] = ""; + +void setup_environment(char *var, char *val) +{ + if (opt_dry_run) { + printf("%s='%s'\n", var, val); + return; + } + + setenv(var, val, 1); + DEBUG("%s='%s'\n", var, val); +} + +/* Enable/disable allocation of hugepages from ZONE_MOVABLE */ +void setup_zone_movable(int able) +{ + if (opt_dry_run) { + printf("echo %d > %s\n", able, PROCHUGEPAGES_MOVABLE); + return; + } + + DEBUG("Setting %s to %d\n", PROCHUGEPAGES_MOVABLE, able); + + /* libhugetlbfs reports any error that occurs */ + file_write_ulong(PROCHUGEPAGES_MOVABLE, (unsigned long)able); +} + +void verbose_init(void) +{ + char *env; + + env = getenv("HUGETLB_VERBOSE"); + if (env) + verbose_level = atoi(env); + env = getenv("HUGETLB_DEBUG"); + if (env) + verbose_level = VERBOSITY_MAX; +} + +void verbose(char *which) +{ + int new_level; + + if (which) { + new_level = atoi(which); + if (new_level < 0 || new_level > 99) { + ERROR("%d: verbosity out of range 0-99\n", + new_level); + exit(EXIT_FAILURE); + } + } else { + new_level = verbose_level + 1; + if (new_level == 100) { + WARNING("verbosity limited to 99\n"); + new_level--; + } + } + verbose_level = new_level; +} + +void verbose_expose(void) +{ + char level[3]; + + if (verbose_level == 99) { + setup_environment("HUGETLB_DEBUG", "yes"); + } + snprintf(level, sizeof(level), "%d", verbose_level); + setup_environment("HUGETLB_VERBOSE", level); +} + +/* + * getopts return values for options which are long only. 
+ */ +#define LONG_POOL ('p' << 8) +#define LONG_POOL_LIST (LONG_POOL|'l') +#define LONG_POOL_MIN_ADJ (LONG_POOL|'m') +#define LONG_POOL_MAX_ADJ (LONG_POOL|'M') +#define LONG_POOL_MEMPOL (LONG_POOL|'p') + +#define LONG_SET_RECOMMENDED_MINFREEKBYTES ('k' << 8) +#define LONG_SET_RECOMMENDED_SHMMAX ('x' << 8) +#define LONG_SET_HUGETLB_SHM_GROUP ('R' << 8) + +#define LONG_MOVABLE ('z' << 8) +#define LONG_MOVABLE_ENABLE (LONG_MOVABLE|'e') +#define LONG_MOVABLE_DISABLE (LONG_MOVABLE|'d') + +#define LONG_HARD ('h' << 8) +#define LONG_SWAP ('s' << 8) +#define LONG_SWAP_DISK (LONG_SWAP|'d') +#define LONG_SWAP_RAMDISK (LONG_SWAP|'r') +#define LONG_SWAP_PERSIST (LONG_SWAP|'p') + +#define LONG_PAGE ('P' << 8) +#define LONG_PAGE_SIZES (LONG_PAGE|'s') +#define LONG_PAGE_AVAIL (LONG_PAGE|'a') + +#define LONG_MOUNTS ('m' << 8) +#define LONG_CREATE_MOUNTS (LONG_MOUNTS|'C') +#define LONG_CREATE_USER_MOUNTS (LONG_MOUNTS|'U') +#define LONG_CREATE_GROUP_MOUNTS (LONG_MOUNTS|'g') +#define LONG_CREATE_GLOBAL_MOUNTS (LONG_MOUNTS|'G') +#define LONG_LIST_ALL_MOUNTS (LONG_MOUNTS|'A') + +#define LONG_LIMITS ('l' << 8) +#define LONG_LIMIT_SIZE (LONG_LIMITS|'S') +#define LONG_LIMIT_INODES (LONG_LIMITS|'I') + +#define LONG_EXPLAIN ('e' << 8) + +#define LONG_TRANS ('t' << 8) +#define LONG_TRANS_ALWAYS (LONG_TRANS|'a') +#define LONG_TRANS_MADVISE (LONG_TRANS|'m') +#define LONG_TRANS_NEVER (LONG_TRANS|'n') + +#define LONG_KHUGE ('K' << 8) +#define LONG_KHUGE_PAGES (LONG_KHUGE|'p') +#define LONG_KHUGE_SCAN (LONG_KHUGE|'s') +#define LONG_KHUGE_ALLOC (LONG_KHUGE|'a') + +#define MAX_POOLS 32 + +static int cmpsizes(const void *p1, const void *p2) +{ + return ((struct hpage_pool *)p1)->pagesize > + ((struct hpage_pool *)p2)->pagesize; +} + +void pool_list(void) +{ + struct hpage_pool pools[MAX_POOLS]; + int pos; + int cnt; + + cnt = hpool_sizes(pools, MAX_POOLS); + if (cnt < 0) { + ERROR("unable to obtain pools list"); + exit(EXIT_FAILURE); + } + qsort(pools, cnt, sizeof(pools[0]), cmpsizes); + + 
printf("%10s %8s %8s %8s %8s\n", + "Size", "Minimum", "Current", "Maximum", "Default"); + for (pos = 0; cnt--; pos++) { + printf("%10ld %8ld %8ld %8ld %8s\n", pools[pos].pagesize, + pools[pos].minimum, pools[pos].size, + pools[pos].maximum, (pools[pos].is_default) ? "*" : ""); + } +} + +struct mount_list +{ + struct mntent entry; + char data[MAX_SIZE_MNTENT]; + struct mount_list *next; +}; + +void print_mounts(struct mount_list *current, int longest) +{ + char format_str[FORMAT_LEN]; + + snprintf(format_str, FORMAT_LEN, "%%-%ds %%s\n", longest); + printf(format_str, "Mount Point", "Options"); + while (current) { + printf(format_str, current->entry.mnt_dir, + current->entry.mnt_opts); + current = current->next; + } +} + +/* + * collect_active_mounts returns a list of active hugetlbfs + * mount points, and, if longest is not NULL, the number of + * characters in the longest mount point to ease output + * formatting. Caller is expected to free the list of mounts. + */ +struct mount_list *collect_active_mounts(int *longest) +{ + FILE *mounts; + struct mount_list *list, *current, *previous = NULL; + int length; + + /* First try /proc/mounts, then /etc/mtab */ + mounts = setmntent(PROCMOUNTS, "r"); + if (!mounts) { + mounts = setmntent(MOUNTED, "r"); + if (!mounts) { + ERROR("unable to open %s or %s for reading", + PROCMOUNTS, MOUNTED); + exit(EXIT_FAILURE); + } + } + + list = malloc(sizeof(struct mount_list)); + if (!list) { + ERROR("out of memory"); + exit(EXIT_FAILURE); + } + + list->next = NULL; + current = list; + while (getmntent_r(mounts, &(current->entry), current->data, MAX_SIZE_MNTENT)) { + if (strcasecmp(current->entry.mnt_type, FS_NAME) == 0) { + length = strlen(current->entry.mnt_dir); + if (longest && length > *longest) + *longest = length; + + current->next = malloc(sizeof(struct mount_list)); + if (!current->next) { + ERROR("out of memory"); + exit(EXIT_FAILURE); + } + previous = current; + current = current->next; + current->next = NULL; + } + } + + 
endmntent(mounts); + + if (previous) { + free(previous->next); + previous->next = NULL; + return list; + } + return NULL; +} + +void mounts_list_all(void) +{ + struct mount_list *list, *previous; + int longest = MIN_COL; + + list = collect_active_mounts(&longest); + + if (!list) { + ERROR("No hugetlbfs mount points found\n"); + return; + } + + print_mounts(list, longest); + + while (list) { + previous = list; + list = list->next; + free(previous); + } +} + +int make_dir(char *path, mode_t mode, uid_t uid, gid_t gid) +{ + struct passwd *pwd; + struct group *grp; + + if (opt_dry_run) { + pwd = getpwuid(uid); + grp = getgrgid(gid); + printf("if [ ! -e %s ]\n", path); + printf("then\n"); + printf(" mkdir %s\n", path); + printf(" chown %s:%s %s\n", pwd->pw_name, grp->gr_name, path); + printf(" chmod %o %s\n", mode, path); + printf("fi\n"); + return 0; + } + + if (mkdir(path, mode)) { + if (errno != EEXIST) { + ERROR("Unable to create dir %s, error: %s\n", + path, strerror(errno)); + return 1; + } + } else { + if (chown(path, uid, gid)) { + ERROR("Unable to change ownership of %s, error: %s\n", + path, strerror(errno)); + return 1; + } + + if (chmod(path, mode)) { + ERROR("Unable to change permission on %s, error: %s\n", + path, strerror(errno)); + return 1; + } + } + + return 0; +} + +/** + * ensure_dir will build the entire directory structure up to and + * including path, all directories built will be owned by + * user:group and permissions will be set to mode. 
+ */ +int ensure_dir(char *path, mode_t mode, uid_t uid, gid_t gid) +{ + char *idx; + + if (!path || strlen(path) == 0) + return 0; + + idx = strchr(path + 1, '/'); + + do { + if (idx) + *idx = '\0'; + + if (make_dir(path, mode, uid, gid)) + return 1; + + if (idx) { + *idx = '/'; + idx++; + } + } while ((idx = strchr(idx, '/')) != NULL); + + if (make_dir(path, mode, uid, gid)) + return 1; + + return 0; +} + +int check_if_already_mounted(struct mount_list *list, char *path) +{ + while (list) { + if (!strcmp(list->entry.mnt_dir, path)) + return 1; + list = list->next; + } + return 0; +} + +int mount_dir(char *path, char *options, mode_t mode, uid_t uid, gid_t gid) +{ + struct passwd *pwd; + struct group *grp; + struct mntent entry; + FILE *mounts; + char dummy; + int useMtab; + + struct mount_list *list, *previous; + + list = collect_active_mounts(NULL); + + if (list && check_if_already_mounted(list, path)) { + WARNING("Directory %s is already mounted.\n", path); + + while (list) { + previous = list; + list = list->next; + free(previous); + } + return 0; + } + + while (list) { + previous = list; + list = list->next; + free(previous); + } + + if (opt_dry_run) { + pwd = getpwuid(uid); + grp = getgrgid(gid); + printf("mount -t %s none %s -o %s\n", FS_NAME, + path, options); + printf("chown %s:%s %s\n", pwd->pw_name, grp->gr_name, + path); + printf("chmod %o %s\n", mode, path); + } else { + if (mount("none", path, FS_NAME, 0, options)) { + ERROR("Unable to mount %s, error: %s\n", + path, strerror(errno)); + return 1; + } + + /* Check if mtab is a symlink */ + useMtab = (readlink(MOUNTED, &dummy, 1) < 0); + if (useMtab) { + mounts = setmntent(MOUNTED, "a+"); + if (mounts) { + entry.mnt_fsname = FS_NAME; + entry.mnt_dir = path; + entry.mnt_type = FS_NAME; + entry.mnt_opts = options; + entry.mnt_freq = 0; + entry.mnt_passno = 0; + if (addmntent(mounts, &entry)) + WARNING("Unable to add entry %s to %s, error: %s\n", + path, MOUNTED, strerror(errno)); + endmntent(mounts); + } 
else { + WARNING("Unable to open %s, error: %s\n", + MOUNTED, strerror(errno)); + } + } + + if (chown(path, uid, gid)) { + ERROR("Unable to change ownership of %s, error: %s\n", + path, strerror(errno)); + return 1; + } + + if (chmod(path, mode)) { + ERROR("Unable to set permissions on %s, error: %s\n", + path, strerror(errno)); + return 1; + } + } + return 0; +} + +void scale_size(char *buf, unsigned long pagesize) +{ + if(pagesize >= GB) + snprintf(buf, OPT_MAX, "%luGB", pagesize / GB); + else if(pagesize >= MB) + snprintf(buf, OPT_MAX, "%luMB", pagesize / MB); + else + snprintf(buf, OPT_MAX, "%luKB", pagesize / KB); +} + +void create_mounts(char *user, char *group, char *base, mode_t mode) +{ + struct hpage_pool pools[MAX_POOLS]; + char path[PATH_MAX]; + char options[OPT_MAX]; + char limits[OPT_MAX]; + char scaled[OPT_MAX]; + int cnt, pos; + struct passwd *pwd; + struct group *grp; + uid_t uid = 0; + gid_t gid = 0; + + if (geteuid() != 0) { + ERROR("Mounts can only be created by root\n"); + exit(EXIT_FAILURE); + } + + if (user) { + pwd = getpwnam(user); + if (!pwd) { + ERROR("Could not find specified user %s\n", user); + exit(EXIT_FAILURE); + } + uid = pwd->pw_uid; + } else if (group) { + grp = getgrnam(group); + if (!grp) { + ERROR("Could not find specified group %s\n", group); + exit(EXIT_FAILURE); + } + gid = grp->gr_gid; + } + + if (ensure_dir(base, + S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH, 0, 0)) + exit(EXIT_FAILURE); + + cnt = hpool_sizes(pools, MAX_POOLS); + if (cnt < 0) { + ERROR("Unable to obtain pools list\n"); + exit(EXIT_FAILURE); + } + + for (pos=0; cnt--; pos++) { + scaled[0] = 0; + scale_size(scaled, pools[pos].pagesize); + if (user) + snprintf(path, PATH_MAX, "%s/%s/pagesize-%s", + base, user, scaled); + else if (group) + snprintf(path, PATH_MAX, "%s/%s/pagesize-%s", + base, group, scaled); + else + snprintf(path, PATH_MAX, "%s/pagesize-%s", + base, scaled); + + snprintf(options, OPT_MAX, "pagesize=%ld", + pools[pos].pagesize); + + /* 
Yes, this could be cleverer */ + if (opt_limit_mount_size && opt_limit_mount_inodes) + snprintf(limits, OPT_MAX, ",size=%lu,nr_inodes=%d", + opt_limit_mount_size, opt_limit_mount_inodes); + else { + if (opt_limit_mount_size) + snprintf(limits, OPT_MAX, ",size=%lu", + opt_limit_mount_size); + if (opt_limit_mount_inodes) + snprintf(limits, OPT_MAX, ",nr_inodes=%d", + opt_limit_mount_inodes); + } + + /* Append limits if specified */ + if (limits[0] != 0) { + size_t maxlen = OPT_MAX - strlen(options); + if (maxlen > strlen(limits)) + strcat(options, limits); + else + WARNING("String limitations met, cannot append limitations onto mount options string. Increase OPT_MAX"); + } + + if (ensure_dir(path, mode, uid, gid)) + exit(EXIT_FAILURE); + + if (mount_dir(path, options, mode, uid, gid)) + exit(EXIT_FAILURE); + } +} + +/** + * show_mem shouldn't change the behavior of any of its + * callers, it only prints a message to the user showing the + * total amount of memory in the system (in megabytes). + */ +void show_mem() +{ + long mem_total; + + mem_total = read_meminfo(MEM_TOTAL); + printf("Total System Memory: %ld MB\n\n", mem_total / 1024); +} + +/** + * check_swap shouldn't change the behavior of any of its + * callers, it only prints a message to the user if something + * is being done that might fail without swap available. i.e. 
+ * resizing a huge page pool + */ +void check_swap() +{ + long swap_sz; + long swap_total; + + swap_total = read_meminfo(SWAP_TOTAL); + if (swap_total <= 0) { + WARNING("There is no swap space configured, resizing hugepage pool may fail\n"); + WARNING("Use --add-temp-swap option to temporarily add swap during the resize\n"); + return; + } + + swap_sz = read_meminfo(SWAP_FREE); + /* meminfo keeps values in kb, but we use bytes for hpage sizes */ + swap_sz *= 1024; + if (swap_sz <= gethugepagesize()) { + WARNING("There is very little swap space free, resizing hugepage pool may fail\n"); + WARNING("Use --add-temp-swap option to temporarily add swap during the resize\n"); + } +} + +#define ZONEINFO_LINEBUF 1024 +long recommended_minfreekbytes(void) +{ + FILE *f; + char buf[ZONEINFO_LINEBUF]; + int nr_zones = 0; + long recommended_min; + long pageblock_kbytes = kernel_default_hugepage_size() / 1024; + + /* Detect the number of zones in the system */ + f = fopen(PROCZONEINFO, "r"); + if (f == NULL) { + WARNING("Unable to open " PROCZONEINFO); + return 0; + } + while (fgets(buf, ZONEINFO_LINEBUF, f) != NULL) { + if (strncmp(buf, "Node ", 5) == 0) + nr_zones++; + } + fclose(f); + + /* Make sure at least 2 pageblocks are free for MIGRATE_RESERVE */ + recommended_min = pageblock_kbytes * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. 
 */
	recommended_min += pageblock_kbytes * nr_zones * 3 * 3;
	return recommended_min;
}

/* Write the recommended min_free_kbytes value (or print it, dry run) */
void set_recommended_minfreekbytes(void)
{
	long recommended_min = recommended_minfreekbytes();

	if (opt_dry_run) {
		printf("echo \"%ld\" > %s\n", recommended_min,
			PROCMINFREEKBYTES);
		return;
	}

	DEBUG("Setting min_free_kbytes to %ld\n", recommended_min);
	file_write_ulong(PROCMINFREEKBYTES, (unsigned long)recommended_min);
}

/*
 * check_minfreekbytes does not alter the value of min_free_kbytes. It just
 * reports what the current value is and what it should be
 */
void check_minfreekbytes(void)
{
	long min_free_kbytes = file_read_ulong(PROCMINFREEKBYTES, NULL);
	long recommended_min = recommended_minfreekbytes();

	/* There should be at least one pageblock free per zone in the system */
	if (recommended_min > min_free_kbytes) {
		printf("\n");
		printf("The " PROCMINFREEKBYTES " of %ld is too small. To maximiuse efficiency\n", min_free_kbytes);
		printf("of fragmentation avoidance, there should be at least one huge page free per zone\n");
		printf("in the system which minimally requires a min_free_kbytes value of %ld\n", recommended_min);
	}
}

/* Sum of (maximum pool size * page size) over all pools, in bytes */
unsigned long long recommended_shmmax(void)
{
	struct hpage_pool pools[MAX_POOLS];
	unsigned long long recommended_shmmax = 0;
	int pos, cnt;

	cnt = hpool_sizes(pools, MAX_POOLS);
	if (cnt < 0) {
		ERROR("unable to obtain pools list");
		exit(EXIT_FAILURE);
	}

	for (pos = 0; cnt--; pos++)
		recommended_shmmax += ((unsigned long long)pools[pos].maximum *
			pools[pos].pagesize);

	return recommended_shmmax;
}

/* Raise kernel.shmmax to cover the full huge page pool allocation */
void set_recommended_shmmax(void)
{
	int ret;
	/* shmmax is an unsigned long sysctl, so cap at ULONG_MAX */
	unsigned long max_recommended = -1UL;
	unsigned long long recommended = recommended_shmmax();

	if (recommended == 0) {
		printf("\n");
		WARNING("We can only set a recommended shmmax when huge pages are configured!\n");
		return;
	}

	if (recommended > max_recommended)
		recommended = max_recommended;

	DEBUG("Setting shmmax to %llu\n", recommended);
	ret = file_write_ulong(PROCSHMMAX, (unsigned long)recommended);

	if (!ret) {
		INFO("To make shmmax settings persistent, add the following line to /etc/sysctl.conf:\n");
		INFO(" kernel.shmmax = %llu\n", recommended);
	}
}

/* Report whether kernel.shmmax is large enough for the configured pools */
void check_shmmax(void)
{
	long current_shmmax = file_read_ulong(PROCSHMMAX, NULL);
	long recommended = recommended_shmmax();

	if (current_shmmax != recommended) {
		printf("\n");
		printf("A " PROCSHMMAX " value of %ld bytes may be sub-optimal. To maximise\n", current_shmmax);
		printf("shared memory usage, this should be set to the size of the largest shared memory\n");
		printf("segment size you want to be able to use. Alternatively, set it to a size matching\n");
		printf("the maximum possible allocation size of all huge pages. This can be done\n");
		printf("automatically, using the --set-recommended-shmmax option.\n");
	}

	if (recommended == 0) {
		printf("\n");
		WARNING("We can't make a shmmax recommendation until huge pages are configured!\n");
		return;
	}

	printf("\n");
	printf("The recommended shmmax for your currently allocated huge pages is %ld bytes.\n", recommended);
	printf("To make shmmax settings persistent, add the following line to /etc/sysctl.conf:\n");
	printf(" kernel.shmmax = %ld\n", recommended);
}

/* Write gid to hugetlb_shm_group and tell the user how to persist it */
void set_hugetlb_shm_group(gid_t gid, char *group)
{
	int ret;

	DEBUG("Setting hugetlb_shm_group to %d (%s)\n", gid, group);
	ret = file_write_ulong(PROCHUGETLBGROUP, (unsigned long)gid);

	if (!ret) {
		INFO("To make hugetlb_shm_group settings persistent, add the following line to /etc/sysctl.conf:\n");
		INFO(" vm.hugetlb_shm_group = %d\n", gid);
	}
}

/* heisted from shadow-utils/libmisc/list.c::is_on_list() */
static int user_in_group(char *const *list, const char *member)
{
	while (*list != NULL) {
		if (strcmp(*list, member) == 0) {
			return 1;
		}
		list++;
	}

	return 0;
}

/* Warn when the current user is not allowed to use hugetlb shm segments */
void check_user(void)
{
	uid_t uid;
	gid_t gid;
	struct
passwd *pwd; + struct group *grp; + + gid = (gid_t)file_read_ulong(PROCHUGETLBGROUP, NULL); + grp = getgrgid(gid); + if (!grp) { + printf("\n"); + WARNING("Group ID %d in hugetlb_shm_group doesn't appear to be a valid group!\n", gid); + return; + } + + uid = getuid(); + pwd = getpwuid(uid); + + /* Don't segfault if user does not have a passwd entry. */ + if (!pwd) { + printf("\n"); + WARNING("User uid %d is not in the password file!\n", uid); + return; + } + + if (gid != pwd->pw_gid && !user_in_group(grp->gr_mem, pwd->pw_name) && uid != 0) { + printf("\n"); + WARNING("User %s (uid: %d) is not a member of the hugetlb_shm_group %s (gid: %d)!\n", pwd->pw_name, uid, grp->gr_name, gid); + } else { + printf("\n"); + printf("To make your hugetlb_shm_group settings persistent, add the following line to /etc/sysctl.conf:\n"); + printf(" vm.hugetlb_shm_group = %d\n", gid); + } +} + +void add_temp_swap(long page_size) +{ + char path[PATH_MAX]; + char file[PATH_MAX]; + char mkswap_cmd[PATH_MAX]; + FILE *f; + char *buf; + long swap_size; + long pid; + int ret; + int num_pages; + + if (geteuid() != 0) { + ERROR("Swap can only be manipulated by root\n"); + exit(EXIT_FAILURE); + } + + pid = getpid(); + snprintf(path, PATH_MAX, "%s/swap/temp", MOUNT_DIR); + snprintf(file, PATH_MAX, "%s/swapfile-%ld", path, pid); + + /* swapsize is 5 hugepages */ + if (opt_temp_swap == -1) + num_pages = 5; + else + num_pages = opt_temp_swap; + swap_size = num_pages * page_size; + + if (ensure_dir(path, S_IRWXU | S_IRGRP | S_IXGRP, 0, 0)) + exit(EXIT_FAILURE); + + if (opt_dry_run) { + printf("dd bs=1024 count=%ld if=/dev/zero of=%s\n", + swap_size / 1024, file); + printf("mkswap %s\nswapon %s\n", file, file); + return; + } + + f = fopen(file, "wx"); + if (!f) { + WARNING("Couldn't open %s: %s\n", file, strerror(errno)); + opt_temp_swap = 0; + return; + } + + buf = malloc(swap_size); + memset(buf, 0, swap_size); + fwrite(buf, sizeof(char), swap_size, f); + free(buf); + fclose(f); + + 
snprintf(mkswap_cmd, PATH_MAX, "mkswap %s", file); + ret = system(mkswap_cmd); + if (WIFSIGNALED(ret)) { + WARNING("Call to mkswap failed\n"); + opt_temp_swap = 0; + return; + } else if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + WARNING("Call to mkswap failed\n"); + opt_temp_swap = 0; + return; + } + } + + DEBUG("swapon %s\n", file); + if (swapon(file, 0)) { + WARNING("swapon on %s failed: %s\n", file, strerror(errno)); + opt_temp_swap = 0; + } +} + +void rem_temp_swap() { + char file[PATH_MAX]; + long pid; + + pid = getpid(); + snprintf(file, PATH_MAX, "%s/swap/temp/swapfile-%ld", MOUNT_DIR, pid); + + if (opt_dry_run) { + printf("swapoff %s\nrm -f %s\n", file, file); + return; + } + + if (swapoff(file)) + WARNING("swapoff on %s failed: %s\n", file, strerror(errno)); + remove(file); + DEBUG("swapoff %s\n", file); +} + +void add_ramdisk_swap(long page_size) { + char ramdisk[PATH_MAX]; + char mkswap_cmd[PATH_MAX]; + int disk_num=0; + int count = 0; + long ramdisk_size; + int ret; + int fd; + + snprintf(ramdisk, PATH_MAX, "/dev/ram%i", disk_num); + fd = open(ramdisk, O_RDONLY); + ioctl(fd, BLKGETSIZE, &ramdisk_size); + close(fd); + + ramdisk_size = ramdisk_size * 512; + count = (page_size/ramdisk_size) + 1; + + if (count > 1) { + INFO("Swap will be initialized on multiple ramdisks because\n\ + ramdisk size is less than huge page size. 
To avoid\n\ + this in the future, use kernel command line parameter\n\ + ramdisk_size=N, to set ramdisk size to N blocks.\n"); + } + + while (count > 0) { + snprintf(ramdisk, PATH_MAX, "/dev/ram%i", disk_num); + if (access(ramdisk, F_OK) != 0){ + break; + } + disk_num++; + + if (opt_dry_run) { + printf("mkswap %s\nswapon %s\n", ramdisk, ramdisk); + } else { + snprintf(mkswap_cmd, PATH_MAX, "mkswap %s", ramdisk); + ret = system(mkswap_cmd); + if (WIFSIGNALED(ret)) { + WARNING("Call to mkswap failed\n"); + continue; + } else if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + WARNING("Call to mkswap failed\n"); + continue; + } + } + DEBUG("swapon %s\n", ramdisk); + if (swapon(ramdisk, 0)) { + WARNING("swapon on %s failed: %s\n", ramdisk, strerror(errno)); + opt_temp_swap = 0; + continue; + } + } + count--; + strcat(ramdisk_list, " "); + strcat(ramdisk_list, ramdisk); + } +} + +void rem_ramdisk_swap(){ + char *ramdisk; + char *iter = NULL; + + ramdisk = strtok_r(ramdisk_list, " ", &iter); + while (ramdisk != NULL) { + if (opt_dry_run) { + printf("swapoff %s\n", ramdisk); + } else { + DEBUG("swapoff %s\n", ramdisk); + if (swapoff(ramdisk)) { + WARNING("swapoff on %s failed: %s\n", ramdisk, strerror(errno)); + continue; + } + } + ramdisk = strtok_r(NULL, " ", &iter); + } +} + +void set_trans_opt(const char *file, const char *value) +{ + FILE *f; + + if (geteuid() != 0) { + ERROR("Transparent huge page options can only be set by root\n"); + exit(EXIT_FAILURE); + } + + if (opt_dry_run) { + printf("echo '%s' > %s\n", value, file); + return; + } + + f = fopen(file, "w"); + if (!f) { + ERROR("Couldn't open %s: %s\n", file, strerror(errno)); + return; + } + + fprintf(f, "%s", value); + fclose(f); +} + +enum { + POOL_MIN, + POOL_MAX, + POOL_BOTH, +}; + +static long value_adjust(char *adjust_str, long base, long page_size) +{ + long long adjust; + char *iter; + + /* Convert and validate the adjust. 
 */
	errno = 0;
	adjust = strtol(adjust_str, &iter, 0);
	/* Catch strtol errors and sizes that overflow the native word size */
	if (errno || adjust_str == iter) {
		if (errno == ERANGE)
			errno = EOVERFLOW;
		else
			errno = EINVAL;
		ERROR("%s: invalid adjustment\n", adjust_str);
		exit(EXIT_FAILURE);
	}

	/* size_to_smaller_unit() only works with positive values */
	if (adjust_str[0] == '-')
		adjust = -adjust;

	/* Each suffix case deliberately cascades: G -> M -> K -> pages */
	switch (*iter) {
	case 'G':
	case 'g':
		adjust = size_to_smaller_unit(adjust);
		/* fall through */
	case 'M':
	case 'm':
		adjust = size_to_smaller_unit(adjust);
		/* fall through */
	case 'K':
	case 'k':
		adjust = size_to_smaller_unit(adjust);
		adjust = adjust / page_size;
	}

	/* if previously negative, make negative again */
	if (adjust_str[0] == '-')
		adjust = -adjust;

	/* No +/- prefix means an absolute count: ignore the old base */
	if (adjust_str[0] != '+' && adjust_str[0] != '-')
		base = 0;

	/* Ensure we neither go negative nor exceed LONG_MAX. */
	if (adjust < 0 && -adjust > base) {
		adjust = -base;
	}
	if (adjust > 0 && (base + adjust) < base) {
		adjust = LONG_MAX - base;
	}
	base += adjust;

	DEBUG("Returning page count of %ld\n", base);

	return base;
}


/*
 * Resize a huge page pool.  cmd is "<pagesize>:<adjustment>" and
 * counter selects which limit(s) to move (POOL_MIN/POOL_MAX/POOL_BOTH).
 * Temporary swap is added around shrinking headroom, and --hard
 * retries failed allocations a few times.
 */
void pool_adjust(char *cmd, unsigned int counter)
{
	struct hpage_pool pools[MAX_POOLS];
	int pos;
	int cnt;

	char *iter = NULL;
	char *page_size_str = NULL;
	char *adjust_str = NULL;
	long page_size;

	unsigned long min;
	unsigned long min_orig;
	unsigned long max;
	unsigned long last_pool_value;

	/* Extract the pagesize and adjustment. */
	page_size_str = strtok_r(cmd, ":", &iter);
	if (page_size_str)
		adjust_str = strtok_r(NULL, ":", &iter);

	if (!page_size_str || !adjust_str) {
		ERROR("%s: invalid resize specification\n", cmd);
		exit(EXIT_FAILURE);
	}
	INFO("page_size<%s> adjust<%s> counter<%d>\n",
		page_size_str, adjust_str, counter);

	/* Convert and validate the page_size. */
	if (strcmp(page_size_str, "DEFAULT") == 0)
		page_size = kernel_default_hugepage_size();
	else
		page_size = parse_page_size(page_size_str);

	DEBUG("Working with page_size of %ld\n", page_size);

	cnt = hpool_sizes(pools, MAX_POOLS);
	if (cnt < 0) {
		ERROR("unable to obtain pools list");
		exit(EXIT_FAILURE);
	}
	/* Locate the pool for the requested page size; cnt underflows
	 * to a negative value when no pool matches */
	for (pos = 0; cnt--; pos++) {
		if (pools[pos].pagesize == page_size)
			break;
	}
	if (cnt < 0) {
		ERROR("%s: unknown page size\n", page_size_str);
		exit(EXIT_FAILURE);
	}

	min_orig = min = pools[pos].minimum;
	max = pools[pos].maximum;

	if (counter == POOL_BOTH) {
		min = value_adjust(adjust_str, min, page_size);
		max = min;
	} else if (counter == POOL_MIN) {
		min = value_adjust(adjust_str, min, page_size);
		if (min > max)
			max = min;
	} else {
		max = value_adjust(adjust_str, max, page_size);
		if (max < min)
			min = max;
	}

	INFO("%ld, %ld -> %ld, %ld\n", pools[pos].minimum, pools[pos].maximum,
		min, max);

	/* Grow the overcommit pool first if the gap is widening */
	if ((pools[pos].maximum - pools[pos].minimum) < (max - min)) {
		INFO("setting HUGEPAGES_OC to %ld\n", (max - min));
		set_huge_page_counter(page_size, HUGEPAGES_OC, (max - min));
	}

	if (opt_hard)
		cnt = 5;
	else
		cnt = -1;

	/* Growing the static pool may need swap space to succeed */
	if (min > min_orig) {
		if (opt_temp_swap)
			add_temp_swap(page_size);
		if (opt_ramdisk_swap)
			add_ramdisk_swap(page_size);
		check_swap();
	}

	if (opt_obey_mempolicy && get_huge_page_counter(page_size,
				HUGEPAGES_TOTAL_MEMPOL) < 0) {
		opt_obey_mempolicy = 0;
		WARNING("Counter for NUMA huge page allocations is not found, continuing with normal pool adjustment\n");
	}

	INFO("setting HUGEPAGES_TOTAL%s to %ld\n",
		opt_obey_mempolicy ? "_MEMPOL" : "", min);
	set_huge_page_counter(page_size,
		opt_obey_mempolicy ? HUGEPAGES_TOTAL_MEMPOL : HUGEPAGES_TOTAL,
		min);
	get_pool_size(page_size, &pools[pos]);

	/* If we fail to make an allocation, retry if user requests */
	last_pool_value = pools[pos].minimum;
	while ((pools[pos].minimum != min) && (cnt > 0)) {
		/* Make note if progress is being made and sleep for IO */
		if (last_pool_value == pools[pos].minimum)
			cnt--;
		else
			cnt = 5;
		sleep(6);

		last_pool_value = pools[pos].minimum;
		INFO("Retrying allocation HUGEPAGES_TOTAL%s to %ld current %ld\n", opt_obey_mempolicy ? "_MEMPOL" : "", min, pools[pos].minimum);
		set_huge_page_counter(page_size,
			opt_obey_mempolicy ?
				HUGEPAGES_TOTAL_MEMPOL :
				HUGEPAGES_TOTAL,
			min);
		get_pool_size(page_size, &pools[pos]);
	}

	/* Drop temporary swap unless the user asked for it to persist */
	if (min > min_orig && !opt_swap_persist) {
		if (opt_temp_swap)
			rem_temp_swap();
		else if (opt_ramdisk_swap)
			rem_ramdisk_swap();
	}

	/*
	 * HUGEPAGES_TOTAL is not guarenteed to check to exactly the figure
	 * requested should there be insufficient pages. Check the new
	 * value and adjust HUGEPAGES_OC accordingly.
	 */
	if (pools[pos].minimum != min) {
		WARNING("failed to set pool minimum to %ld became %ld\n",
			min, pools[pos].minimum);
		min = pools[pos].minimum;
	}
	if (pools[pos].maximum != max) {
		INFO("setting HUGEPAGES_OC to %ld\n", (max - min));
		set_huge_page_counter(page_size, HUGEPAGES_OC, (max - min));
	}
}

/*
 * Print available huge page sizes: all hardware-supported sizes when
 * 'all' is set, otherwise only sizes with a configured pool and an
 * accessible mount point.
 */
void page_sizes(int all)
{
	struct hpage_pool pools[MAX_POOLS];
	int pos;
	int cnt;

	cnt = hpool_sizes(pools, MAX_POOLS);
	if (cnt < 0) {
		ERROR("unable to obtain pools list");
		exit(EXIT_FAILURE);
	}
	qsort(pools, cnt, sizeof(pools[0]), cmpsizes);

	for (pos = 0; cnt--; pos++) {
		if (all || (pools[pos].maximum &&
		    hugetlbfs_find_path_for_size(pools[pos].pagesize)))
			printf("%ld\n", pools[pos].pagesize);
	}
}

/* Summarize system huge page state for the --explain option */
void explain()
{
	show_mem();
	mounts_list_all();
	printf("\nHuge page pools:\n");
	pool_list();
	printf("\nHuge page sizes with configured pools:\n");
	page_sizes(0);
	check_minfreekbytes();
	check_shmmax();
	check_swap();
	check_user();
	printf("\nNote: Permanent swap space should be preferred when dynamic "
		"huge page pools are used.\n");
}

int main(int argc, char** argv)
{
	int ops;
	int has_hugepages = kernel_has_hugepages();

	char opts[] = "+hdv";
	char base[PATH_MAX];
	char *opt_min_adj[MAX_POOLS], *opt_max_adj[MAX_POOLS];
	char *opt_user_mounts = NULL, *opt_group_mounts = NULL;
	int opt_list_mounts = 0, opt_pool_list = 0, opt_create_mounts = 0;
	int opt_global_mounts = 0, opt_pgsizes = 0, opt_pgsizes_all = 0;
	int opt_explain = 0, minadj_count = 0, maxadj_count = 0;
	int opt_trans_always = 0, opt_trans_never = 0, opt_trans_madvise = 0;
	int opt_khuge_pages = 0, opt_khuge_scan = 0, opt_khuge_alloc = 0;
	int ret = 0, index = 0;
	char *khuge_pages = NULL, *khuge_alloc = NULL, *khuge_scan = NULL;
	gid_t opt_gid = 0;
	struct group *opt_grp = NULL;
	int group_invalid = 0;
	struct option long_opts[] = {
		{"help", no_argument, NULL, 'h'},
		{"verbose", required_argument, NULL, 'v' },

		{"list-all-mounts", no_argument, NULL, LONG_LIST_ALL_MOUNTS},
		{"pool-list", no_argument, NULL, LONG_POOL_LIST},
		{"pool-pages-min", required_argument, NULL, LONG_POOL_MIN_ADJ},
		{"pool-pages-max", required_argument, NULL, LONG_POOL_MAX_ADJ},
		{"obey-mempolicy", no_argument, NULL, LONG_POOL_MEMPOL},
		{"thp-always", no_argument, NULL, LONG_TRANS_ALWAYS},
		{"thp-madvise", no_argument, NULL, LONG_TRANS_MADVISE},
		{"thp-never", no_argument, NULL, LONG_TRANS_NEVER},
		{"thp-khugepaged-pages", required_argument, NULL, LONG_KHUGE_PAGES},
		{"thp-khugepaged-scan-sleep", required_argument, NULL, LONG_KHUGE_SCAN},
		{"thp-khugepaged-alloc-sleep", required_argument, NULL, LONG_KHUGE_ALLOC},
		{"set-recommended-min_free_kbytes", no_argument, NULL, LONG_SET_RECOMMENDED_MINFREEKBYTES},
		{"set-recommended-shmmax", no_argument, NULL, LONG_SET_RECOMMENDED_SHMMAX},
		{"set-shm-group", required_argument, NULL, LONG_SET_HUGETLB_SHM_GROUP},
		{"enable-zone-movable", no_argument, NULL, LONG_MOVABLE_ENABLE},
		{"disable-zone-movable", no_argument, NULL, LONG_MOVABLE_DISABLE},
		{"hard", no_argument, NULL, LONG_HARD},
		{"add-temp-swap", optional_argument, NULL, LONG_SWAP_DISK},
		{"add-ramdisk-swap", no_argument, NULL, LONG_SWAP_RAMDISK},
		{"persist", no_argument, NULL, LONG_SWAP_PERSIST},
		{"create-mounts", no_argument, NULL, LONG_CREATE_MOUNTS},
		{"create-user-mounts", required_argument, NULL, LONG_CREATE_USER_MOUNTS},
		{"create-group-mounts", required_argument, NULL, LONG_CREATE_GROUP_MOUNTS},
		{"create-global-mounts", no_argument, NULL, LONG_CREATE_GLOBAL_MOUNTS},

		{"max-size", required_argument, NULL, LONG_LIMIT_SIZE},
		{"max-inodes", required_argument, NULL, LONG_LIMIT_INODES},

		{"page-sizes", no_argument, NULL, LONG_PAGE_SIZES},
		{"page-sizes-all", no_argument, NULL, LONG_PAGE_AVAIL},
		{"dry-run", no_argument, NULL, 'd'},
		{"explain", no_argument, NULL, LONG_EXPLAIN},

		{0},
	};

	hugetlbfs_setup_debug();
	setup_mounts();
	verbose_init();

	ops
= 0; + while (ret != -1) { + ret = getopt_long(argc, argv, opts, long_opts, &index); + switch (ret) { + case -1: + break; + + case '?': + print_usage(); + exit(EXIT_FAILURE); + + case 'h': + print_usage(); + exit(EXIT_SUCCESS); + + case 'v': + verbose(optarg); + continue; + + case 'd': + opt_dry_run = 1; + continue; + + default: + /* All other commands require hugepage support. */ + if (! has_hugepages) { + ERROR("kernel does not support huge pages\n"); + exit(EXIT_FAILURE); + } + } + switch (ret) { + case -1: + break; + + case LONG_HARD: + opt_hard = 1; + continue; + + case LONG_SWAP_DISK: + if (optarg) + opt_temp_swap = atoi(optarg); + else + opt_temp_swap = -1; + break; + + case LONG_SWAP_RAMDISK: + opt_ramdisk_swap = 1; + break; + + case LONG_SWAP_PERSIST: + opt_swap_persist = 1; + + case LONG_LIST_ALL_MOUNTS: + opt_list_mounts = 1; + break; + + case LONG_POOL_LIST: + opt_pool_list = 1; + break; + + case LONG_POOL_MIN_ADJ: + if (minadj_count == MAX_POOLS) { + WARNING("Attempting to adjust an invalid " + "pool or a pool multiple times, " + "ignoring request: '%s'\n", optarg); + } else { + opt_min_adj[minadj_count++] = optarg; + } + break; + + case LONG_POOL_MEMPOL: + opt_obey_mempolicy = 1; + break; + + case LONG_TRANS_ALWAYS: + opt_trans_always = 1; + break; + + case LONG_TRANS_MADVISE: + opt_trans_madvise = 1; + break; + + case LONG_TRANS_NEVER: + opt_trans_never = 1; + break; + + case LONG_KHUGE_PAGES: + opt_khuge_pages = 1; + khuge_pages = optarg; + break; + + case LONG_KHUGE_SCAN: + opt_khuge_scan = 1; + khuge_scan = optarg; + break; + + case LONG_KHUGE_ALLOC: + opt_khuge_alloc = 1; + khuge_alloc = optarg; + break; + + case LONG_POOL_MAX_ADJ: + if (! 
kernel_has_overcommit()) { + ERROR("kernel does not support overcommit, " + "max cannot be adjusted\n"); + exit(EXIT_FAILURE); + } + + if (maxadj_count == MAX_POOLS) { + WARNING("Attempting to adjust an invalid " + "pool or a pool multiple times, " + "ignoring request: '%s'\n", optarg); + } else { + opt_max_adj[maxadj_count++] = optarg; + } + break; + + case LONG_MOVABLE_ENABLE: + opt_movable = 1; + break; + + case LONG_SET_RECOMMENDED_MINFREEKBYTES: + opt_set_recommended_minfreekbytes = 1; + break; + + case LONG_SET_RECOMMENDED_SHMMAX: + opt_set_recommended_shmmax = 1; + break; + + case LONG_SET_HUGETLB_SHM_GROUP: + opt_grp = getgrnam(optarg); + if (!opt_grp) { + opt_gid = atoi(optarg); + if (opt_gid == 0 && strcmp(optarg, "0")) + group_invalid = 1; + opt_grp = getgrgid(opt_gid); + if (!opt_grp) + group_invalid = 1; + } else { + opt_gid = opt_grp->gr_gid; + } + if (group_invalid) { + ERROR("Invalid group specification (%s)\n", optarg); + exit(EXIT_FAILURE); + } + opt_set_hugetlb_shm_group = 1; + break; + + case LONG_MOVABLE_DISABLE: + opt_movable = 0; + break; + + case LONG_CREATE_MOUNTS: + opt_create_mounts = 1; + break; + + case LONG_CREATE_USER_MOUNTS: + opt_user_mounts = optarg; + break; + + case LONG_CREATE_GROUP_MOUNTS: + opt_group_mounts = optarg; + break; + + case LONG_CREATE_GLOBAL_MOUNTS: + opt_global_mounts = 1; + break; + + case LONG_LIMIT_SIZE: + /* Not a pagesize, but the conversions the same */ + opt_limit_mount_size = parse_page_size(optarg); + if (!opt_limit_mount_size) + WARNING("Mount max size specification 0, invalid or overflowed\n"); + break; + + case LONG_LIMIT_INODES: + opt_limit_mount_inodes = atoi(optarg); + break; + + case LONG_PAGE_SIZES: + opt_pgsizes = 1; + break; + + case LONG_PAGE_AVAIL: + opt_pgsizes_all = 1; + break; + + case LONG_EXPLAIN: + opt_explain = 1; + break; + + default: + WARNING("unparsed option %08x\n", ret); + ret = -1; + break; + } + if (ret != -1) + ops++; + } + + verbose_expose(); + + if (opt_list_mounts) + 
mounts_list_all(); + + if (opt_pool_list) + pool_list(); + + if (opt_movable != -1) + setup_zone_movable(opt_movable); + + if (opt_trans_always) + set_trans_opt(TRANS_ENABLE, ALWAYS); + + if (opt_trans_madvise) + set_trans_opt(TRANS_ENABLE, MADVISE); + + if (opt_trans_never) + set_trans_opt(TRANS_ENABLE, NEVER); + + if (opt_khuge_pages) + set_trans_opt(KHUGE_SCAN_PAGES, khuge_pages); + + if (opt_khuge_alloc) + set_trans_opt(KHUGE_ALLOC_SLEEP, khuge_alloc); + + if (opt_khuge_scan) + set_trans_opt(KHUGE_SCAN_SLEEP, khuge_scan); + + if (opt_set_recommended_minfreekbytes) + set_recommended_minfreekbytes(); + + if (opt_set_recommended_shmmax) + set_recommended_shmmax(); + + if (opt_set_hugetlb_shm_group) + set_hugetlb_shm_group(opt_gid, opt_grp->gr_name); + + while (--minadj_count >= 0) { + if (! kernel_has_overcommit()) + pool_adjust(opt_min_adj[minadj_count], POOL_BOTH); + else + pool_adjust(opt_min_adj[minadj_count], POOL_MIN); + } + + while (--maxadj_count >=0) + pool_adjust(opt_max_adj[maxadj_count], POOL_MAX); + + if (opt_create_mounts) { + snprintf(base, PATH_MAX, "%s", MOUNT_DIR); + create_mounts(NULL, NULL, base, S_IRWXU | S_IRWXG); + } + + + if (opt_user_mounts != NULL) { + snprintf(base, PATH_MAX, "%s/user", MOUNT_DIR); + create_mounts(opt_user_mounts, NULL, base, S_IRWXU); + } + + if (opt_group_mounts) { + snprintf(base, PATH_MAX, "%s/group", MOUNT_DIR); + create_mounts(NULL, opt_group_mounts, base, S_IRWXG); + } + + if (opt_global_mounts) { + snprintf(base, PATH_MAX, "%s/global", MOUNT_DIR); + create_mounts(NULL, NULL, base, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX ); + } + + if (opt_pgsizes) + page_sizes(0); + + if (opt_pgsizes_all) + page_sizes(1); + + if (opt_explain) + explain(); + + index = optind; + + if ((argc - index) != 0 || ops == 0) { + print_usage(); + exit(EXIT_FAILURE); + } + + exit(EXIT_SUCCESS); +} diff --git a/hugectl.c b/hugectl.c new file mode 100644 index 0000000..8f05595 --- /dev/null +++ b/hugectl.c @@ -0,0 +1,488 @@ 
+/***************************************************************************
+ *   User front end for using huge pages Copyright (C) 2008, IBM          *
+ *                                                                        *
+ *   This program is free software; you can redistribute it and/or modify *
+ *   it under the terms of the Lesser GNU General Public License as       *
+ *   published by the Free Software Foundation; either version 2.1 of the *
+ *   License, or (at your option) any later version.                      *
+ *                                                                        *
+ *   This program is distributed in the hope that it will be useful,      *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
+ *   GNU Lesser General Public License for more details.                  *
+ *                                                                        *
+ *   You should have received a copy of the Lesser GNU General Public     *
+ *   License along with this program; if not, write to the                *
+ *   Free Software Foundation, Inc.,                                      *
+ *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.             *
+ ***************************************************************************/
+
+/*
+ * hugectl is inspired by numactl as a single front end to a large number of
+ * options for controlling a very specific environment.  Eventually it will
+ * have support for controlling all of the environment variables for
+ * libhugetlbfs, but options will only be added after they have been in the
+ * library for some time and are thoroughly tested and stable.
+ *
+ * This program should be treated as an ABI for using libhugetlbfs.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/types.h>
+
+#define _GNU_SOURCE /* for getopt_long */
+#include <unistd.h>
+#include <getopt.h>
+
+#define REPORT(level, prefix, format, ...) 
\ + do { \ + if (verbose_level >= level) \ + fprintf(stderr, "hugectl: " prefix ": " format, \ + ##__VA_ARGS__); \ + } while (0); + +#include "libhugetlbfs_debug.h" + +extern int errno; +extern int optind; +extern char *optarg; + +#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text) +#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text) + +void print_usage() +{ + fprintf(stderr, "hugectl [options] target\n"); + fprintf(stderr, "options:\n"); + + OPTION("--help, -h", "Prints this message"); + OPTION("--verbose , -v", "Increases/sets tracing levels"); + + OPTION("--text[=]", "Requests remapping of the program text"); + OPTION("--data[=]", "Requests remapping of the program data"); + OPTION("--bss[=]", "Requests remapping of the program bss"); + OPTION("--heap[=]", "Requests remapping of the program heap"); + CONT("(malloc space)"); + OPTION("--shm", "Requests remapping of shared memory segments"); + OPTION("--thp", "Setup the heap space to be aligned for merging"); + CONT("by khugepaged into huge pages. 
This requires");
+	CONT("kernel support for transparent huge pages to be");
+	CONT("enabled");
+
+	OPTION("--no-preload", "Disable preloading the libhugetlbfs library");
+	OPTION("--no-reserve", "Disable huge page reservation for segments");
+	OPTION("--force-preload", "Force preloading the libhugetlbfs library");
+
+	OPTION("--dry-run", "describe what would be done without doing it");
+
+	OPTION("--library-use-path", "Use the system library path");
+	OPTION("--share-text", "Share text segments between multiple");
+	CONT("application instances");
+	OPTION("--library-path <path>", "Select a library prefix");
+	CONT("(Default: "
+#ifdef LIBDIR32
+		LIBDIR32 ":"
+#endif
+#ifdef LIBDIR64
+		LIBDIR64 ":"
+#endif
+		")");
+}
+
+int opt_dry_run = 0;
+int opt_force_preload = 0;
+int verbose_level = VERBOSITY_DEFAULT;
+
+void verbose_init(void)
+{
+	char *env;
+
+	env = getenv("HUGETLB_VERBOSE");
+	if (env)
+		verbose_level = atoi(env);
+	env = getenv("HUGETLB_DEBUG");
+	if (env)
+		verbose_level = VERBOSITY_MAX;
+}
+
+void verbose(char *which)
+{
+	int new_level;
+
+	if (which) {
+		new_level = atoi(which);
+		if (new_level < 0 || new_level > 99) {
+			ERROR("%d: verbosity out of range 0-99\n",
+				new_level);
+			exit(EXIT_FAILURE);
+		}
+	} else {
+		new_level = verbose_level + 1;
+		if (new_level == 100) {
+			WARNING("verbosity limited to 99\n");
+			new_level--;
+		}
+	}
+	verbose_level = new_level;
+}
+
+void quiet(void)
+{
+	int new_level = verbose_level - 1;
+	if (new_level < 0) {
+		WARNING("verbosity must be at least 0\n");
+		new_level = 0;
+	}
+	verbose_level = new_level;
+}
+
+void setup_environment(char *var, char *val)
+{
+	setenv(var, val, 1);
+	INFO("%s='%s'\n", var, val);
+
+	if (opt_dry_run)
+		printf("%s='%s'\n", var, val);
+}
+
+void verbose_expose(void)
+{
+	char level[3];
+
+	if (verbose_level == 99) {
+		setup_environment("HUGETLB_DEBUG", "yes");
+	}
+	snprintf(level, sizeof(level), "%d", verbose_level);
+	setup_environment("HUGETLB_VERBOSE", level);
+}
+
+/*
+ * getopts return 
values for options which are long only. + */ +#define MAP_BASE 0x1000 +#define LONG_BASE 0x2000 + +#define LONG_NO_PRELOAD (LONG_BASE | 'p') +#define LONG_NO_RESERVE (LONG_BASE | 'r') +#define LONG_FORCE_PRELOAD (LONG_BASE | 'F') + +#define LONG_DRY_RUN (LONG_BASE | 'd') + +#define LONG_SHARE (LONG_BASE | 's') +#define LONG_NO_LIBRARY (LONG_BASE | 'L') +#define LONG_LIBRARY (LONG_BASE | 'l') + +#define LONG_THP_HEAP ('t') + +/* + * Mapping selectors, one per remappable/backable area as requested + * by the user. These are also used as returns from getopts where they + * are offset from MAP_BASE, which must be removed before they are compared. + */ +enum { + MAP_TEXT, + MAP_DATA, + MAP_BSS, + MAP_HEAP, + MAP_SHM, + MAP_DISABLE, + + MAP_COUNT, +}; +char *map_size[MAP_COUNT]; + +char default_size[] = "the default hugepage size"; +#define DEFAULT_SIZE default_size + +#define available(buf, ptr) ((int)(sizeof(buf) - (ptr - buf))) +void setup_mappings(int count) +{ + char value[128]; + char *ptr = value; + int needed; + + /* + * HUGETLB_ELFMAP should be set to either a combination of 'R' and 'W' + * which indicate which segments should be remapped. Each may take + * an optional page size. It may also be set to 'no' to prevent + * remapping. + */ + + /* + * Accumulate sections each with a ':' prefix to simplify later + * handling. We will elide the initial ':' before use. 
+ */ + if (map_size[MAP_TEXT]) { + if (map_size[MAP_TEXT] == DEFAULT_SIZE) + needed = snprintf(ptr, available(value, ptr), ":R"); + else + needed = snprintf(ptr, available(value, ptr), + ":R=%s", map_size[MAP_TEXT]); + ptr += needed; + if (needed < 0 || available(value, ptr) < 0) { + ERROR("%s: bad size specification\n", map_size[MAP_TEXT]); + exit(EXIT_FAILURE); + } + } + if (map_size[MAP_DATA] != 0 || map_size[MAP_BSS] != 0) { + char *size = map_size[MAP_BSS]; + if (map_size[MAP_DATA]) + size = map_size[MAP_DATA]; + if (map_size[MAP_DATA] != map_size[MAP_BSS]) + WARNING("data and bss remapped together in %s\n", size); + + if (size == DEFAULT_SIZE) + needed = snprintf(ptr, available(value, ptr), ":W"); + else + needed = snprintf(ptr, available(value, ptr), + ":W=%s", size); + ptr += needed; + if (needed < 0 || available(value, ptr) < 0) { + ERROR("%s: bad size specification\n", size); + exit(EXIT_FAILURE); + } + } + *ptr = '\0'; + if (ptr != value) + setup_environment("HUGETLB_ELFMAP", &value[1]); + + if (map_size[MAP_DISABLE]) { + if (ptr != value) + WARNING("--disable masks requested remap\n"); + setup_environment("HUGETLB_ELFMAP", "no"); + } + + if (map_size[MAP_HEAP] == DEFAULT_SIZE) + setup_environment("HUGETLB_MORECORE", "yes"); + else if (map_size[MAP_HEAP]) + setup_environment("HUGETLB_MORECORE", map_size[MAP_HEAP]); + + if (map_size[MAP_SHM] && map_size[MAP_SHM] != DEFAULT_SIZE) + WARNING("shm segments may only be mapped in the " + "default hugepage size\n"); + if (map_size[MAP_SHM]) + setup_environment("HUGETLB_SHM", "yes"); +} + +#define LIBRARY_DISABLE ((void *)-1) + +void library_path(char *path) +{ + char val[PATH_MAX] = ""; + char *env; + + env = getenv("LD_LIBRARY_PATH"); + + /* + * Select which libraries we wish to use. If the path is NULL + * use the libraries included with hugectl. If the path is valid + * and points to a directory including a libhugetlbfs.so use it + * directly. 
Else path is assumed to be a prefix to the 32/64 bit + * directories both of which are added, where available. + */ + if (path) { + snprintf(val, sizeof(val), "%s/libhugetlbfs.so", path); + if (access(val, F_OK) == 0) { + /* $PATH */ + snprintf(val, sizeof(val), "%s:%s", + path, env ? env : ""); + + } else { + /* [$PATH/LIB32:][$PATH/LIB64:]$LD_LIBRARY_PATH */ + snprintf(val, sizeof(val), "" +#ifdef LIBDIR32 + "%s/" LIB32 ":" +#endif +#ifdef LIBDIR64 + "%s/" LIB64 ":" +#endif + "%s", +#ifdef LIBDIR32 + path, +#endif +#ifdef LIBDIR64 + path, +#endif + env ? env : ""); + } + + } else { + /* [LIBDIR32:][LIBDIR64:]$LD_LIBRARY_PATH */ + snprintf(val, sizeof(val), "" +#ifdef LIBDIR32 + LIBDIR32 ":" +#endif +#ifdef LIBDIR64 + LIBDIR64 ":" +#endif + "%s", env ? env : ""); + } + setup_environment("LD_LIBRARY_PATH", val); +} + +void ldpreload(int count) +{ + int allowed = 0; + + if (map_size[MAP_HEAP]) + allowed++; + if (map_size[MAP_SHM]) + allowed++; + + if ((allowed == count) || opt_force_preload) { + setup_environment("LD_PRELOAD", "libhugetlbfs.so"); + if (allowed == count) + INFO("LD_PRELOAD in use for lone --heap/--shm\n"); + } else { + WARNING("LD_PRELOAD not appropriate for this map combination\n"); + } +} + +int main(int argc, char** argv) +{ + int opt_mappings = 0; + int opt_preload = 1; + int opt_no_reserve = 0; + int opt_share = 0; + int opt_thp_heap = 0; + char *opt_library = NULL; + + char opts[] = "+hvq"; + int ret = 0, index = 0; + struct option long_opts[] = { + {"help", no_argument, NULL, 'h'}, + {"verbose", required_argument, NULL, 'v' }, + {"no-preload", no_argument, NULL, LONG_NO_PRELOAD}, + {"no-reserve", no_argument, NULL, LONG_NO_RESERVE}, + {"force-preload", + no_argument, NULL, LONG_FORCE_PRELOAD}, + {"dry-run", no_argument, NULL, LONG_DRY_RUN}, + {"library-path", + required_argument, NULL, LONG_LIBRARY}, + {"library-use-path", + no_argument, NULL, LONG_NO_LIBRARY}, + {"share-text", no_argument, NULL, LONG_SHARE}, + + {"disable", optional_argument, 
NULL, MAP_BASE|MAP_DISABLE}, + {"text", optional_argument, NULL, MAP_BASE|MAP_TEXT}, + {"data", optional_argument, NULL, MAP_BASE|MAP_DATA}, + {"bss", optional_argument, NULL, MAP_BASE|MAP_BSS}, + {"heap", optional_argument, NULL, MAP_BASE|MAP_HEAP}, + {"shm", optional_argument, NULL, MAP_BASE|MAP_SHM}, + {"thp", no_argument, NULL, LONG_THP_HEAP}, + {0}, + }; + + verbose_init(); + + while (ret != -1) { + ret = getopt_long(argc, argv, opts, long_opts, &index); + if (ret > 0 && (ret & MAP_BASE)) { + if (optarg) + map_size[ret & ~MAP_BASE] = optarg; + else + map_size[ret & ~MAP_BASE] = DEFAULT_SIZE; + opt_mappings++; + continue; + } + switch (ret) { + case '?': + print_usage(); + exit(EXIT_FAILURE); + + case 'h': + print_usage(); + exit(EXIT_SUCCESS); + + case 'v': + verbose(optarg); + break; + + case 'q': + quiet(); + break; + + case LONG_THP_HEAP: + opt_thp_heap = 1; + INFO("Aligning heap for use with THP\n"); + break; + + case LONG_NO_PRELOAD: + opt_preload = 0; + INFO("LD_PRELOAD disabled\n"); + break; + + case LONG_NO_RESERVE: + opt_no_reserve = 1; + INFO("MAP_NORESERVE used for huge page mappings\n"); + break; + + case LONG_FORCE_PRELOAD: + opt_preload = 1; + opt_force_preload = 1; + INFO("Forcing ld preload\n"); + break; + + case LONG_DRY_RUN: + opt_dry_run = 1; + break; + + case LONG_NO_LIBRARY: + opt_library = LIBRARY_DISABLE; + INFO("using LD_LIBRARY_PATH to find library\n"); + break; + + case LONG_LIBRARY: + opt_library = optarg; + break; + + case LONG_SHARE: + opt_share = 1; + break; + + case -1: + break; + + default: + WARNING("unparsed option %08x\n", ret); + ret = -1; + break; + } + } + index = optind; + + if (!opt_dry_run && (argc - index) < 1) { + print_usage(); + exit(EXIT_FAILURE); + } + + verbose_expose(); + + if (opt_library != LIBRARY_DISABLE) + library_path(opt_library); + + if (opt_mappings) + setup_mappings(opt_mappings); + + if (opt_preload) + ldpreload(opt_mappings); + + if (opt_no_reserve) + setup_environment("HUGETLB_NO_RESERVE", "yes"); + 
+ if (opt_share) + setup_environment("HUGETLB_SHARE", "1"); + + if (opt_thp_heap) + setup_environment("HUGETLB_MORECORE", "thp"); + + if (opt_dry_run) + exit(EXIT_SUCCESS); + + execvp(argv[index], &argv[index]); + ERROR("exec failed: %s\n", strerror(errno)); + exit(EXIT_FAILURE); +} diff --git a/hugeedit.c b/hugeedit.c new file mode 100644 index 0000000..bd66e77 --- /dev/null +++ b/hugeedit.c @@ -0,0 +1,244 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Eventually we plan to use the libhugetlbfs reporting facility, + * but until that is possible, redefine a simpler version here. + */ +#define REPORT(level, prefix, format, ...) \ + do { \ + fprintf(stderr, "hugeedit: " prefix ": " format, \ + ##__VA_ARGS__); \ + } while (0) + +#include "libhugetlbfs_internal.h" + +/* + * All MAP_* options are tagged with MAP_BASE to differentiate them as options + * in the options parser. This must be removed before they are compared. 
+ */ +#define MAP_BASE 0x1000 +#define MAP_DISABLE 0x0001 +#define MAP_TEXT 0x0002 +#define MAP_DATA 0x0004 + +#define PF_LINUX_HUGETLB 0x100000 +extern int optind; +extern char *optarg; + +#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text) +#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text) + +void print_usage() +{ + fprintf(stderr, "hugeedit [options] target\n"); + fprintf(stderr, "options:\n"); + OPTION("--text", "Remap program text into huge pages by default"); + OPTION("--data", "Remap program data into huge pages by default"); + OPTION("--disable", "Remap no segments into huge pages by default"); + OPTION("--help, -h", "Print this usage information"); +} + +int check_elf_wordsize(void *ehdr) +{ + char *e_ident = (char *) ehdr; + + if (strncmp(e_ident, ELFMAG, SELFMAG)) { + ERROR("Not a valid ELF executable\n"); + exit(EXIT_FAILURE); + } + + switch (e_ident[EI_CLASS]) { + case ELFCLASS32: + case ELFCLASS64: + return e_ident[EI_CLASS]; + default: + ERROR("Can not determine word size\n"); + exit(EXIT_FAILURE); + } +} + +/* + * We need to map enough of the binary so that we can access the ELF header and + * all of the program headers. This function takes a pointer to the first page + * of ELF headers which is guaranteed to be enough data to determine if we need + * to map more of the binary. Use mremap to enlarge the mapping if needed. 
+ * + * void **elf - may be updated with a new address if mremap moved it + * unsigned long *size - may be updated with the new mapping size + */ +#define elf_ph_end_offset(e) ((e)->e_phoff + (e)->e_phentsize * (e)->e_phnum) +void check_remap_elf(void **elf, unsigned long *size, int wordsize) +{ + unsigned long newsize; + int pagesize = getpagesize(); + + if (wordsize == ELFCLASS32) { + Elf32_Ehdr *ehdr = *(Elf32_Ehdr **) elf; + newsize = elf_ph_end_offset(ehdr); + } else { + Elf64_Ehdr *ehdr = *(Elf64_Ehdr **) elf; + newsize = elf_ph_end_offset(ehdr); + } + newsize = ALIGN_UP(newsize, pagesize); + + if (newsize > *size) { + *size = newsize; + *elf = mremap(*elf, *size, newsize, MREMAP_MAYMOVE); + if (*elf == MAP_FAILED) { + ERROR("Remapping failed: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + } +} + +#define is_text(p) ((((p)->p_flags & (PF_R|PF_W|PF_X)) == (PF_R|PF_X)) && \ + ((p)->p_memsz == (p)->p_filesz)) +#if defined(__powerpc__) && !defined(__powerpc64__) +#define is_data(p) (((p)->p_flags & (PF_R|PF_W|PF_X)) == (PF_R|PF_W|PF_X)) +#else +#define is_data(p) (((p)->p_flags & (PF_R|PF_W|PF_X)) == (PF_R|PF_W)) +#endif + +#define update_phdrs(_BITS_) \ +void update_phdrs##_BITS_(Elf##_BITS_##_Ehdr *ehdr, int remap_opts) \ +{ \ + int i; \ + Elf##_BITS_##_Phdr *phdr; \ + unsigned long long start, end; \ + \ + phdr = (Elf##_BITS_##_Phdr *)((char *)ehdr + ehdr->e_phoff); \ + for (i = 0; i < ehdr->e_phnum; i++) { \ + if (phdr[i].p_type != PT_LOAD) \ + continue; \ + if (remap_opts) \ + phdr[i].p_flags &= ~PF_LINUX_HUGETLB; \ + if ((remap_opts & MAP_TEXT) && is_text(&phdr[i])) \ + phdr[i].p_flags |= PF_LINUX_HUGETLB; \ + if ((remap_opts & MAP_DATA) && is_data(&phdr[i])) \ + phdr[i].p_flags |= PF_LINUX_HUGETLB; \ + start = (unsigned long long) phdr[i].p_vaddr; \ + end = start + phdr[i].p_memsz; \ + printf("Segment %i 0x%llx - 0x%llx (%s%s) default is " \ + "%s pages\n", i, start, end, \ + is_text(&phdr[i]) ? "TEXT" : "", \ + is_data(&phdr[i]) ? 
"DATA" : "", \ + (phdr[i].p_flags & PF_LINUX_HUGETLB) ? \ + "HUGE" : "BASE"); \ + } \ +} +update_phdrs(32) +update_phdrs(64) + +int main(int argc, char ** argv) +{ + char opts[] = "+h"; + struct option long_opts[] = { + {"help", no_argument, NULL, 'h'}, + {"disable", no_argument, NULL, MAP_BASE|MAP_DISABLE}, + {"text", no_argument, NULL, MAP_BASE|MAP_TEXT}, + {"data", no_argument, NULL, MAP_BASE|MAP_DATA}, + {0}, + }; + int ret = 0, index = 0, remap_opts = 0; + int fd; + const char *target; + void *ehdr; + unsigned long mapsize = getpagesize(); + int target_wordsize; + + while (ret != -1) { + ret = getopt_long(argc, argv, opts, long_opts, &index); + if (ret > 0 && (ret & MAP_BASE)) { + remap_opts |= ret; + continue; + } + switch (ret) { + case '?': + print_usage(); + exit(EXIT_FAILURE); + case 'h': + print_usage(); + exit(EXIT_SUCCESS); + + default: + ret = -1; + break; + } + } + index = optind; + remap_opts &= ~MAP_BASE; + if (remap_opts & MAP_DISABLE && remap_opts != MAP_DISABLE) { + ERROR("--disable is not compatible with --text or --data\n"); + exit(EXIT_FAILURE); + } + + if ((argc - index) != 1) { + print_usage(); + exit(EXIT_FAILURE); + } + target = argv[index]; + + /* We don't need write access unless we plan to alter the binary */ + fd = open(target, (remap_opts ? O_RDWR : O_RDONLY)); + if (fd < 0) { + ERROR("Opening %s failed: %s\n", target, strerror(errno)); + exit(EXIT_FAILURE); + } + + ehdr = mmap(NULL, mapsize, PROT_READ | (remap_opts ? 
PROT_WRITE : 0), + MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + ERROR("Mapping %s failed: %s\n", target, strerror(errno)); + exit(EXIT_FAILURE); + } + + target_wordsize = check_elf_wordsize(ehdr); + check_remap_elf(&ehdr, &mapsize, target_wordsize); + if (target_wordsize == ELFCLASS64) + update_phdrs64((Elf64_Ehdr *) ehdr, remap_opts); + else + update_phdrs32((Elf32_Ehdr *) ehdr, remap_opts); + + if (munmap(ehdr, mapsize) != 0) { + ERROR("Unmapping %s failed: %s\n", target, strerror(errno)); + exit(EXIT_FAILURE); + } + if (close(fd) != 0) { + ERROR("Final close of %s failed: %s -- possible data loss!\n", + target, strerror(errno)); + exit(EXIT_FAILURE); + } + + exit(EXIT_SUCCESS); +} diff --git a/hugetlbfs.h b/hugetlbfs.h new file mode 100644 index 0000000..ecd178b --- /dev/null +++ b/hugetlbfs.h @@ -0,0 +1,79 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This file should only contain definitions of functions, data types, and + * constants which are part of the published libhugetlfs API. Functions + * exported here must also be listed in version.lds. 
+ */ + +#ifndef _HUGETLBFS_H +#define _HUGETLBFS_H + +#define HUGETLBFS_MAGIC 0x958458f6 + +long gethugepagesize(void); +int gethugepagesizes(long pagesizes[], int n_elem); +int getpagesizes(long pagesizes[], int n_elem); +int hugetlbfs_test_path(const char *mount); +const char *hugetlbfs_find_path(void); +const char *hugetlbfs_find_path_for_size(long page_size); +int hugetlbfs_unlinked_fd(void); +int hugetlbfs_unlinked_fd_for_size(long page_size); + +#define PF_LINUX_HUGETLB 0x100000 + +/* + * Direct hugepage allocation flags and types + * + * GHP_DEFAULT - Use the default hugepage size to back the region + */ +typedef unsigned long ghp_t; +#define GHP_DEFAULT ((ghp_t)0x01UL) +#define GHP_MASK (GHP_DEFAULT) + +/* Direct alloc functions for hugepages */ +void *get_huge_pages(size_t len, ghp_t flags); +void free_huge_pages(void *ptr); + +/* + * Region alloc flags and types + * + * GHR_DEFAULT - Use a combination of flags deemed to be a sensible default + * by the current implementation of the library + * GHR_FALLBACK - Use the default hugepage size if possible but fallback to + * smaller pages if necessary + * GHR_STRICT - Use hugepages of some size or return NULL + * GHP_COLOR - Use bytes wasted due to alignment to offset the buffer + * by a random cache line. 
This gives better average + * performance with many buffers + */ +typedef unsigned long ghr_t; +#define GHR_STRICT ((ghr_t)0x10000000U) +#define GHR_FALLBACK ((ghr_t)0x20000000U) +#define GHR_COLOR ((ghr_t)0x40000000U) +#define GHR_DEFAULT (GHR_FALLBACK|GHR_COLOR) + +#define GHR_MASK (GHR_FALLBACK|GHR_STRICT|GHR_COLOR) + +/* Allocation functions for regions backed by hugepages */ +void *get_hugepage_region(size_t len, ghr_t flags); +void free_hugepage_region(void *ptr); + +#endif /* _HUGETLBFS_H */ diff --git a/hugeutils.c b/hugeutils.c new file mode 100644 index 0000000..60488e8 --- /dev/null +++ b/hugeutils.c @@ -0,0 +1,1190 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LARGEFILE64_SOURCE /* Need this for statfs64 */ +#define _GNU_SOURCE +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libhugetlbfs_internal.h" +#include "hugetlbfs.h" + +struct libhugeopts_t __hugetlb_opts; + +static int hugepagesize_errno; /* = 0 */ + +#define MAX_HPAGE_SIZES 10 +static struct hpage_size hpage_sizes[MAX_HPAGE_SIZES]; +static int nr_hpage_sizes; +static int hpage_sizes_default_idx = -1; + +static long default_size; + +/********************************************************************/ +/* Internal functions */ +/********************************************************************/ + +/* + * Lookup the kernel default page size. + */ +long kernel_default_hugepage_size() +{ + if (default_size == 0) { + default_size = file_read_ulong(MEMINFO, "Hugepagesize:"); + default_size = size_to_smaller_unit(default_size); /* kB to B */ } + return default_size; +} +void kernel_default_hugepage_size_reset(void) +{ + default_size = 0; +} + +#define BUF_SZ 256 +#define MEMINFO_SIZE 2048 + +/* + * Convert a quantity in a given unit to the next smallest unit by + * multiplying the quantity by 1024 (eg. convert 1MB to 1024kB). + * If the conversion would overflow the variable, return ULONGLONG_MAX to + * signify the error. + */ +unsigned long long size_to_smaller_unit(unsigned long long size) +{ + if (size * 1024 < size) + return -1; + else + return size * 1024; +} + +/* + * Convert a page size string with an optional unit suffix into a page size + * in bytes. 
+ * + * On error, -1 is returned and errno is set appropriately: + * EINVAL - str could not be parsed or was not greater than zero + * EOVERFLOW - Overflow when converting from the specified units + */ +long parse_page_size(const char *str) +{ + char *pos; + long size; + + errno = 0; + size = strtol(str, &pos, 0); + /* Catch strtoul errors and sizes that overflow the native word size */ + if (errno || str == pos || size <= 0) { + if (errno == ERANGE) + errno = EOVERFLOW; + else + errno = EINVAL; + return -1; + } + + switch (*pos) { + case 'G': + case 'g': + size = size_to_smaller_unit(size); + case 'M': + case 'm': + size = size_to_smaller_unit(size); + case 'K': + case 'k': + size = size_to_smaller_unit(size); + } + + if (size < 0) + errno = EOVERFLOW; + return size; +} + +struct hugetlb_pool_counter_info_t { + char *meminfo_key; + char *sysfs_file; +}; + +static struct hugetlb_pool_counter_info_t hugetlb_counter_info[] = { + [HUGEPAGES_TOTAL] = { + .meminfo_key = "HugePages_Total:", + .sysfs_file = "nr_hugepages", + }, + [HUGEPAGES_TOTAL_MEMPOL] = { + .meminfo_key = "HugePages_Total:", + .sysfs_file = "nr_hugepages_mempolicy", + }, + [HUGEPAGES_FREE] = { + .meminfo_key = "HugePages_Free:", + .sysfs_file = "free_hugepages", + }, + [HUGEPAGES_RSVD] = { + .meminfo_key = "HugePages_Rsvd:", + .sysfs_file = "resv_hugepages", + }, + [HUGEPAGES_SURP] = { + .meminfo_key = "HugePages_Surp:", + .sysfs_file = "surplus_hugepages", + }, + [HUGEPAGES_OC] = { + .meminfo_key = NULL, + .sysfs_file = "nr_overcommit_hugepages" + }, +}; + +/* + * Read numeric data from raw and tagged kernel status files. Used to read + * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). 
+ */ +long file_read_ulong(char *file, const char *tag) +{ + int fd; + char buf[MEMINFO_SIZE]; + int len, readerr; + char *p, *q; + long val; + + fd = open(file, O_RDONLY); + if (fd < 0) { + ERROR("Couldn't open %s: %s\n", file, strerror(errno)); + return -1; + } + + len = read(fd, buf, sizeof(buf)); + readerr = errno; + close(fd); + if (len < 0) { + ERROR("Error reading %s: %s\n", file, strerror(readerr)); + return -1; + } + if (len == sizeof(buf)) { + ERROR("%s is too large\n", file); + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (! isspace(*q)) { + ERROR("Couldn't parse %s value\n", file); + return -1; + } + + return val; +} + +int file_write_ulong(char *file, unsigned long val) +{ + FILE *f; + int ret; + + f = fopen(file, "w"); + if (!f) { + ERROR("Couldn't open %s: %s\n", file, strerror(errno)); + return -1; + } + + ret = fprintf(f, "%lu", val); + fclose(f); + return ret > 0 ? 0 : -1; +} + + +/* + * Return the name of this executable, using buf as temporary space. + */ +#define MAX_EXE 4096 +static char *get_exe_name(char *buf, int size) +{ + char *p; + int fd; + ssize_t nread; + + buf[0] = 0; + fd = open("/proc/self/cmdline", O_RDONLY); + if (fd < 0) { + WARNING("Unable to open cmdline, no exe name\n"); + return buf; + } + nread = read(fd, buf, size-1); + close(fd); + + if (nread < 0) { + WARNING("Error %d reading cmdline, no exe name\n", errno); + return buf; + } + if (nread == 0) { + WARNING("Read zero bytes from cmdline, no exe name\n"); + return buf; + } + + buf[nread] = 0; /* make sure we're null terminated */ + /* + * Take advantage of cmdline being a series of null-terminated + * strings. The first string is the path to the executable in + * the form: + * + * /path/to/exe + * + * The exe name starts one character after the last '/'. 
+ */ + p = strrchr(buf, '/'); + if (!p) + return buf; + return p + 1; /* skip over "/" */ +} + + +/* + * Reads the contents of hugetlb environment variables and save their + * values for later use. + */ +void hugetlbfs_setup_env() +{ + char *env; + + __hugetlb_opts.min_copy = true; + + env = getenv("HUGETLB_VERBOSE"); + if (env) + __hugetlbfs_verbose = atoi(env); + + env = getenv("HUGETLB_DEBUG"); + if (env) { + __hugetlbfs_debug = true; + __hugetlbfs_verbose = VERBOSE_DEBUG; + } + + env = getenv("HUGETLB_RESTRICT_EXE"); + if (env) { + char *p, *tok, *exe, buf[MAX_EXE+1], restriction[MAX_EXE]; + int found = 0; + + exe = get_exe_name(buf, sizeof buf); + DEBUG("Found HUGETLB_RESTRICT_EXE, this exe is \"%s\"\n", exe); + strncpy(restriction, env, sizeof restriction); + restriction[sizeof(restriction)-1] = 0; + for (p = restriction; (tok = strtok(p, ":")) != NULL; p = NULL) { + DEBUG(" ...check exe match for \"%s\"\n", tok); + if (strcmp(tok, exe) == 0) { + found = 1; + DEBUG("exe match - libhugetlbfs is active for this exe\n"); + break; + } + } + if (!found) { + DEBUG("No exe match - libhugetlbfs is inactive for this exe\n"); + return; + } + } + + env = getenv("HUGETLB_NO_PREFAULT"); + if (env) + __hugetlbfs_prefault = false; + + __hugetlb_opts.share_path = getenv("HUGETLB_SHARE_PATH"); + __hugetlb_opts.elfmap = getenv("HUGETLB_ELFMAP"); + __hugetlb_opts.ld_preload = getenv("LD_PRELOAD"); + __hugetlb_opts.def_page_size = getenv("HUGETLB_DEFAULT_PAGE_SIZE"); + __hugetlb_opts.path = getenv("HUGETLB_PATH"); + __hugetlb_opts.features = getenv("HUGETLB_FEATURES"); + __hugetlb_opts.morecore = getenv("HUGETLB_MORECORE"); + __hugetlb_opts.heapbase = getenv("HUGETLB_MORECORE_HEAPBASE"); + + if (__hugetlb_opts.morecore) + __hugetlb_opts.thp_morecore = + (strcasecmp(__hugetlb_opts.morecore, "thp") == 0); + + if (__hugetlb_opts.thp_morecore && __hugetlb_opts.heapbase) { + DEBUG("Heapbase specified with THP for morecore, ignoring heapbase\n"); + __hugetlb_opts.heapbase = NULL; + } 
+ + env = getenv("HUGETLB_FORCE_ELFMAP"); + if (env && (strcasecmp(env, "yes") == 0)) + __hugetlb_opts.force_elfmap = 1; + + env = getenv("HUGETLB_MINIMAL_COPY"); + if (__hugetlb_opts.min_copy && env && (strcasecmp(env, "no") == 0)) { + INFO("HUGETLB_MINIMAL_COPY=%s, disabling filesz copy " + "optimization\n", env); + __hugetlb_opts.min_copy = false; + } + + env = getenv("HUGETLB_SHARE"); + if (env) + __hugetlb_opts.sharing = atoi(env); + + /* + * We have been seeing some unexpected behavior from malloc when + * heap shrinking is enabled, so heap shrinking is disabled by + * default. + * + * If malloc has been called successfully before setup_morecore, + * glibc will notice a gap between the previous top-of-heap and + * the new top-of-heap when it calls hugetlbfs_morecore. It treats + * this as a "foreign sbrk." Unfortunately, the "foreign sbrk" + * handling code will then immediately try to free the memory + * allocated by hugetlbfs_morecore! + * + * This behavior has been reported to the ptmalloc2 maintainer, + * along with a patch to correct the behavior. 
+ */ + env = getenv("HUGETLB_MORECORE_SHRINK"); + if (env && strcasecmp(env, "yes") == 0) + __hugetlb_opts.shrink_ok = true; + + /* Determine if shmget() calls should be overridden */ + env = getenv("HUGETLB_SHM"); + if (env && !strcasecmp(env, "yes")) + __hugetlb_opts.shm_enabled = true; + + /* Determine if all reservations should be avoided */ + env = getenv("HUGETLB_NO_RESERVE"); + if (env && !strcasecmp(env, "yes")) + __hugetlb_opts.no_reserve = true; +} + +void hugetlbfs_setup_kernel_page_size() +{ + long page_size = kernel_default_hugepage_size(); + + if (page_size <= 0) { + WARNING("Unable to find default kernel huge page size\n"); + return; + } + + INFO("Found pagesize %ld kB\n", page_size / 1024); + hpage_sizes[0].pagesize = page_size; + + nr_hpage_sizes = 1; +} + +void hugetlbfs_check_priv_resv() +{ + /* + * If the kernel supports MAP_PRIVATE reservations, we can skip + * prefaulting the huge pages we allocate since the kernel + * guarantees them. This can help NUMA performance quite a bit. + */ + if (hugetlbfs_test_feature(HUGETLB_FEATURE_PRIVATE_RESV) > 0) { + INFO("Kernel has MAP_PRIVATE reservations. Disabling " + "heap prefaulting.\n"); + __hugetlbfs_prefault = false; + } +} + +void hugetlbfs_check_safe_noreserve() +{ + /* + * Some kernels will trigger an OOM if MAP_NORESERVE is used and + * a huge page allocation fails. This is unfortunate so limit + * the user of NORESERVE where necessary + */ + if (__hugetlb_opts.no_reserve && + hugetlbfs_test_feature(HUGETLB_FEATURE_SAFE_NORESERVE) <= 0) { + INFO("Kernel is not safe for MAP_NORESERVE. Forcing " + "use of reservations.\n"); + __hugetlb_opts.no_reserve = false; + } +} + +void hugetlbfs_check_map_hugetlb() +{ +/* + * FIXME: MAP_HUGETLB has not been picked up by glibc so even though the + * kernel may support it, without the userspace mmap flag it cannot be + * used. This ifdef should be removed when the MAP_HUGETLB flag makes it + * into glibc. 
+ */ +#ifdef MAP_HUGETLB + /* + * Kernels after 2.6.32 support mmaping pseudo-anonymous regions + * backed by huge pages, use this feature for huge pages we + * don't intend to share. + */ + if (hugetlbfs_test_feature(HUGETLB_FEATURE_MAP_HUGETLB) > 0) { + INFO("Kernel supports MAP_HUGETLB\n"); + __hugetlb_opts.map_hugetlb = true; + } +#endif +} + +/* + * Pool counters are typically exposed in sysfs in modern kernels, the + * counters for the default page size are exposed in procfs in all kernels + * supporting hugepages. Given a specific counter (e.g. HUGEPAGES_RSVD) + * and a page size return both a filename and an optional tag to locate + * and extract this counter. + */ +static int select_pool_counter(unsigned int counter, unsigned long pagesize, + char *filename, char **key) +{ + long default_size; + char *meminfo_key; + char *sysfs_file; + + if (counter >= HUGEPAGES_MAX_COUNTERS) { + ERROR("Invalid counter specified\n"); + return -1; + } + + meminfo_key = hugetlb_counter_info[counter].meminfo_key; + sysfs_file = hugetlb_counter_info[counter].sysfs_file; + if (key) + *key = NULL; + + /* + * Get the meminfo page size. + * This could be made more efficient if utility functions were shared + * between libhugetlbfs and the test suite. For now we will just + * read /proc/meminfo. 
+ */ + default_size = kernel_default_hugepage_size(); + if (default_size < 0) { + ERROR("Cannot determine the default page size\n"); + return -1; + } + + /* If the user is dealing in the default page size, we can use /proc */ + if (pagesize == default_size) { + if (meminfo_key && key) { + strcpy(filename, MEMINFO); + *key = meminfo_key; + } else + sprintf(filename, PROC_HUGEPAGES_DIR "%s", sysfs_file); + } else /* Use the sysfs interface */ + sprintf(filename, SYSFS_HUGEPAGES_DIR "hugepages-%lukB/%s", + pagesize / 1024, sysfs_file); + return 0; +} + +static int hpage_size_to_index(unsigned long size) +{ + int i; + + for (i = 0; i < nr_hpage_sizes; i++) + if (hpage_sizes[i].pagesize == size) + return i; + return -1; +} + +void probe_default_hpage_size(void) +{ + long size; + int index; + int default_overrided; + + if (nr_hpage_sizes == 0) { + INFO("No configured huge page sizes\n"); + hpage_sizes_default_idx = -1; + return; + } + + /* + * Check if the user specified a default size, otherwise use the + * system default size as reported by /proc/meminfo. + */ + default_overrided = (__hugetlb_opts.def_page_size && + strlen(__hugetlb_opts.def_page_size) > 0); + if (default_overrided) + size = parse_page_size(__hugetlb_opts.def_page_size); + else { + size = kernel_default_hugepage_size(); + } + + if (size >= 0) { + index = hpage_size_to_index(size); + if (index >= 0) + hpage_sizes_default_idx = index; + else { + /* + * If the user specified HUGETLB_DEFAULT_PAGE_SIZE, + * then this situation will alter semantics and they + * should receive a WARNING. Otherwise, this detail + * is purely informational in nature. + */ + char msg[] = "No mount point found for default huge " \ + "page size. 
Using first available mount " + "point.\n"; + if (default_overrided) + WARNING("%s", msg); + else + INFO("%s", msg); + hpage_sizes_default_idx = 0; + } + } else { + ERROR("Unable to determine default huge page size\n"); + hpage_sizes_default_idx = -1; + } +} + +static void add_hugetlbfs_mount(char *path, int user_mount) +{ + int idx; + long size; + + if (strlen(path) > PATH_MAX) + return; + + if (!hugetlbfs_test_path(path)) { + WARNING("%s is not a hugetlbfs mount point, ignoring\n", path); + return; + } + + size = hugetlbfs_test_pagesize(path); + if (size < 0) { + WARNING("Unable to detect page size for path %s\n", path); + return; + } + + idx = hpage_size_to_index(size); + if (idx < 0) { + if (nr_hpage_sizes >= MAX_HPAGE_SIZES) { + WARNING("Maximum number of huge page sizes exceeded, " + "ignoring %lukB page size\n", size); + return; + } + + idx = nr_hpage_sizes; + hpage_sizes[nr_hpage_sizes++].pagesize = size; + } + + if (strlen(hpage_sizes[idx].mount)) { + if (user_mount) + WARNING("Mount point already defined for size %li, " + "ignoring %s\n", size, path); + return; + } + + strcpy(hpage_sizes[idx].mount, path); +} + +void debug_show_page_sizes(void) +{ + int i; + + INFO("Detected page sizes:\n"); + for (i = 0; i < nr_hpage_sizes; i++) + INFO(" Size: %li kB %s Mount: %s\n", + hpage_sizes[i].pagesize / 1024, + i == hpage_sizes_default_idx ? 
"(default)" : "", + hpage_sizes[i].mount); +} + +#define LINE_MAXLEN 2048 +static void find_mounts(void) +{ + int fd; + char path[PATH_MAX+1]; + char line[LINE_MAXLEN + 1]; + char *eol; + char *match; + char *end; + int bytes; + off_t offset; + + fd = open("/proc/mounts", O_RDONLY); + if (fd < 0) { + fd = open("/etc/mtab", O_RDONLY); + if (fd < 0) { + ERROR("Couldn't open /proc/mounts or /etc/mtab (%s)\n", + strerror(errno)); + return; + } + } + + while ((bytes = read(fd, line, LINE_MAXLEN)) > 0) { + line[LINE_MAXLEN] = '\0'; + eol = strchr(line, '\n'); + if (!eol) { + ERROR("Line too long when parsing mounts\n"); + break; + } + + /* + * Truncate the string to just one line and reset the file + * to begin reading at the start of the next line. + */ + *eol = '\0'; + offset = bytes - (eol + 1 - line); + lseek(fd, -offset, SEEK_CUR); + + /* Match only hugetlbfs filesystems. */ + match = strstr(line, " hugetlbfs "); + if (match) { + match = strchr(line, '/'); + if (!match) + continue; + end = strchr(match, ' '); + if (!end) + continue; + + strncpy(path, match, end - match); + path[end - match] = '\0'; + if ((hugetlbfs_test_path(path) == 1) && + !(access(path, R_OK | W_OK | X_OK))) + add_hugetlbfs_mount(path, 0); + } + } + close(fd); +} + +void setup_mounts(void) +{ + int do_scan = 1; + + /* If HUGETLB_PATH is set, only add mounts specified there */ + while (__hugetlb_opts.path) { + char path[PATH_MAX + 1]; + char *next = strchrnul(__hugetlb_opts.path, ':'); + + do_scan = 0; + if (next - __hugetlb_opts.path > PATH_MAX) { + ERROR("Path too long in HUGETLB_PATH -- " + "ignoring environment\n"); + break; + } + + strncpy(path, __hugetlb_opts.path, next - __hugetlb_opts.path); + path[next - __hugetlb_opts.path] = '\0'; + add_hugetlbfs_mount(path, 1); + + /* skip the ':' token */ + __hugetlb_opts.path = *next == '\0' ? 
NULL : next + 1; + } + + /* Then probe all mounted filesystems */ + if (do_scan) + find_mounts(); +} + +int get_pool_size(long size, struct hpage_pool *pool) +{ + long nr_over = 0; + long nr_used = 0; + long nr_surp = 0; + long nr_resv = 0; + long nr_static = 0; + + long it_used = -1; + long it_surp = -1; + long it_resv = -1; + + /* + * Pick up those values which are basically stable with respect to + * the admin; ie. only changed by them. + * + * nr_over may be negative if this kernel does not support overcommit + * in that case we will consider it always 0 and max will track min + * always. + */ + nr_over = get_huge_page_counter(size, HUGEPAGES_OC); + if (nr_over < 0) + nr_over = 0; + + /* Sample the volatile values until they are stable. */ + while (nr_used != it_used || nr_surp != it_surp || nr_resv != it_resv) { + nr_used = it_used; + nr_surp = it_surp; + nr_resv = it_resv; + + it_used = get_huge_page_counter(size, HUGEPAGES_TOTAL); + it_surp = get_huge_page_counter(size, HUGEPAGES_SURP); + it_resv = get_huge_page_counter(size, HUGEPAGES_RSVD); + } + if (nr_surp < 0) + nr_surp = 0; + if (nr_resv < 0) + nr_resv = 0; + + nr_static = nr_used - nr_surp; + + if (nr_static >= 0) { + DEBUG("pagesize<%ld> min<%ld> max<%ld> " + "in-use<%ld>\n", + size, nr_static, nr_static + nr_over, + nr_used); + pool->pagesize = size; + pool->minimum = nr_static; + pool->maximum = nr_static + nr_over; + pool->size = nr_used; + pool->is_default = 0; + + return 1; + } + + return 0; +} + +int hpool_sizes(struct hpage_pool *pools, int pcnt) +{ + long default_size; + int which = 0; + DIR *dir; + struct dirent *entry; + + default_size = kernel_default_hugepage_size(); + if (default_size >= 0 && which < pcnt) + if (get_pool_size(default_size, &pools[which])) { + pools[which].is_default = 1; + which++; + } + + dir = opendir(SYSFS_HUGEPAGES_DIR); + if (dir) { + while ((entry = readdir(dir))) { + char *name = entry->d_name; + long size; + + DEBUG("parsing<%s>\n", name); + if (strncmp(name, 
"hugepages-", 10) != 0) + continue; + name += 10; + + size = size_to_smaller_unit(atol(name)); + if (size < 0 || size == default_size) + continue; + + if (get_pool_size(size, &pools[which])) + which++; + } + closedir(dir); + } + + return (which < pcnt) ? which : -1; +} + +/* + * If we have a default page size then we support hugepages. + */ +int kernel_has_hugepages(void) +{ + long default_size = kernel_default_hugepage_size(); + if (default_size < 0) + return 0; + + return 1; +} + +/* + * If we can find the default page size, and if we can find an overcommit + * control for it then the kernel must support overcommit. + */ +int kernel_has_overcommit(void) +{ + long default_size = kernel_default_hugepage_size(); + if (default_size < 0) + return 0; + + if (get_huge_page_counter(default_size, HUGEPAGES_OC) < 0) + return 0; + + return 1; +} + +/********************************************************************/ +/* Library user visible functions */ +/********************************************************************/ + +/* + * NOTE: This function uses data that is initialized by + * setup_mounts() which is called during libhugetlbfs initialization. + * + * returns: + * on success, size of a huge page in number of bytes + * on failure, -1 + * errno set to ENOSYS if huge pages are not supported + * errno set to EOVERFLOW if huge page size would overflow return type + */ +long gethugepagesize(void) +{ + long hpage_size; + + /* Are huge pages available and have they been initialized? 
*/ + if (hpage_sizes_default_idx == -1) { + errno = hugepagesize_errno = ENOSYS; + return -1; + } + + errno = 0; + hpage_size = hpage_sizes[hpage_sizes_default_idx].pagesize; + return hpage_size; +} + +int gethugepagesizes(long pagesizes[], int n_elem) +{ + long default_size; + DIR *sysfs; + struct dirent *ent; + int nr_sizes = 0; + + if (n_elem < 0) { + errno = EINVAL; + return -1; + } + + if (n_elem > 0 && pagesizes == NULL) { + errno = EINVAL; + return -1; + } + + errno = 0; + + /* Get the system default size. */ + default_size = kernel_default_hugepage_size(); + if (default_size < 0) + return 0; + + if (pagesizes && (nr_sizes == n_elem)) + return nr_sizes; + if (pagesizes) + pagesizes[nr_sizes] = default_size; + nr_sizes++; + + /* + * Scan sysfs to look for other sizes. + * Non-existing dir is not an error, we got one size from /proc/meminfo. + */ + sysfs = opendir(SYSFS_HUGEPAGES_DIR); + if (!sysfs) { + if (errno == ENOENT) { + errno = 0; + return nr_sizes; + } else + return -1; + } + while ((ent = readdir(sysfs))) { + long size; + + if (strncmp(ent->d_name, "hugepages-", 10)) + continue; + + size = strtol(ent->d_name + 10, NULL, 10); + if (size == LONG_MIN || size == LONG_MAX) + continue; + size = size_to_smaller_unit(size); + + if (size < 0 || size == default_size) + continue; + if (pagesizes && (nr_sizes == n_elem)) + return nr_sizes; + if (pagesizes) + pagesizes[nr_sizes] = size; + nr_sizes++; + } + closedir(sysfs); + + return nr_sizes; +} + +int getpagesizes(long pagesizes[], int n_elem) +{ + int ret; + + if (n_elem < 0 || (n_elem > 0 && pagesizes == NULL)) { + errno = EINVAL; + return -1; + } + + /* Requests for sizing, we need one more slot than gethugepagesizes. */ + if (pagesizes == NULL && n_elem == 0) { + ret = gethugepagesizes(pagesizes, n_elem); + } else { + /* Install the base page size. 
*/ + if (pagesizes && n_elem == 0) + return 0; + if (pagesizes) + pagesizes[0] = sysconf(_SC_PAGESIZE); + + ret = gethugepagesizes(pagesizes + 1, n_elem - 1); + } + if (ret < 0) + return ret; + return ret + 1; +} + +int hugetlbfs_test_path(const char *mount) +{ + struct statfs64 sb; + int err; + + /* Bugs in the 32<->64 translation code in pre-2.6.15 kernels + * mean that plain statfs() returns bogus errors on hugetlbfs + * filesystems. Use statfs64() to work around. */ + err = statfs64(mount, &sb); + if (err) + return -1; + + return (sb.f_type == HUGETLBFS_MAGIC); +} + +/* Return the page size for the given mount point in bytes */ +long hugetlbfs_test_pagesize(const char *mount) +{ + struct statfs64 sb; + int err; + + err = statfs64(mount, &sb); + if (err) + return -1; + + if ((sb.f_bsize <= 0) || (sb.f_bsize > LONG_MAX)) + return -1; + + return sb.f_bsize; +} + +const char *hugetlbfs_find_path_for_size(long page_size) +{ + char *path; + int idx; + + idx = hpage_size_to_index(page_size); + if (idx >= 0) { + path = hpage_sizes[idx].mount; + if (strlen(path)) + return path; + } + return NULL; +} + +const char *hugetlbfs_find_path(void) +{ + long hpage_size = gethugepagesize(); + if (hpage_size > 0) + return hugetlbfs_find_path_for_size(hpage_size); + else + return NULL; +} + +int hugetlbfs_unlinked_fd_for_size(long page_size) +{ + const char *path; + char name[PATH_MAX+1]; + int fd; + + path = hugetlbfs_find_path_for_size(page_size); + if (!path) + return -1; + + name[sizeof(name)-1] = '\0'; + + strcpy(name, path); + strncat(name, "/libhugetlbfs.tmp.XXXXXX", sizeof(name)-1); + /* FIXME: deal with overflows */ + + fd = mkstemp64(name); + + if (fd < 0) { + ERROR("mkstemp() failed: %s\n", strerror(errno)); + return -1; + } + + unlink(name); + + return fd; +} + +int hugetlbfs_unlinked_fd(void) +{ + long hpage_size = gethugepagesize(); + if (hpage_size > 0) + return hugetlbfs_unlinked_fd_for_size(hpage_size); + else + return -1; +} + +#define IOV_LEN 64 +int 
hugetlbfs_prefault(void *addr, size_t length) +{ + size_t offset; + struct iovec iov[IOV_LEN]; + int ret; + int i; + int fd; + + if (!__hugetlbfs_prefault) + return 0; + + /* + * The NUMA users of libhugetlbfs' malloc feature are + * expected to use the numactl program to specify an + * appropriate policy for hugepage allocation + * + * Use readv(2) to instantiate the hugepages unless HUGETLB_NO_PREFAULT + * is set. If we instead returned a hugepage mapping with insufficient + * hugepages, the VM system would kill the process when the + * process tried to access the missing memory. + * + * The value of this environment variable is read during library + * initialisation and sets __hugetlbfs_prefault accordingly. If + * prefaulting is enabled and we can't get all that were requested, + * -ENOMEM is returned. The caller is expected to release the entire + * mapping and optionally it may recover by mapping base pages instead. + */ + + fd = open("/dev/zero", O_RDONLY); + if (fd < 0) { + ERROR("Failed to open /dev/zero for reading\n"); + return -ENOMEM; + } + + for (offset = 0; offset < length; ) { + for (i = 0; i < IOV_LEN && offset < length; i++) { + iov[i].iov_base = addr + offset; + iov[i].iov_len = 1; + offset += gethugepagesize(); + } + ret = readv(fd, iov, i); + if (ret != i) { + DEBUG("Got %d of %d requested; err=%d\n", ret, + i, ret < 0 ? 
errno : 0); + WARNING("Failed to reserve %ld huge pages " + "for new region\n", + length / gethugepagesize()); + close(fd); + return -ENOMEM; + } + } + + close(fd); + return 0; +} + +long get_huge_page_counter(long pagesize, unsigned int counter) +{ + char file[PATH_MAX+1]; + char *key; + + if (select_pool_counter(counter, pagesize, file, &key)) + return -1; + + if (access(file, O_RDONLY)) + return -1; + + return file_read_ulong(file, key); +} + +int set_huge_page_counter(long pagesize, unsigned int counter, + unsigned long val) +{ + char file[PATH_MAX+1]; + + if (select_pool_counter(counter, pagesize, file, NULL)) + return -1; + + return file_write_ulong(file, val); +} + +int set_nr_hugepages(long pagesize, unsigned long val) +{ + return set_huge_page_counter(pagesize, HUGEPAGES_TOTAL, val); +} + +int set_nr_overcommit_hugepages(long pagesize, unsigned long val) +{ + DEBUG("setting HUGEPAGES_OC to %ld\n", val); + return set_huge_page_counter(pagesize, HUGEPAGES_OC, val); +} + +long read_nr_overcommit(long page_size) +{ + if (!kernel_has_overcommit()) + return -1; + + return get_huge_page_counter(page_size, HUGEPAGES_OC); +} + +void restore_overcommit_pages(long page_size, long oc_pool) +{ + if (!kernel_has_overcommit()) + return; + + set_nr_overcommit_hugepages(page_size, oc_pool); +} + +/********************************************************************/ +/* Library user visible DIAGNOSES/DEBUGGING ONLY functions */ +/********************************************************************/ + +#define MAPS_BUF_SZ 4096 +long dump_proc_pid_maps() +{ + FILE *f; + char line[MAPS_BUF_SZ]; + size_t ret; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + ERROR("Failed to open /proc/self/maps\n"); + return -1; + } + + while (1) { + ret = fread(line, sizeof(char), MAPS_BUF_SZ, f); + if (ret < 0) { + ERROR("Failed to read /proc/self/maps\n"); + return -1; + } + if (ret == 0) + break; + ret = fwrite(line, sizeof(char), ret, stderr); + if (ret < 0) { + ERROR("Failed to write 
/proc/self/maps to stderr\n"); + return -1; + } + } + + fclose(f); + return 0; +} + +long read_meminfo(const char *tag) +{ + return file_read_ulong(MEMINFO, tag); +} diff --git a/init.c b/init.c new file mode 100644 index 0000000..b912448 --- /dev/null +++ b/init.c @@ -0,0 +1,39 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Nishanth Aravamudan, IBM Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libhugetlbfs_internal.h" + +static void __attribute__ ((constructor)) setup_libhugetlbfs(void) +{ + hugetlbfs_setup_env(); + hugetlbfs_setup_debug(); + hugetlbfs_setup_kernel_page_size(); + setup_mounts(); + probe_default_hpage_size(); + if (__hugetlbfs_debug) + debug_show_page_sizes(); + setup_features(); + hugetlbfs_check_priv_resv(); + hugetlbfs_check_safe_noreserve(); + hugetlbfs_check_map_hugetlb(); +#ifndef NO_ELFLINK + hugetlbfs_setup_elflink(); +#endif + hugetlbfs_setup_morecore(); +} diff --git a/init_privutils.c b/init_privutils.c new file mode 100644 index 0000000..f32d83b --- /dev/null +++ b/init_privutils.c @@ -0,0 +1,27 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Nishanth Aravamudan, IBM Corporation + * + * This library is free software; you can redistribute it and/or + * modify 
it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libhugetlbfs_internal.h" + +static void __attribute__ ((constructor)) setup_libhugetlbfs(void) +{ + hugetlbfs_setup_debug(); + setup_mounts(); + setup_features(); +} diff --git a/kernel-features.c b/kernel-features.c new file mode 100644 index 0000000..c93aabd --- /dev/null +++ b/kernel-features.c @@ -0,0 +1,303 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE /* For strchrnul */ + +#include +#include +#include +#include +#include +#include +#include +#include "kernel-features.h" +#include "hugetlbfs.h" +#include "libhugetlbfs_privutils.h" +#include "libhugetlbfs_internal.h" +#include "libhugetlbfs_debug.h" + +static struct kernel_version running_kernel_version; + +/* This mask should always be 32 bits, regardless of the platform word size */ +static unsigned int feature_mask; + +static struct feature kernel_features[] = { + [HUGETLB_FEATURE_PRIVATE_RESV] = { + .name = "private_reservations", + .required_version = "2.6.27-rc1", + }, + [HUGETLB_FEATURE_SAFE_NORESERVE] = { + .name = "noreserve_safe", + .required_version = "2.6.34", + }, + [HUGETLB_FEATURE_MAP_HUGETLB] = { + .name = "map_hugetlb", + .required_version = "2.6.32", + } +}; + +static void debug_kernel_version(void) +{ + struct kernel_version *ver = &running_kernel_version; + + INFO("Parsed kernel version: [%u] . [%u] . [%u] ", + ver->major, ver->minor, ver->release); + if (ver->post) + INFO_CONT(" [post-release: %u]\n", ver->post); + else if (ver->pre) + INFO_CONT(" [pre-release: %u]\n", ver->pre); + else + INFO_CONT("\n"); +} + +static int str_to_ver(const char *str, struct kernel_version *ver) +{ + char *start; + char *end; + + /* Clear out version struct */ + ver->major = ver->minor = ver->release = ver->post = ver->pre = 0; + + /* + * The kernel always starts x.y.z + * + * Note: strtol is used in place of sscanf because when heap override is + * used this step happens before the _morecore replacement and sscanf + * does an internal heap allocation. 
This mean that the next allocation + * from the heap would be on small pages until the first block allocated + * by _morecore is exhausted + */ + errno = 0; + ver->major = strtol(str, &end, 10); + if (!ver->major && errno == EINVAL) { + ERROR("Unable to determine base kernel version: %s\n", + strerror(errno)); + return -1; + } + + start = end + 1; + errno = 0; + ver->minor = strtol(start, &end, 10); + if (!ver->minor && errno == EINVAL) { + ERROR("Unable to determine base kernel version: %s\n", + strerror(errno)); + return -1; + } + + start = end + 1; + errno = 0; + ver->release = strtol(start, &end, 10); + if (!ver->release && errno == EINVAL) { + ERROR("Unable to determine base kernel version: %s\n", + strerror(errno)); + return -1; + } + + /* Try to match a post/stable version */ + start = end + 1; + if (*end == '.') { + ver->post = strtol(start, &end, 10); + if (!ver->post && errno == EINVAL) + return 0; + } + + /* Try to match a preN/rcN version */ + start = end + 1; + if (*end == '-') { + if (*start == 'r' && *(start + 1) == 'c') + start += 2; + else if (*start == 'p' && + *(start + 1) == 'r' && + *(start + 2) == 'e') + start += 3; + else { + /* + * For now we ignore any extraversions besides + * pre and rc versions and treat them as equal + * to the base version. + */ + return 0; + } + + ver->pre = strtol(start, &end, 10); + } + + return 0; +} + +static int int_cmp(int a, int b) +{ + if (a < b) + return -1; + if (a > b) + return 1; + else + return 0; +} + +/* + * Pre-release kernels have the following compare rules: + * X.Y.(Z - 1) < X.Y.Z-rcN < X.Y.X + * This order can be enforced by simply decrementing the release (for + * comparison purposes) when there is a pre/rc modifier in effect. 
+ */ +static int ver_cmp_release(struct kernel_version *ver) +{ + if (ver->pre) + return ver->release - 1; + else + return ver->release; +} + +static int ver_cmp(struct kernel_version *a, struct kernel_version *b) +{ + int ret, a_release, b_release; + + if ((ret = int_cmp(a->major, b->major)) != 0) + return ret; + + if ((ret = int_cmp(a->minor, b->minor)) != 0) + return ret; + + a_release = ver_cmp_release(a); + b_release = ver_cmp_release(b); + if ((ret = int_cmp(a_release, b_release)) != 0) + return ret; + + if ((ret = int_cmp(a->post, b->post)) != 0) + return ret; + + if ((ret = int_cmp(a->pre, b->pre)) != 0) + return ret; + + /* We ignore forks (such as -mm and -mjb) */ + return 0; +} + +int test_compare_kver(const char *a, const char *b) +{ + struct kernel_version ka, kb; + + if (str_to_ver(a, &ka) < 0) + return -EINVAL; + if (str_to_ver(b, &kb) < 0) + return -EINVAL; + return ver_cmp(&ka, &kb); +} + +int hugetlbfs_test_feature(int feature_code) +{ + if (feature_code >= HUGETLB_FEATURE_NR) { + ERROR("hugetlbfs_test_feature: invalid feature code\n"); + return -EINVAL; + } + return feature_mask & (1 << feature_code); +} + +static void print_valid_features(void) +{ + int i; + + ERROR("HUGETLB_FEATURES=\"[,] ...\"\n"); + ERROR_CONT("Valid features:\n"); + for (i = 0; i < HUGETLB_FEATURE_NR; i++) + ERROR_CONT("\t%s, no_%s\n", kernel_features[i].name, + kernel_features[i].name); +} + +static int check_features_env_valid(const char *env) +{ + const char *pos = env; + int i; + + while (pos && *pos != '\0') { + int match = 0; + char *next; + + if (*pos == ',') + pos++; + next = strchrnul(pos, ','); + if (strncmp(pos, "no_", 3) == 0) + pos += 3; + + for (i = 0; i < HUGETLB_FEATURE_NR; i++) { + char *name = kernel_features[i].name; + if (strncmp(pos, name, next - pos) == 0) { + match = 1; + break; + } + } + if (!match) { + print_valid_features(); + return -1; + } + pos = next; + } + return 0; +} + +void setup_features() +{ + struct utsname u; + int i; + + if (uname(&u)) 
{ + ERROR("Getting kernel version failed: %s\n", strerror(errno)); + return; + } + + str_to_ver(u.release, &running_kernel_version); + debug_kernel_version(); + + /* Check if the user has overrided any features */ + if (__hugetlb_opts.features && + check_features_env_valid(__hugetlb_opts.features) == -1) { + ERROR("HUGETLB_FEATURES was invalid -- ignoring.\n"); + __hugetlb_opts.features = NULL; + } + + for (i = 0; i < HUGETLB_FEATURE_NR; i++) { + struct kernel_version ver; + char *name = kernel_features[i].name; + char *pos; + + str_to_ver(kernel_features[i].required_version, &ver); + + /* Has the user overridden feature detection? */ + if (__hugetlb_opts.features && + (pos = strstr(__hugetlb_opts.features, name))) { + INFO("Overriding feature %s: ", name); + /* If feature is preceeded by 'no_' then turn it off */ + if (((pos - 3) >= __hugetlb_opts.features) && + !strncmp(pos - 3, "no_", 3)) + INFO_CONT("no\n"); + else { + INFO_CONT("yes\n"); + feature_mask |= (1UL << i); + } + continue; + } + + /* Is the running kernel version newer? */ + if (ver_cmp(&running_kernel_version, &ver) >= 0) { + INFO("Feature %s is present in this kernel\n", + kernel_features[i].name); + feature_mask |= (1UL << i); + } + } +} diff --git a/kernel-features.h b/kernel-features.h new file mode 100644 index 0000000..e1b6ca9 --- /dev/null +++ b/kernel-features.h @@ -0,0 +1,30 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +struct kernel_version { + unsigned int major; + unsigned int minor; + unsigned int release; + unsigned int post; + unsigned int pre; +}; + +struct feature { + char *name; + char *required_version; +}; diff --git a/ld.hugetlbfs b/ld.hugetlbfs new file mode 100755 index 0000000..388f7b4 --- /dev/null +++ b/ld.hugetlbfs @@ -0,0 +1,130 @@ +#! /bin/bash + +# Paranoid check to make sure we don't reinvoke ourselves, effectively +# making a fork()bomb +if [ -n "$LD_HUGETLBFS_RECURSION" ]; then + exit 99 +fi +export LD_HUGETLBFS_RECURSION=1 + +### SET DEFAULT LDSCRIPT PATH HERE ### +if [ -z "$HUGETLB_LDSCRIPT_PATH" ]; then + # Assume this script is running from the libhugetlbfs source tree, + # and look for the ldscripts accordingly + HUGETLB_LDSCRIPT_PATH=$(dirname $(readlink $0))/ldscripts +fi + +### SET CUSTOM_LDSCRIPTS HERE ### +if [ -z "$CUSTOM_LDSCRIPTS" ]; then + # Assume this script is running from the libhugetlbfs source tree, + # and set CUSTOM_LDSCRIPTS to default "yes" + CUSTOM_LDSCRIPTS="yes" +fi + +# Try to figure out what's the underlying linker to invoke +if [ -z "$LD" ]; then + for x in $(which -a ld); do + if [ "$x" != "$0" ]; then + LD="$x" + break + fi + done +fi + +i=0 +while [ -n "$1" ]; do + arg="$1" + case "$arg" in + -m*) + EMU="${arg#-m}" + args[$i]="$arg" + i=$[i+1] + if [ -z "$EMU" ]; then + shift + EMU="$1" + args[$i]="$1" + i=$[i+1] + fi + ;; + --hugetlbfs-link=*) + if [ -z "$HUGETLB_DEPRECATED_LINK" ]; then + echo -n "ld.hugetlbfs: --hugetlbfs-link is deprecated. " 1>&2 + echo "Migrate to --hugetlbfs-align." 
1>&2 + fi + HTLB_LINK="${arg#--hugetlbfs-link=}" + ;; + --hugetlbfs-script-path=*) + HUGETLB_LDSCRIPT_PATH="${arg#--hugetlbfs-script-path=}" + ;; + --hugetlbfs-align) + HTLB_ALIGN="slice" + ;; + --) + args=("${args[@]}" "$@") + break + ;; + *) + args[$i]="$arg" + i=$[i+1] + ;; + esac + shift +done + +if [ -n "$HTLB_LINK" ]; then + if [ "$CUSTOM_LDSCRIPTS" == "no" ]; then + echo -n "ld.hugetlbfs: --hugetlbfs-link is not supported on this " 1>&2 + echo "platform. Use --hugetlbfs-align instead." 1>&2 + fi + HTLB_ALIGN="" # --hugetlbfs-link overrides --hugetlbfs-align + LDSCRIPT="$EMU.x$HTLB_LINK" + HTLBOPTS="-T${HUGETLB_LDSCRIPT_PATH}/${LDSCRIPT}" +fi + +# if -m is not present on command line +if [ -z "$EMU" ]; then + if [ -n "$LDEMULATION" ]; then + # try env. variable + EMU="$LDEMULATION" + else + # pick first supported + EMU="$(ld -V | sed -n '/Supported emulations/{n;p}' | tr -d ' ')" + fi +fi + +# if -m is not present on command line +if [ -z "$EMU" ]; then + if [ -n "$LDEMULATION" ]; then + # try env. variable + EMU="$LDEMULATION" + else + # pick first supported + EMU="$(ld -V | sed -n '/Supported emulations/{n;p}' | tr -d ' ')" + fi +fi + +MB=$((1024*1024)) +case "$EMU" in +elf32ppclinux|elf64ppc) HPAGE_SIZE=$((16*$MB)) SLICE_SIZE=$((256*$MB)) ;; +elf64lppc) HPAGE_SIZE=$((16*$MB)) SLICE_SIZE=$((256*$MB)) ;; +elf_i386|elf_x86_64) HPAGE_SIZE=$((4*$MB)) SLICE_SIZE=$HPAGE_SIZE ;; +elf_s390|elf64_s390) HPAGE_SIZE=$((1*$MB)) SLICE_SIZE=$HPAGE_SIZE ;; +armelf*_linux_eabi|aarch64elf*|aarch64linux*) + hpage_kb=$(cat /proc/meminfo | grep Hugepagesize: | awk '{print $2}') + HPAGE_SIZE=$((hpage_kb * 1024)) + SLICE_SIZE=$HPAGE_SIZE ;; +esac + +if [ "$HTLB_ALIGN" == "slice" ]; then + HTLBOPTS="-zcommon-page-size=$SLICE_SIZE -zmax-page-size=$SLICE_SIZE" + HTLBOPTS="$HTLBOPTS -lhugetlbfs" + + # targeting the ARM platform one needs to explicitly set the text segment offset + # otherwise it will be NULL. 
+ case "$EMU" in + armelf*_linux_eabi|aarch64elf*|aarch64linux*) HTLBOPTS="$HTLBOPTS -Ttext-segment=$SLICE_SIZE" ;; + elf_i386) HTLBOPTS="$HTLBOPTS -Ttext-segment=0x08000000" ;; + esac +fi + +${LD} "${args[@]}" ${HTLBOPTS} diff --git a/ldscripts/elf32ppclinux.xB b/ldscripts/elf32ppclinux.xB new file mode 100644 index 0000000..28ad88d --- /dev/null +++ b/ldscripts/elf32ppclinux.xB @@ -0,0 +1,254 @@ +/* Link script for normal executables with BSS in hugepages */ +OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", + "elf32-powerpc") +OUTPUT_ARCH(powerpc:common) +ENTRY(_start) +SEARCH_DIR("/usr/powerpc-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + text PT_LOAD FILEHDR PHDRS ; + data PT_LOAD ; + htlb PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. */ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x10000000; . 
= 0x10000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :text :interp + .note.SuSE : { *(.note.SuSE) } :text :note + .note.ABI-tag : { *(.note.ABI-tag) } :text :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :text :note + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } :text + .dynstr : { *(.dynstr) } :text + .gnu.version : { *(.gnu.version) } :text + .gnu.version_d : { *(.gnu.version_d) } :text + .gnu.version_r : { *(.gnu.version_r) } :text + .rel.init : { *(.rel.init) } :text + .rela.init : { *(.rela.init) } :text + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :text + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :text + .rel.fini : { *(.rel.fini) } :text + .rela.fini : { *(.rela.fini) } :text + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :text + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :text + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :text + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :text + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :text + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :text + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :text + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :text + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :text + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :text + .rel.ctors : { *(.rel.ctors) } :text + .rela.ctors : { *(.rela.ctors) } :text + .rel.dtors : { *(.rel.dtors) } :text + .rela.dtors : { *(.rela.dtors) } :text + .rel.got : { *(.rel.got) } :text + .rela.got : { *(.rela.got) } :text + .rela.got1 : { *(.rela.got1) } :text + .rela.got2 : { *(.rela.got2) } :text + .rel.sdata : { *(.rel.sdata .rel.sdata.* .rel.gnu.linkonce.s.*) } :text + .rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) } :text + .rel.sbss : { *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*) } 
:text + .rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) } :text + .rel.sdata2 : { *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*) } :text + .rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) } :text + .rel.sbss2 : { *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*) } :text + .rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) } :text + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :text + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :text + .rel.plt : { *(.rel.plt) } :text + .rela.plt : { *(.rela.plt) } :text + .init : + { + KEEP (*(.init)) + } :text =0 + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.glink) + } :text =0 + .fini : + { + KEEP (*(.fini)) + } :text =0 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :text + .rodata1 : { *(.rodata1) } :text + .sdata2 : + { + PROVIDE (_SDA2_BASE_ = 32768); + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } :text + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } :text + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr +/* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :text */ +/* .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } :text */ + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN (0x10000) - ((0x10000 - .) & (0x10000 - 1)); . 
= DATA_SEGMENT_ALIGN (0x10000, 0x1000); + /* Exception handling */ + .eh_frame : /*ONLY_IF_RW*/ { KEEP (*(.eh_frame)) } :data + .gcc_except_table : /*ONLY_IF_RW*/ { *(.gcc_except_table .gcc_except_table.*) } :data + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :data + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :data + .preinit_array : + { + PROVIDE /*_HIDDEN*/ (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE /*_HIDDEN*/ (__preinit_array_end = .); + } :data + .init_array : + { + PROVIDE /*_HIDDEN*/ (__init_array_start = .); + KEEP (*(SORT(.init_array.*))) + KEEP (*(.init_array)) + PROVIDE /*_HIDDEN*/ (__init_array_end = .); + } :data + .fini_array : + { + PROVIDE /*_HIDDEN*/ (__fini_array_start = .); + KEEP (*(.fini_array)) + KEEP (*(SORT(.fini_array.*))) + PROVIDE /*_HIDDEN*/ (__fini_array_end = .); + } :data + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :data + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :data + .jcr : { KEEP (*(.jcr)) } :data + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :data + .got1 : { *(.got1) } :data + .got2 : { *(.got2) } :data + .dynamic : { *(.dynamic) } :dynamic :data +/* .got : SPECIAL { *(.got) } :data*/ +/* . 
= DATA_SEGMENT_RELRO_END (0, .);*/ +/* .plt : SPECIAL { *(.plt) } :data*/ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :data + .data1 : { *(.data1) } :data + .got : /*SPECIAL*/ { *(.got) } :data + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + PROVIDE (_SDA_BASE_ = 32768); + *(.sdata .sdata.* .gnu.linkonce.s.*) + } :data + _edata = .; PROVIDE (edata = .); + .plt : /*SPECIAL*/ { *(.plt) } :data + . = ALIGN(32 / 8); + . = ALIGN(32 / 8); + . = DATA_SEGMENT_END (.); + /* Hugepage area */ + /* Saving hugepages is more important than saving executable size, so + * we don't attempt to maintain congruence here */ + . = ALIGN(0x10000000); /* Align to next 256MB segment */ + /* HACK: workaround fact that kernel may not cope with segments with zero + * filesize */ + .hugetlb.data : { LONG(1) } :htlb + __bss_start = .; + .sbss : + { + PROVIDE (__sbss_start = .); PROVIDE (___sbss_start = .); + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + PROVIDE (__sbss_end = .); PROVIDE (___sbss_end = .); + } :htlb + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* + * Align here to ensure that the .bss section occupies space up to + * _end. Additionally (for huge pages) align to a segment boundary. + * This ensures that no normal page mappings will be created in this + * segment (after the bss) which could interfere with remapping. + */ + . = ALIGN(256*1024*1024); + } :htlb + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. 
+ Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.fixup) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/ldscripts/elf32ppclinux.xBDT b/ldscripts/elf32ppclinux.xBDT new file mode 100644 index 0000000..497882b --- /dev/null +++ b/ldscripts/elf32ppclinux.xBDT @@ -0,0 +1,245 @@ +/* Linker script for normal executables with text data and BSS in hugepages */ +OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", + "elf32-powerpc") +OUTPUT_ARCH(powerpc:common) +ENTRY(_start) +SEARCH_DIR("/usr/powerpc-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + htext PT_LOAD FILEHDR PHDRS FLAGS (0x00100005); + hdata PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. 
*/ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x10000000; . = 0x10000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :htext :interp + .note.SuSE : { *(.note.SuSE) } :htext :note + .note.ABI-tag : { *(.note.ABI-tag) } :htext :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :htext :note + .hash : { *(.hash) } :htext + .dynsym : { *(.dynsym) } :htext + .dynstr : { *(.dynstr) } :htext + .gnu.version : { *(.gnu.version) } :htext + .gnu.version_d : { *(.gnu.version_d) } :htext + .gnu.version_r : { *(.gnu.version_r) } :htext + .rel.init : { *(.rel.init) } :htext + .rela.init : { *(.rela.init) } :htext + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :htext + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :htext + .rel.fini : { *(.rel.fini) } :htext + .rela.fini : { *(.rela.fini) } :htext + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :htext + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :htext + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :htext + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :htext + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :htext + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :htext + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :htext + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :htext + .rel.ctors : { *(.rel.ctors) } :htext + .rela.ctors : { *(.rela.ctors) } :htext + .rel.dtors : { *(.rel.dtors) } :htext + .rela.dtors : { *(.rela.dtors) } :htext + .rel.got : { *(.rel.got) } :htext + .rela.got : { *(.rela.got) } :htext + .rela.got1 : { *(.rela.got1) } :htext + .rela.got2 : { *(.rela.got2) } :htext + .rel.sdata : { *(.rel.sdata 
.rel.sdata.* .rel.gnu.linkonce.s.*) } :htext + .rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) } :htext + .rel.sbss : { *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*) } :htext + .rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) } :htext + .rel.sdata2 : { *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*) } :htext + .rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) } :htext + .rel.sbss2 : { *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*) } :htext + .rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) } :htext + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :htext + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :htext + .rel.plt : { *(.rel.plt) } :htext + .rela.plt : { *(.rela.plt) } :htext + .init : + { + KEEP (*(.init)) + } :htext =0 + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.glink) + } :htext =0 + .fini : + { + KEEP (*(.fini)) + } :htext =0 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :htext + .rodata1 : { *(.rodata1) } :htext + .sdata2 : + { + PROVIDE (_SDA2_BASE_ = 32768); + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } :htext + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } :htext + .eh_frame_hdr : { *(.eh_frame_hdr) } :htext :eh_frame_hdr +/* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :htext */ +/* .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } :htext */ + + /* We don't maintain address congruence here, because saving + * hugepages is more important than saving executable size. */ + /* Just move to the very next hugepage, rather than using a guard + * page, because for ppc32 binaries we can't separate the text and + * PLT by >32MB */ + . 
= ALIGN (0x1000000); + /* Exception handling */ + .eh_frame : /*ONLY_IF_RW*/ { KEEP (*(.eh_frame)) } :hdata + .gcc_except_table : /*ONLY_IF_RW*/ { *(.gcc_except_table .gcc_except_table.*) } :hdata + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :hdata + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :hdata + .preinit_array : + { + PROVIDE /*_HIDDEN*/ (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE /*_HIDDEN*/ (__preinit_array_end = .); + } :hdata + .init_array : + { + PROVIDE /*_HIDDEN*/ (__init_array_start = .); + KEEP (*(SORT(.init_array.*))) + KEEP (*(.init_array)) + PROVIDE /*_HIDDEN*/ (__init_array_end = .); + } :hdata + .fini_array : + { + PROVIDE /*_HIDDEN*/ (__fini_array_start = .); + KEEP (*(.fini_array)) + KEEP (*(SORT(.fini_array.*))) + PROVIDE /*_HIDDEN*/ (__fini_array_end = .); + } :hdata + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :hdata + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :hdata + .jcr : { KEEP (*(.jcr)) } :hdata + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :hdata + .got1 : { *(.got1) } :hdata + .got2 : { *(.got2) } :hdata + .dynamic : { *(.dynamic) } :dynamic :hdata + .got : { *(.got.plt .got) } :hdata +/* . 
= DATA_SEGMENT_RELRO_END (0, .); */ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :hdata + .data1 : { *(.data1) } :hdata + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + PROVIDE (_SDA_BASE_ = 32768); + *(.sdata .sdata.* .gnu.linkonce.s.*) + } :hdata + _edata = .; PROVIDE (edata = .); + __bss_start = .; + .sbss : + { + PROVIDE (__sbss_start = .); PROVIDE (___sbss_start = .); + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + PROVIDE (__sbss_end = .); PROVIDE (___sbss_end = .); + } :hdata + .plt : { *(.plt) } :hdata + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* + * Align here to ensure that the .bss section occupies space up to + * _end. Additionally (for huge pages) align to a segment boundary. + * This ensures that no normal page mappings will be created in this + * segment (after the bss) which could interfere with remapping. + */ + . = ALIGN(256*1024*1024); + } :hdata + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.fixup) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/ldscripts/elf64ppc.xB b/ldscripts/elf64ppc.xB new file mode 100644 index 0000000..1a9c1ab --- /dev/null +++ b/ldscripts/elf64ppc.xB @@ -0,0 +1,245 @@ +/* Linker script for normal executables with BSS in hugepages */ +OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", + "elf64-powerpc") +OUTPUT_ARCH(powerpc:common64) +ENTRY(_start) +SEARCH_DIR("/usr/powerpc64-linux-gnu/lib64"); SEARCH_DIR("/usr/local/lib64"); SEARCH_DIR("/lib64"); SEARCH_DIR("/usr/lib64"); SEARCH_DIR("/usr/powerpc64-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + text PT_LOAD FILEHDR PHDRS ; + data PT_LOAD ; + htlb PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. 
*/ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x10000000; . = 0x10000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :text :interp + .note.SuSE : { *(.note.SuSE) } :text :note + .note.ABI-tag : { *(.note.ABI-tag) } :text :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :text :note + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + .rel.init : { *(.rel.init) } + .rela.init : { *(.rela.init) } + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } + .rel.fini : { *(.rel.fini) } + .rela.fini : { *(.rela.fini) } + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } + .rel.ctors : { *(.rel.ctors) } + .rela.ctors : { *(.rela.ctors) } + .rel.dtors : { *(.rel.dtors) } + .rela.dtors : { *(.rela.dtors) } + .rel.got : { *(.rel.got) } + .rela.got : { *(.rela.got) } + .rela.toc : { *(.rela.toc) } + .rel.sdata : { *(.rel.sdata .rel.sdata.* .rel.gnu.linkonce.s.*) } + .rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) } + .rel.sbss : { *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*) } + .rela.sbss : { *(.rela.sbss .rela.sbss.* 
.rela.gnu.linkonce.sb.*) } + .rel.sdata2 : { *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*) } + .rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) } + .rel.sbss2 : { *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*) } + .rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) } + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } + .rel.plt : { *(.rel.plt) } + .rela.plt : { *(.rela.plt) } + .rela.tocbss : { *(.rela.tocbss) } + .init : + { + KEEP (*(.init)) + } =0x60000000 + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.sfpr .glink) + } =0x60000000 + .fini : + { + KEEP (*(.fini)) + } =0x60000000 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + .sdata2 : + { + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr +/* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) }*/ +/* .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }*/ + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN (0x10000) - ((0x10000 - .) & (0x10000 - 1)); . 
= DATA_SEGMENT_ALIGN (0x10000, 0x1000); + /* Exception handling */ + .eh_frame : /*ONLY_IF_RW*/ { KEEP (*(.eh_frame)) } :data + .gcc_except_table : /*ONLY_IF_RW*/ { *(.gcc_except_table .gcc_except_table.*) } + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } + .preinit_array : + { + PROVIDE /*_HIDDEN*/ (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE /*_HIDDEN*/ (__preinit_array_end = .); + } + .init_array : + { + PROVIDE /*_HIDDEN*/ (__init_array_start = .); + KEEP (*(SORT(.init_array.*))) + KEEP (*(.init_array)) + PROVIDE /*_HIDDEN*/ (__init_array_end = .); + } + .fini_array : + { + PROVIDE /*_HIDDEN*/ (__fini_array_start = .); + KEEP (*(.fini_array)) + KEEP (*(SORT(.fini_array.*))) + PROVIDE /*_HIDDEN*/ (__fini_array_end = .); + } + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } + .jcr : { KEEP (*(.jcr)) } + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } + .dynamic : { *(.dynamic) } :dynamic :data +/* . 
= DATA_SEGMENT_RELRO_END (0, .);*/ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } + .data1 : { *(.data1) } + .toc1 ALIGN(8) : { *(.toc1) } + .opd ALIGN(8) : { KEEP (*(.opd)) } + .got ALIGN(8) : { *(.got .toc) } + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + *(.sdata .sdata.* .gnu.linkonce.s.*) + } + _edata = .; PROVIDE (edata = .); + .plt : { *(.plt) } + . = ALIGN(64 / 8); + . = ALIGN(64 / 8); + . = DATA_SEGMENT_END (.); + /* Hugepage area: + * Saving hugepages is more important than saving executable size, so + * we don't attempt to maintain congruence here. + * In order to map hugepages into the address space, we must advance the + * location counter to a segment boundary. If the address is < 4G, the + * next segment will be on a 256M boundary. For higher areas, we have a + * 1TB granularity. */ + . = (. < 0x100000000) ? ALIGN(0x10000000) : ALIGN(0x10000000000); + /* HACK: workaround fact that kernel may not cope with segments with zero + * filesize */ + .hugetlb.data : { LONG(1) } :htlb + __bss_start = .; + .tocbss ALIGN(8) : { *(.tocbss)} :htlb + .sbss : + { + + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + } + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + } + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/ldscripts/elf64ppc.xBDT b/ldscripts/elf64ppc.xBDT new file mode 100644 index 0000000..5477294 --- /dev/null +++ b/ldscripts/elf64ppc.xBDT @@ -0,0 +1,241 @@ +/* Linker script for normal executables with text, data and BSS in hugepages */ +OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", + "elf64-powerpc") +OUTPUT_ARCH(powerpc:common64) +ENTRY(_start) +SEARCH_DIR("/usr/powerpc64-linux-gnu/lib64"); SEARCH_DIR("/usr/local/lib64"); SEARCH_DIR("/lib64"); SEARCH_DIR("/usr/lib64"); SEARCH_DIR("/usr/powerpc64-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT( -lhugetlbfs ); +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + htext PT_LOAD FILEHDR PHDRS FLAGS (0x00100005); + hdata PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. */ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x10000000; . 
= 0x10000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :interp :htext + .note.SuSE : { *(.note.SuSE) } :htext :note + .note.ABI-tag : { *(.note.ABI-tag) } :htext :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :htext :note + .hash : { *(.hash) } :htext + .dynsym : { *(.dynsym) } :htext + .dynstr : { *(.dynstr) } :htext + .gnu.version : { *(.gnu.version) } :htext + .gnu.version_d : { *(.gnu.version_d) } :htext + .gnu.version_r : { *(.gnu.version_r) } :htext + .rel.init : { *(.rel.init) } :htext + .rela.init : { *(.rela.init) } :htext + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :htext + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :htext + .rel.fini : { *(.rel.fini) } :htext + .rela.fini : { *(.rela.fini) } :htext + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :htext + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :htext + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :htext + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :htext + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :htext + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :htext + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :htext + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :htext + .rel.ctors : { *(.rel.ctors) } :htext + .rela.ctors : { *(.rela.ctors) } :htext + .rel.dtors : { *(.rel.dtors) } :htext + .rela.dtors : { *(.rela.dtors) } :htext + .rel.got : { *(.rel.got) } :htext + .rela.got : { *(.rela.got) } :htext + .rela.toc : { *(.rela.toc) } :htext + .rel.sdata : { *(.rel.sdata .rel.sdata.* .rel.gnu.linkonce.s.*) } :htext + .rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) } :htext + .rel.sbss : { *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*) } :htext 
+ .rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) } :htext + .rel.sdata2 : { *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*) } :htext + .rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) } :htext + .rel.sbss2 : { *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*) } :htext + .rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) } :htext + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :htext + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :htext + .rel.plt : { *(.rel.plt) } :htext + .rela.plt : { *(.rela.plt) } :htext + .rela.tocbss : { *(.rela.tocbss) } :htext + .init : + { + KEEP (*(.init)) + } :htext =0x60000000 + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.sfpr .glink) + } :htext =0x60000000 + .fini : + { + KEEP (*(.fini)) + } :htext =0x60000000 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :htext + .rodata1 : { *(.rodata1) } :htext + .sdata2 : + { + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } :htext + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } :htext + .eh_frame_hdr : { *(.eh_frame_hdr) } :htext :eh_frame_hdr +/* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :htext */ +/* .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } :htext */ + + /* We don't maintain address congruence here, because saving + * hugepages is more important than saving executable size. */ + . = ALIGN (. 
+ 0x1000000, 0x1000000); /* Align to next 16MB hugepage */ + /* Exception handling */ + .eh_frame : /*ONLY_IF_RW*/ { KEEP (*(.eh_frame)) } :hdata + .gcc_except_table : /*ONLY_IF_RW*/ { *(.gcc_except_table .gcc_except_table.*) } :hdata + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :hdata + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :hdata + .preinit_array : + { + PROVIDE /*_HIDDEN*/ (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE /*_HIDDEN*/ (__preinit_array_end = .); + } :hdata + .init_array : + { + PROVIDE /*_HIDDEN*/ (__init_array_start = .); + KEEP (*(SORT(.init_array.*))) + KEEP (*(.init_array)) + PROVIDE /*_HIDDEN*/ (__init_array_end = .); + } :hdata + .fini_array : + { + PROVIDE /*_HIDDEN*/ (__fini_array_start = .); + KEEP (*(.fini_array)) + KEEP (*(SORT(.fini_array.*))) + PROVIDE /*_HIDDEN*/ (__fini_array_end = .); + } :hdata + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :hdata + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :hdata + .jcr : { KEEP (*(.jcr)) } :hdata + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :hdata + .dynamic : { *(.dynamic) } :dynamic :hdata +/* . 
= DATA_SEGMENT_RELRO_END (0, .);*/ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :hdata + .data1 : { *(.data1)} :hdata + .toc1 ALIGN(8) : { *(.toc1) } :hdata + .opd ALIGN(8) : { KEEP (*(.opd)) } :hdata + .got ALIGN(8) : { *(.got .toc) } :hdata + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + *(.sdata .sdata.* .gnu.linkonce.s.*) + } :hdata + _edata = .; PROVIDE (edata = .); + __bss_start = .; + .tocbss ALIGN(8) : { *(.tocbss)} :hdata + .sbss : + { + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + } :hdata + .plt : { *(.plt) } :hdata + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* + * Align here to ensure that the .bss section occupies space up to + * _end. Additionally (for huge pages) align to a segment boundary. + * This ensures that no normal page mappings will be created in this + * segment (after the bss) which could interfere with remapping. + * + * XXX: This ALIGN will need to be extended to handle the case where + * ends above 1T -- in which case the alignment should be 1T. + */ + . = ALIGN(256*1024*1024); + } :hdata + _end = .; + PROVIDE (end = .); +/*. = DATA_SEGMENT_END (.);*/ + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/ldscripts/elf_i386.xB b/ldscripts/elf_i386.xB new file mode 100644 index 0000000..43fe51c --- /dev/null +++ b/ldscripts/elf_i386.xB @@ -0,0 +1,200 @@ +/* Linker script for normal executables with BSS in hugepages */ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", + "elf32-i386") +OUTPUT_ARCH(i386) +ENTRY(_start) +SEARCH_DIR("/usr/i486-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +/* Do we need any of these for elf? + __DYNAMIC = 0; */ +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + text PT_LOAD FILEHDR PHDRS ; + data PT_LOAD ; + htlb PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. */ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x08048000; . 
= 0x08048000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :text :interp + .note.SuSE : { *(.note.SuSE) } :text :note + .note.ABI-tag : { *(.note.ABI-tag) } :text :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :text :note + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + .rel.init : { *(.rel.init) } + .rela.init : { *(.rela.init) } + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } + .rel.fini : { *(.rel.fini) } + .rela.fini : { *(.rela.fini) } + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } + .rel.ctors : { *(.rel.ctors) } + .rela.ctors : { *(.rela.ctors) } + .rel.dtors : { *(.rel.dtors) } + .rela.dtors : { *(.rela.dtors) } + .rel.got : { *(.rel.got) } + .rela.got : { *(.rela.got) } + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } + .rel.plt : { *(.rel.plt) } + .rela.plt : { *(.rela.plt) } + .init : + { + KEEP (*(.init)) + } =0x90909090 + .plt : { *(.plt) } + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. 
*/ + *(.gnu.warning) + } =0x90909090 + .fini : + { + KEEP (*(.fini)) + } =0x90909090 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + .eh_frame_hdr : { *(.eh_frame_hdr) } :data :eh_frame_hdr + /* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } + .gcc_except_table : ONLY_IF_RO { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } */ + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN (0x1000) - ((0x1000 - .) & (0x1000 - 1)); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000); + /* Exception handling */ + .eh_frame : /* ONLY_IF_RW */ { KEEP (*(.eh_frame)) } :data + .gcc_except_table : /* ONLY_IF_RW */ { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . = ALIGN(32 / 8); + PROVIDE (__preinit_array_start = .); + .preinit_array : { KEEP (*(.preinit_array)) } + PROVIDE (__preinit_array_end = .); + PROVIDE (__init_array_start = .); + .init_array : { KEEP (*(.init_array)) } + PROVIDE (__init_array_end = .); + PROVIDE (__fini_array_start = .); + .fini_array : { KEEP (*(.fini_array)) } + PROVIDE (__fini_array_end = .); + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. 
*/ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } + .jcr : { KEEP (*(.jcr)) } + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } + .dynamic : { *(.dynamic) } :dynamic :data + .got : { *(.got.plt) *(.got) } + /*. = DATA_SEGMENT_RELRO_END (12, .);*/ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } + .data1 : { *(.data1) } + _edata = .; + PROVIDE (edata = .); + . = ALIGN(32 / 8); + . = DATA_SEGMENT_END (.); + /* Hugepage area */ + . = ALIGN(0x1000000); /* Align to 16MB (4MB hugepage size, plus some slack in case of larger hugepages in future */ + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + } :htlb + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/ldscripts/elf_i386.xBDT b/ldscripts/elf_i386.xBDT new file mode 100644 index 0000000..d72aebe --- /dev/null +++ b/ldscripts/elf_i386.xBDT @@ -0,0 +1,198 @@ +/* Linker script for normal executables with text, data and BSS in hugepages */ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", + "elf32-i386") +OUTPUT_ARCH(i386) +ENTRY(_start) +SEARCH_DIR("/usr/i486-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +/* Do we need any of these for elf? + __DYNAMIC = 0; */ +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + htext PT_LOAD FILEHDR PHDRS FLAGS (0x00100005); + hdata PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. 
*/ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + /* Different from the normal origin addres, because we need to make + * it hugepage aligned */ + __executable_start = 0x08000000; . = 0x08000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :htext :interp + .note.SuSE : { *(.note.SuSE) } :htext :note + .note.ABI-tag : { *(.note.ABI-tag) } :htext :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :htext :note + .hash : { *(.hash) } :htext + .dynsym : { *(.dynsym) } :htext + .dynstr : { *(.dynstr) } :htext + .gnu.version : { *(.gnu.version) } :htext + .gnu.version_d : { *(.gnu.version_d) } :htext + .gnu.version_r : { *(.gnu.version_r) } :htext + .rel.init : { *(.rel.init) } :htext + .rela.init : { *(.rela.init) } :htext + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :htext + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :htext + .rel.fini : { *(.rel.fini) } :htext + .rela.fini : { *(.rela.fini) } :htext + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :htext + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :htext + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :htext + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :htext + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :htext + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :htext + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :htext + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :htext + .rel.ctors : { *(.rel.ctors) } :htext + .rela.ctors : { *(.rela.ctors) } :htext + .rel.dtors : { *(.rel.dtors) } :htext + .rela.dtors : { *(.rela.dtors) } :htext + .rel.got : { *(.rel.got) } :htext + .rela.got : { *(.rela.got) } :htext + .rel.bss : 
{ *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :htext + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :htext + .rel.plt : { *(.rel.plt) } :htext + .rela.plt : { *(.rela.plt) } :htext + .init : + { + KEEP (*(.init)) + } :htext =0x90909090 + .plt : { *(.plt) } + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + } :htext =0x90909090 + .fini : + { + KEEP (*(.fini)) + } :htext =0x90909090 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :htext + .rodata1 : { *(.rodata1) } :htext + .eh_frame_hdr : { *(.eh_frame_hdr) } :htext :eh_frame_hdr + /* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :htext + .gcc_except_table : ONLY_IF_RO { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :htext */ + + /* We don't maintain address congruence here, because saving + * hugepages is more important than saving executable size. */ + . = ALIGN (0x1000000); /* Align to 16MB (4MB hugepage size, plus some slack in case of larger hugepages in future */ + /* Exception handling */ + .eh_frame : /* ONLY_IF_RW */ { KEEP (*(.eh_frame)) } :hdata + .gcc_except_table : /* ONLY_IF_RW */ { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :hdata + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :hdata + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :hdata + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . 
= ALIGN(32 / 8); + PROVIDE (__preinit_array_start = .); + .preinit_array : { KEEP (*(.preinit_array)) } :hdata + PROVIDE (__preinit_array_end = .); + PROVIDE (__init_array_start = .); + .init_array : { KEEP (*(.init_array)) } :hdata + PROVIDE (__init_array_end = .); + PROVIDE (__fini_array_start = .); + .fini_array : { KEEP (*(.fini_array)) } :hdata + PROVIDE (__fini_array_end = .); + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :hdata + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :hdata + .jcr : { KEEP (*(.jcr)) } :hdata + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :hdata + .dynamic : { *(.dynamic) } :dynamic :hdata + .got : { *(.got.plt) *(.got) } :hdata + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :hdata + .data1 : { *(.data1) } :hdata + _edata = .; + PROVIDE (edata = .); + . = ALIGN(32 / 8); + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + } :hdata + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. 
*/ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/ldscripts/elf_x86_64.xB b/ldscripts/elf_x86_64.xB new file mode 100644 index 0000000..ed21a2c --- /dev/null +++ b/ldscripts/elf_x86_64.xB @@ -0,0 +1,202 @@ +/* Linker script for normal executables with BSS in hugepages */ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", + "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(_start) +SEARCH_DIR("/usr/x86_64-linux-gnu/lib64"); SEARCH_DIR("/usr/local/lib64"); SEARCH_DIR("/lib64"); SEARCH_DIR("/usr/lib64"); SEARCH_DIR("/usr/x86_64-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +/* Do we need any of these for elf? 
+ __DYNAMIC = 0; */ +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + text PT_LOAD FILEHDR PHDRS ; + data PT_LOAD ; + htlb PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. */ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x400000; . = 0x400000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :text :interp + .note.SuSE : { *(.note.SuSE) } :text :note + .note.ABI-tag : { *(.note.ABI-tag) } :text :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :text :note + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } :text + .dynstr : { *(.dynstr) } :text + .gnu.version : { *(.gnu.version) } :text + .gnu.version_d : { *(.gnu.version_d) } :text + .gnu.version_r : { *(.gnu.version_r) } :text + .rel.init : { *(.rel.init) } :text + .rela.init : { *(.rela.init) } :text + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :text + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :text + .rel.fini : { *(.rel.fini) } :text + .rela.fini : { *(.rela.fini) } :text + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :text + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :text + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :text + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :text + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :text + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :text + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :text + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :text + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :text + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) 
} :text + .rel.ctors : { *(.rel.ctors) } :text + .rela.ctors : { *(.rela.ctors) } :text + .rel.dtors : { *(.rel.dtors) } :text + .rela.dtors : { *(.rela.dtors) } :text + .rel.got : { *(.rel.got) } :text + .rela.got : { *(.rela.got) } :text + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :text + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :text + .rel.plt : { *(.rel.plt) } :text + .rela.plt : { *(.rela.plt) } :text + .init : + { + KEEP (*(.init)) + } :text =0x90909090 + .plt : { *(.plt)} :text + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + } :text =0x90909090 + .fini : + { + KEEP (*(.fini)) + } :text =0x90909090 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :text + .rodata1 : { *(.rodata1) } :text + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :text + .gcc_except_table : ONLY_IF_RO { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :text + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN (0x100000) - ((0x100000 - .) & (0x100000 - 1)); . = DATA_SEGMENT_ALIGN (0x100000, 0x1000); + /* Exception handling */ + .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) } :data + .gcc_except_table : ONLY_IF_RW { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :data + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :data + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :data + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . 
= ALIGN(64 / 8); + PROVIDE (__preinit_array_start = .); + .preinit_array : { KEEP (*(.preinit_array)) } :data + PROVIDE (__preinit_array_end = .); + PROVIDE (__init_array_start = .); + .init_array : { KEEP (*(.init_array)) } :data + PROVIDE (__init_array_end = .); + PROVIDE (__fini_array_start = .); + .fini_array : { KEEP (*(.fini_array)) } :data + PROVIDE (__fini_array_end = .); + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :data + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :data + .jcr : { KEEP (*(.jcr)) } :data + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :data + .dynamic : { *(.dynamic) } :dynamic :data + .got : { *(.got) } :data + . = DATA_SEGMENT_RELRO_END (24, .); + .got.plt : { *(.got.plt) } :data + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :data + .data1 : { *(.data1) } :data + _edata = .; + PROVIDE (edata = .); + __bss_start = .; + . = ALIGN(64 / 8); + . = DATA_SEGMENT_END (.); + /* Hugepage area */ + . 
= ALIGN(0x1000000); /* Align to 16MB (2MB hugepage size, plus some slack in case of larger hugepages in future */ + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + } :htlb + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/ldscripts/elf_x86_64.xBDT b/ldscripts/elf_x86_64.xBDT new file mode 100644 index 0000000..1855202 --- /dev/null +++ b/ldscripts/elf_x86_64.xBDT @@ -0,0 +1,202 @@ +/* Linker script for normal executables with text data and BSS in hugepages */ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", + "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(_start) +SEARCH_DIR("/usr/x86_64-linux-gnu/lib64"); SEARCH_DIR("/usr/local/lib64"); SEARCH_DIR("/lib64"); 
SEARCH_DIR("/usr/lib64"); SEARCH_DIR("/usr/x86_64-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +/* Do we need any of these for elf? + __DYNAMIC = 0; */ +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + htext PT_LOAD FILEHDR PHDRS FLAGS (0x00100005); + hdata PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. */ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + /* Different from the normal origin address, because we make it 16MB + * aligned, in case of future larger hugepages */ + __executable_start = 0x1000000; . = 0x1000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :interp :htext + .hash : { *(.hash) } :htext + .note.SuSE : { *(.note.SuSE) } :htext :note + .note.ABI-tag : { *(.note.ABI-tag) } :htext :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :htext :note + .dynsym : { *(.dynsym) } :htext + .dynstr : { *(.dynstr) } :htext + .gnu.version : { *(.gnu.version) } :htext + .gnu.version_d : { *(.gnu.version_d) } :htext + .gnu.version_r : { *(.gnu.version_r) } :htext + .rel.init : { *(.rel.init) } :htext + .rela.init : { *(.rela.init) } :htext + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :htext + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :htext + .rel.fini : { *(.rel.fini) } :htext + .rela.fini : { *(.rela.fini) } :htext + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :htext + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :htext + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :htext + .rela.data : { 
*(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :htext + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :htext + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :htext + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :htext + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :htext + .rel.ctors : { *(.rel.ctors) } :htext + .rela.ctors : { *(.rela.ctors) } :htext + .rel.dtors : { *(.rel.dtors) } :htext + .rela.dtors : { *(.rela.dtors) } :htext + .rel.got : { *(.rel.got) } :htext + .rela.got : { *(.rela.got) } :htext + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :htext + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :htext + .rel.plt : { *(.rel.plt) } :htext + .rela.plt : { *(.rela.plt) } :htext + .init : + { + KEEP (*(.init)) + } :htext =0x90909090 + .plt : { *(.plt) } :htext + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + } :htext =0x90909090 + .fini : + { + KEEP (*(.fini)) + } :htext =0x90909090 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :htext + .rodata1 : { *(.rodata1) } :htext + .eh_frame_hdr : { *(.eh_frame_hdr) } :htext :eh_frame_hdr + .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :htext + .gcc_except_table : ONLY_IF_RO { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :htext + /* We don't maintain address congruence here, because saving + * hugepages is more important than saving executable size. */ + . 
= ALIGN (0x1000000); /* Align to 16MB (4MB hugepage size, plus some slack in case of larger hugepages in future */ + /* Exception handling */ + .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) } :hdata + .gcc_except_table : ONLY_IF_RW { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :hdata + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :hdata + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :hdata + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . = ALIGN(64 / 8); + PROVIDE (__preinit_array_start = .); + .preinit_array : { KEEP (*(.preinit_array)) } :hdata + PROVIDE (__preinit_array_end = .); + PROVIDE (__init_array_start = .); + .init_array : { KEEP (*(.init_array)) } :hdata + PROVIDE (__init_array_end = .); + PROVIDE (__fini_array_start = .); + .fini_array : { KEEP (*(.fini_array)) } :hdata + PROVIDE (__fini_array_end = .); + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. 
+ The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :hdata + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :hdata + .jcr : { KEEP (*(.jcr)) } :hdata + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :hdata + .dynamic : { *(.dynamic) } :dynamic :hdata + .got : { *(.got) } :hdata + .got.plt : { *(.got.plt) } :hdata + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :hdata + .data1 : { *(.data1) } :hdata + _edata = .; + PROVIDE (edata = .); + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* Align here to ensure that the .bss section occupies space up to + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. */ + . = ALIGN(64 / 8); + } :hdata + . = ALIGN(64 / 8); + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/libhugetlbfs_debug.h b/libhugetlbfs_debug.h new file mode 100644 index 0000000..cd490ad --- /dev/null +++ b/libhugetlbfs_debug.h @@ -0,0 +1,42 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 IBM Corporation. + * Author: Andy Whitcroft + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef _LIBHUGETLBFS_DEBUG_H +#define _LIBHUGETLBFS_DEBUG_H + +/* Severe, unrecoverable errors */ +#define ERROR(...) REPORT(1, "ERROR", ##__VA_ARGS__) +#define ERROR_CONT(...) REPORT_CONT(1, "ERROR", ##__VA_ARGS__) + +/* A condition that is recoverable, but may result in altered semantics */ +#define WARNING(...) REPORT(2, "WARNING", ##__VA_ARGS__) +#define WARNING_CONT(...) REPORT_CONT(2, "WARNING", ##__VA_ARGS__) + +/* Detailed information about normal library operations */ +#define INFO(...) REPORT(3, "INFO", ##__VA_ARGS__) +#define INFO_CONT(...) REPORT_CONT(3, "INFO", ##__VA_ARGS__) + +/* Diagnostic information used for debugging problems */ +#define DEBUG(...) REPORT(4, "DEBUG", ##__VA_ARGS__) +#define DEBUG_CONT(...) REPORT_CONT(4, "DEBUG", ##__VA_ARGS__) + +#define VERBOSITY_MAX 4 +#define VERBOSITY_DEFAULT 2 + +#endif diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h new file mode 100644 index 0000000..2df2ffc --- /dev/null +++ b/libhugetlbfs_internal.h @@ -0,0 +1,211 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This file should only contain definitions of functions, data types, and + * constants which are used internally within the libhugetlbfs library. + * + * All external functions declared here are library static and must be + * internalised using a define of the following form: + * + * #define foo __lh_foo + */ + +#ifndef _LIBHUGETLBFS_INTERNAL_H +#define _LIBHUGETLBFS_INTERNAL_H + +#include +#include +#include +#include + +#ifndef __LIBHUGETLBFS__ +#error This header should not be included by library users. +#endif /* __LIBHUGETLBFS__ */ + +#include "libhugetlbfs_privutils.h" +#include "libhugetlbfs_testprobes.h" + +#define stringify_1(x) #x +#define stringify(x) stringify_1(x) + +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) +#define ALIGN_UP(x,a) ALIGN(x,a) +#define ALIGN_DOWN(x,a) ((x) & ~((a) - 1)) + +#if defined(__powerpc64__) || \ + (defined(__powerpc__) && !defined(PPC_NO_SEGMENTS)) +#define SLICE_LOW_SHIFT 28 +#define SLICE_HIGH_SHIFT 40 +#elif defined(__ia64__) +#define SLICE_HIGH_SHIFT 63 +#endif + +struct libhugeopts_t { + int sharing; + bool min_copy; + bool shrink_ok; + bool shm_enabled; + bool no_reserve; + bool map_hugetlb; + bool thp_morecore; + unsigned long force_elfmap; + char *ld_preload; + char *elfmap; + char *share_path; + char *features; + char *path; + char *def_page_size; + char *morecore; + char *heapbase; +}; + +/* + * When adding a library local variable externalise the symbol as + * normal, plus add a #define of the form below. This define effectively + * renames the routine into the local namespace __lh_* which is forced + * local in the linker script version.lds. 
Some routines may need to be + * exported in the utilities library these are marked __pu_* which marks + * them for export in libhugetlbfs_privutils; their definitions should + * appear in libhugetlbfs_privutils.h rather than here. + */ +#define __hugetlbfs_verbose __lh___hugetlbfs_verbose +extern int __hugetlbfs_verbose; +#define __hugetlbfs_debug __lh___hugetlbfs_debug +extern bool __hugetlbfs_debug; +#define __hugetlbfs_prefault __lh___hugetlbfs_prefault +extern bool __hugetlbfs_prefault; +#define hugetlbfs_setup_env __lh_hugetlbfs_setup_env +extern void hugetlbfs_setup_env(); +#define hugetlbfs_setup_elflink __lh_hugetlbfs_setup_elflink +extern void hugetlbfs_setup_elflink(); +#define hugetlbfs_setup_morecore __lh_hugetlbfs_setup_morecore +extern void hugetlbfs_setup_morecore(); +#define hugetlbfs_setup_debug __lh_hugetlbfs_setup_debug +extern void hugetlbfs_setup_debug(); +#define setup_mounts __lh_setup_mounts +extern void setup_mounts(); +#define setup_features __lh_setup_features +extern void setup_features(); +#define hugetlbfs_check_priv_resv __lh_hugetlbfs_check_priv_resv +extern void hugetlbfs_check_priv_resv(); +#define hugetlbfs_check_safe_noreserve __lh_hugetlbfs_check_safe_noreserve +extern void hugetlbfs_check_safe_noreserve(); +#define hugetlbfs_check_map_hugetlb __lh_hugetblfs_check_map_hugetlb +extern void hugetlbfs_check_map_hugetlb(); +#define __hugetlbfs_hostname __lh___hugetlbfs_hostname +extern char __hugetlbfs_hostname[]; +#define hugetlbfs_prefault __lh_hugetlbfs_prefault +extern int hugetlbfs_prefault(void *addr, size_t length); +#define parse_page_size __lh_parse_page_size +extern long parse_page_size(const char *str); +#define probe_default_hpage_size __lh__probe_default_hpage_size +extern void probe_default_hpage_size(void); +#define debug_show_page_sizes __lh__debug_show_page_sizes +extern void debug_show_page_sizes(void); +#define hugetlbfs_setup_kernel_page_size __lh__hugetlbfs_setup_kernel_page_size +extern void 
hugetlbfs_setup_kernel_page_size(void); +#define __hugetlb_opts __lh__hugetlb_opts +extern struct libhugeopts_t __hugetlb_opts; + +#ifndef REPORT_UTIL +#define REPORT_UTIL "libhugetlbfs" +#endif + +#define VERBOSE_ERROR 1 +#define VERBOSE_WARNING 2 +#define VERBOSE_INFO 3 +#define VERBOSE_DEBUG 4 + +#ifndef REPORT +#define REPORT(level, prefix, format, ...) \ + do { \ + if (__hugetlbfs_verbose >= level) { \ + fprintf(stderr, REPORT_UTIL); \ + if (__hugetlbfs_verbose >= VERBOSE_DEBUG) \ + fprintf(stderr, " [%s:%d]", \ + __hugetlbfs_hostname, getpid()); \ + fprintf(stderr, ": " prefix ": " format, \ + ##__VA_ARGS__); \ + fflush(stderr); \ + } \ + } while (0) + +#define REPORT_CONT(level, prefix, ...) \ + do { \ + if (__hugetlbfs_verbose >= level) { \ + fprintf(stderr, ##__VA_ARGS__); \ + fflush(stderr); \ + } \ + } while (0) +#endif + +#include "libhugetlbfs_debug.h" + +#if defined(__powerpc64__) && !defined(__LP64__) +/* Older binutils fail to provide this symbol */ +#define __LP64__ +#endif + +/* Multiple huge page size support */ +struct hpage_size { + unsigned long pagesize; + char mount[PATH_MAX+1]; +}; + +struct hpage_pool { + unsigned long pagesize; + unsigned long minimum; + unsigned long maximum; + unsigned long size; + int is_default; +}; + +#define size_to_smaller_unit __lh_size_to_smaller_unit +extern unsigned long long size_to_smaller_unit(unsigned long long size); + +#define file_read_ulong __lh_file_read_ulong +extern long file_read_ulong(char *file, const char *tag); +#define file_write_ulong __lh_file_write_ulong +extern int file_write_ulong(char *file, unsigned long val); + +#define hpool_sizes __lh_hpool_sizes +extern int hpool_sizes(struct hpage_pool *, int); +#define get_pool_size __lh_get_pool_size +extern int get_pool_size(long, struct hpage_pool *); + +/* Arch-specific callbacks */ +extern int direct_syscall(int sysnum, ...); +extern ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab); + +#define MEMINFO "/proc/meminfo" +#define PROC_HUGEPAGES_DIR 
"/proc/sys/vm/" +#define SYSFS_HUGEPAGES_DIR "/sys/kernel/mm/hugepages/" + +#define hugetlbfs_test_pagesize __lh_hugetlbfs_test_pagesize +long hugetlbfs_test_pagesize(const char *mount); + +/* Diagnoses/debugging only functions */ +#define dump_proc_pid_maps __lh_dump_proc_pid_maps +long dump_proc_pid_maps(void); + +#define plt_extrasz __lh_plt_extrasz +ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab); + +#endif /* _LIBHUGETLBFS_INTERNAL_H */ diff --git a/libhugetlbfs_privutils.h b/libhugetlbfs_privutils.h new file mode 100644 index 0000000..149e42f --- /dev/null +++ b/libhugetlbfs_privutils.h @@ -0,0 +1,94 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This file should only contain definitions of functions, data types, and + * constants which are part of the internal private utilities interfaces. + * These are exposed only to utilities and tests within the source, this is + * not a public interface nor part of the libhugetlfs API. 
+ * + * All functions declared external here must be externalised using a define + * of the following form: + * + * #define foo __pu_foo + */ + +#ifndef _LIBHUGETLBFS_PRIVUTILS_H +#define _LIBHUGETLBFS_PRIVUTILS_H + +/* Hugetlb pool counter operations */ +/* Keys for reading hugetlb pool counters */ +enum { /* The number of pages of a given size that ... */ + HUGEPAGES_TOTAL, /* are allocated to the pool */ + HUGEPAGES_TOTAL_MEMPOL, /* are allocated following the NUMA mempolicy */ + HUGEPAGES_FREE, /* are not in use */ + HUGEPAGES_RSVD, /* are reserved for possible future use */ + HUGEPAGES_SURP, /* are allocated to the pool on demand */ + HUGEPAGES_OC, /* can be allocated on demand - maximum */ + HUGEPAGES_MAX_COUNTERS, +}; +#define get_huge_page_counter __pu_get_huge_page_counter +long get_huge_page_counter(long pagesize, unsigned int counter); +#define set_huge_page_counter __pu_set_huge_page_counter +int set_huge_page_counter(long pagesize, unsigned int counter, + unsigned long val); +#define set_nr_hugepages __pu_set_nr_hugepages +int set_nr_hugepages(long pagesize, unsigned long val); +#define set_nr_overcommit_hugepages __pu_set_nr_overcommit_hugepages +int set_nr_overcommit_hugepages(long pagesize, unsigned long val); + +#define kernel_has_hugepages __pu_kernel_has_hugepages +int kernel_has_hugepages(void); + +#define kernel_has_overcommit __pu_kernel_has_overcommit +int kernel_has_overcommit(void); + +#define read_meminfo __pu_read_meminfo +long read_meminfo(const char *tag); + +#define kernel_default_hugepage_size __pu_kernel_default_hugepage_size +long kernel_default_hugepage_size(void); + +#define read_nr_overcommit __pu_read_nr_overcommit +long read_nr_overcommit(long page_size); + +#define restore_overcommit_pages __pu_restore_overcommit_pages +void restore_overcommit_pages(long page_size, long oc_pool); + +/* Kernel feature testing */ +/* This enum defines the bits in a feature bitmask */ +enum { + /* Reservations are created for private mappings */ 
+ HUGETLB_FEATURE_PRIVATE_RESV, + + /* Whether use of MAP_NORESERVE is safe or can result in OOM */ + HUGETLB_FEATURE_SAFE_NORESERVE, + + /* If the kernel has the ability to mmap(MAP_HUGETLB)*/ + HUGETLB_FEATURE_MAP_HUGETLB, + + HUGETLB_FEATURE_NR, +}; +#define hugetlbfs_test_feature __pu_hugetlbfs_test_feature +int hugetlbfs_test_feature(int feature_code); + +#define test_compare_kver __pu_test_compare_kver +int test_compare_kver(const char *a, const char *b); + +#endif /* _LIBHUGETLBFS_PRIVUTILS_H */ diff --git a/libhugetlbfs_testprobes.h b/libhugetlbfs_testprobes.h new file mode 100644 index 0000000..6e01da4 --- /dev/null +++ b/libhugetlbfs_testprobes.h @@ -0,0 +1,39 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 IBM Corporation, author: Andy Whitcroft + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This file should only contain definitions of functions, data types, and + * constants which are part of the internal library test probe interfaces. + * These are exposed only to utilities and tests within the source, this is + * not a public interface nor part of the libhugetlfs API. 
+ * + * All functions declared external here must be externalised using a define + * of the following form: + * + * #define foo __tp_foo + */ + +#ifndef _LIBHUGETLBFS_TESTPROBES_H +#define _LIBHUGETLBFS_TESTPROBES_H + +#define kernel_default_hugepage_size_reset \ + __tp_kernel_default_hugepage_size_reset +void kernel_default_hugepage_size_reset(void); + +#endif /* _LIBHUGETLBFS_TESTPROBES_H */ diff --git a/localversion b/localversion new file mode 100755 index 0000000..5d50aca --- /dev/null +++ b/localversion @@ -0,0 +1,90 @@ +#!/bin/sh +# +# libhugetlbfs - Easy use of Linux hugepages +# Copyright (C) 2006 Andy Whitcroft, IBM Corporation +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 of +# the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +if [ "$#" -lt 1 ]; then + echo "Usage: localversion ..." 1>&2 + exit 1 +fi +file="$1" + +if [ -f "$file" ]; then + read current_version <"$file" +fi +version="$current_version" + +modified=0 + +# GIT: check for a git tree. +mod=`git diff-index HEAD 2>/dev/null` +if [ "$?" -eq 0 ]; then + # This is a GIT repo, see if it was modified. + if [ "$mod" != "" ]; then + modified=1 + else + # Subtle, if we are in a git archive and the repository + # is clean then update the time on the version file + # thus ensuring it will be correct in any tarball. 
+ touch "$file" + fi + + # Try and get a real "tag relative" version name for it. + version=`git describe --tags HEAD 2>&1` + if [ "$?" -ne 0 ]; then + # ok, desperation just take the commit id. + version=`git log | awk '{ print $2; exit }'` + version="commit<$version>" + fi + +else + if [ ! -f "$file" ]; then + echo 1>&2 "$0: ERROR: unversioned tarball" + echo "#error UNVERSIONED tarball" >"$file.h" + exit 1 + fi + + # No version control, use the modification times + # of the source. + for s in "$@" + do + if [ "$s" -nt "$file" ]; then + modified=1 + fi + done +fi + +if [ "$current_version" != "$version" ]; then + echo "version update: $version" + echo "$version" >"$file" +fi + +# Update the c-define for this version, take the modification +# flags into account. +version_modified="$version" +[ "$modified" -eq 1 ] && version_modified="$version_modified (modified)" + +if [ -f "$file.h" ]; then + read d1 current_version_modified <"$file.h" +fi +if [ "$current_version_modified" != "$version_modified" ]; then + echo "version string: $version_modified" + echo "// $version_modified" >"$file.h" + echo "#define VERSION \"$version_modified\"" >>"$file.h" +fi + +exit 0 diff --git a/man/cpupcstat.8 b/man/cpupcstat.8 new file mode 100644 index 0000000..d84a726 --- /dev/null +++ b/man/cpupcstat.8 @@ -0,0 +1,117 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH CPUCPSTAT 8 "9 June, 2009" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +cpupcstat \- Measure the DTLB miss rate +.SH SYNOPSIS +.B cpupcstat [options] [target] +.SH DESCRIPTION +\fBcpupcstat\fP uses oprofile to measure the DTLB miss rate of a +specified application or the kernel. It configures oprofile to count the +number of DTLB misses, optionally starts the \fBtarget\fP, and reports on the +miss rate over a specified interval as \fBtarget\fP executes. + +The following options can be used to configure how \fBcpupcstat\fP works: + +.TP +.B --vmlinux + +This allows the user to specify where the appropriate vmlinux file is for their +kernel. If this is not specified, /boot/vmlinux\-\`uname \-r\` will be used. + +.TP +.B --delay + +This allows the user to specify the reporting interval. The default is 10 +seconds. + +.TP +.B --target-global + +Gather statistics for all processes and the kernel running in the system. + +.TP +.B --target-pid + +This allows the user to specify the pid of a process that is already +running. If this option is specified, \fBtarget\fP will be ignored. + +.TP +.B --real-target + +Use this to specify the real name of the program to monitor if the \fBtarget\fP +is a launcher script. When this is specified, \fBtarget\fP is executed but the +report will be for \fBreal-target\fP. + +.TP +.B --time-limit + +This option sets the time limit for monitoring. If this is specified the +\fBtarget\fP or \fBpid\fP will only be monitored for \fBsec\fP seconds. The +default continues monitoring while \fBtarget\fP or \fBpid\fP are still alive. + +.TP +.B --kernel + +This allows the user to request DTLB miss rate data be collected for the kernel +as well as the \fBtarget\fP. 
+ +.TP +.B --misses-per-instruction + +This option requests that the ratio of instructions retired per TLB miss +be reported. + +.TP +.B --misses-per-cycle + +This option requests that the ratio of CPU cycles per TLB miss +be reported. + +.TP +.B --time-servicing + +This option requests that the percentage of CPU cycles spent servicing TLB +misses is displayed when \fBcpupcstat\fP exits. To use this option the cost +in CPU cycles for a single TLB miss must be specified using either the +\fB--cost-config\fP option or the \fBtlbmiss_cost.sh\fP script. + +.TP +.B --cost-config + +This option tells \fBcpupcstat\fP that the cost in CPU cycles of a TLB miss +can be found in the specified file; it should be specified as: + +TLB_MISS_COST=XX + +Where XX is the cost in cycles. This option is only used with the +\fB--time-servicing\fP option. + +.TP +.B --force-oprofile + +\fBcpupcstat\fP prefers the perf tool for data collection, only using oprofile +if perf is not present or supported. This option will force \fBcpupcstat\fP to +use oprofile for data collection. + +.SH SEE ALSO +.I oprofile(1) +.I perf(1) +.I tlbmiss_cost.sh(8) +.br +.SH AUTHORS +Eric B Munson is the primary author. See the documentation +for other contributors. + diff --git a/man/get_huge_pages.3 b/man/get_huge_pages.3 new file mode 100644 index 0000000..86d03c9 --- /dev/null +++ b/man/get_huge_pages.3 @@ -0,0 +1,73 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GET_HUGE_PAGES 3 "October 8, 2008" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +get_huge_pages, free_huge_pages \- Allocate and free hugepages +.SH SYNOPSIS +.B #include +.br + +.br +.B void *get_huge_pages(size_t len, ghp_t flags); +.br +.B void free_huge_pages(void *ptr); +.SH DESCRIPTION + +\fBget_huge_pages()\fP allocates a memory region \fBlen\fP bytes in size +backed by hugepages. Hugepages may be of benefit to applications that use +large amounts of address space and suffer a performance hit due to TLB +misses. Wall-clock time or oprofile can be used to determine if there is +a performance benefit from using hugepages or not. + +The \fBlen\fP parameter must be hugepage-aligned. In the current +implementation, only the default hugepage size may be allocated via this +function. Use \fBgethugepagesize\fP to discover what the alignment should +be. + +The \fBflags\fP argument changes the behaviour +of the function. Flags may be or'd together. + +.TP +.B GHP_DEFAULT + +Allocate a region of memory of the requested length backed by hugepages of +the default hugepage size. Return NULL if sufficient pages are not available + +.PP + +\fBfree_huge_pages()\fP frees a region of memory allocated by +\fBget_huge_pages()\fP. The behaviour of the function if another pointer +is used, valid or otherwise, is undefined. + +.SH RETURN VALUE + +On success, a pointer is returned to the allocated memory. On +error, NULL is returned. errno will be set based on what the failure of +mmap() was due to. + +.SH SEE ALSO +.I oprofile(1) +, +.I gethugepagesize(3) +, +.I get_hugepage_region(3) +, +.I libhugetlbfs(7) +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. 
+ diff --git a/man/get_hugepage_region.3 b/man/get_hugepage_region.3 new file mode 100644 index 0000000..63fd40c --- /dev/null +++ b/man/get_hugepage_region.3 @@ -0,0 +1,88 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GET_HUGEPAGE_REGION 3 "November 7, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +get_hugepage_region, free_hugepage_region \- Allocate and free regions of memory that use hugepages where possible +.SH SYNOPSIS +.B #include +.br + +.br +.B void *get_hugepage_region(size_t len, ghr_t flags); +.br +.B void free_hugepage_region(void *ptr); +.SH DESCRIPTION + +\fBget_hugepage_region()\fP allocates a memory region \fBlen\fP bytes in size +backed by hugepages. Hugepages may be of benefit to applications that use +large amounts of address space and suffer a performance hit due to TLB +misses. Wall-clock time or oprofile can be used to determine if there is +a performance benefit from using hugepages or not. + +Unlike \fBget_huge_pages()\fP, \fBlen\fP does not have to be hugepage-aligned +although memory may be wasted due to alignment. The caller may also specify +that base pages be used in the event there are no hugepages available. + +The \fBflags\fP argument changes the behaviour of the function. Flags may +be or'd together. + +.TP +.B GHR_FALLBACK +Use base pages if there are an insufficient number of huge pages. + +.TP +.B GHR_STRICT +Use hugepages or return NULL. 
+ +.TP +.B GHR_COLOR +When specified, bytes that would be wasted due to alignment are used to +color the buffer by offsetting it by a random cacheline within the hugepage. +This avoids a performance problem whereby multiple buffers use the same +cache lines at the same offsets. If it is not important that the start of the +buffer be page-aligned, specify this flag. + +.TP +.B GHR_DEFAULT +The library chooses a sensible combination of flags for allocating a region of +memory. The current default is: + GHR_FALLBACK | GHR_COLOR + +.PP + +\fBfree_hugepage_region()\fP frees a region of memory allocated by +\fBget_hugepage_region()\fP. The behaviour of the function if another +pointer is used, valid or otherwise, is undefined. + +.SH RETURN VALUE + +On success, a pointer is returned to the allocated memory. On +error, NULL is returned. errno will be set based on what the failure of +mmap() was due to. + +.SH SEE ALSO +.I oprofile(1) +, +.I gethugepagesize(3) +, +.I get_huge_pages(3) +, +.I libhugetlbfs(7) +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/man/gethugepagesize.3 b/man/gethugepagesize.3 new file mode 100644 index 0000000..9c46ccf --- /dev/null +++ b/man/gethugepagesize.3 @@ -0,0 +1,54 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" Copyright 2012 Cray Inc. +.\" All rights reserved. +.\" Licensed under LGPL 2.1 by Cray Inc. +.\" +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GETHUGEPAGESIZE 3 "March 7, 2012" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +gethugepagesize - Get the default huge page size +.SH SYNOPSIS +.B #include +.br + +.B long gethugepagesize(void) + +.SH DESCRIPTION + +The gethugepagesize() function returns the default huge page size used by +libhugetlbfs. This will be either the system default, or a valid value set +by the environment variable \fBHUGETLB_DEFAULT_PAGE_SIZE\fP. + +If the system does not support any huge page sizes an error is returned. + +.SH RETURN VALUE + +On success, the default huge page size is returned. On failure, +-1 is returned and errno is set appropriately. + +.SH ERRORS + +.TP +.B ENOSYS +The system does not support huge pages. + +.SH SEE ALSO +.I libhugetlbfs(7) + +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. diff --git a/man/gethugepagesizes.3 b/man/gethugepagesizes.3 new file mode 100644 index 0000000..a782094 --- /dev/null +++ b/man/gethugepagesizes.3 @@ -0,0 +1,66 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GETHUGEPAGESIZES 3 "October 10, 2008" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +gethugepagesizes - Get the system supported huge page sizes +.SH SYNOPSIS +.B #include + +.br +int gethugepagesizes(long pagesizes[], int n_elem); + +.SH DESCRIPTION + +The gethugepagesizes() function returns either the number of system supported +huge page sizes or the sizes themselves. If \fBpagesizes\fP is NULL and +\fBn_elem\fP is 0, then the number of huge pages the system supports is +returned. Otherwise, \fBpagesizes\fP is filled with at most \fBn_elem\fP +page sizes. + +.SH RETURN VALUE + +On success, either the number of huge page sizes supported by the system or +the number of huge page sizes stored in pagesizes is returned. On failure, +-1 is returned and errno is set appropriately. + +.SH ERRORS + +.TP +.B EINVAL +\fBn_elem\fP is less than zero or \fBn_elem\fP is greater than zero and +\fBpagesizes\fP is NULL. +.PP +Also see opendir(3) for other possible values for errno. This error occurs +when the sysfs directory exists but cannot be opened. + +.SH NOTES + +This call will return all huge page sizes as reported by the kernel. +Not all of these sizes may be usable by the programmer since mount points +may not be available for all sizes. To test whether a size will be usable +by \fBlibhugetlbfs\fP, hugetlbfs_find_path_for_size() can be called on a +specific size to see if a mount point is configured. + +.SH SEE ALSO +.I oprofile(1), +.I opendir(3), +.I hugetlbfs_find_path_for_size(3), +.I libhugetlbfs(7) + +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. 
diff --git a/man/getpagesizes.3 b/man/getpagesizes.3 new file mode 100644 index 0000000..cbd534a --- /dev/null +++ b/man/getpagesizes.3 @@ -0,0 +1,70 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GETPAGESIZES 3 "October 10, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +getpagesizes - Get the system supported huge page sizes +.SH SYNOPSIS +.B #include +.br + +.br +int getpagesizes(long pagesizes[], int n_elem); + +.SH DESCRIPTION + +The getpagesizes() function returns either the number of system supported +page sizes or the sizes themselves. If \fBpagesizes\fP is NULL and +\fBn_elem\fP is 0, then the number of pages the system supports is +returned. Otherwise, \fBpagesizes\fP is filled with at most \fBn_elem\fP +page sizes. + +.SH RETURN VALUE + +On success, either the number of page sizes supported by the system or the +number of page sizes stored in \fBpagesizes\fP is returned. On failure, +-1 is returned and errno is set appropriately. + +.SH ERRORS + +.TP +.B EINVAL +\fBn_elem\fP is less than zero or \fBn_elem\fP is greater than zero and +\fBpagesizes\fP is NULL. + +.PP + +Also see opendir(3) for other possible values for errno. This error occurs +when the sysfs directory exists but cannot be opened. + +.SH NOTES + +This call will return all page sizes as reported by the kernel. Not all of +these sizes may be usable by the programmer since mount points may not be +available for the huge page sizes. 
To test whether a size will be usable +by \fBlibhugetlbfs\fP, hugetlbfs_find_path_for_size() can be called on a +specific size to see if a mount point is configured. + +.SH SEE ALSO +.I oprofile(1), +.I opendir(3), +.I gethugepagesizes(3), +.I libhugetlbfs(7) + +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/man/hugeadm.8 b/man/hugeadm.8 new file mode 100644 index 0000000..28de91e --- /dev/null +++ b/man/hugeadm.8 @@ -0,0 +1,294 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGEADM 8 "October 1, 2009" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugeadm \- Configure the system huge page pools +.SH SYNOPSIS +.B hugeadm [options] +.SH DESCRIPTION + +\fBhugeadm\fP displays and configures the systems huge page pools. The size +of the pools is set as a minimum and maximum threshold. The minimum value +is allocated up front by the kernel and guaranteed to remain as hugepages +until the pool is shrunk. If a maximum is set, the system will dynamically +allocate pages if applications request more hugepages than the minimum size +of the pool. There is no guarantee that more pages than this minimum pool +size can be allocated. + +The following options create mounts hugetlbfs mount points. + +.TP +.B --create-mounts + +This creates mount points for each supported huge page size under +/var/lib/hugetlbfs. After creation they are mounts and are owned by +root:root with permissions set to 770. 
Each mount point is named +pagesize-. + +.TP +.B --create-user-mounts= + +This creates mount points for each supported huge page size under +/var/lib/hugetlbfs/user/. Mount point naming is the same as +--create-mounts. After creation they are mounted and are owned by +:root with permissions set to 700. + +.TP +.B --create-group-mounts= + +This creates mount points for each supported huge page size under +/var/lib/hugetlbfs/group/. Mount point naming is the same as +--create-mounts. After creation they are mounted and are owned by +root: with permissions set to 070. + +.TP +.B --create-global-mounts + +This creates mount points for each supported huge page size under +/var/lib/hugetlbfs/global. Mount point naming is the same as +--create-mounts. After creation they are mounted and are owned by +root:root with permissions set to 1777. + +The following options affect how mount points are created. + +.TP +.B --max-size + +This option is used in conjunction with --create-*-mounts. It limits the +maximum amount of memory used by files within the mount point rounded up +to the nearest huge page size. This can be used for example to grant +different huge page quotas to individual users or groups. + +.TP +.B --max-inodes + +This option is used in conjunction with --create-*-mounts. It limits the +number of inodes (e.g. files) that can be created on the new mount points. +This limits the number of mappings that can be created on a mount point. It +could be used for example to limit the number of application instances that +used a mount point as long as it was known how many inodes each application +instance required. + +The following options display information about the pools. + +.TP +.B --pool-list + +This displays the Minimum, Current and Maximum number of huge pages in the pool +for each pagesize supported by the system. 
The "Minimum" value is the size of +the static pool and there will always be at least this number of hugepages in +use by the system, either by applications or kept by the kernel in a reserved +pool. The "Current" value is the number of hugepages currently in use, either +by applications or stored on the kernels free list. The "Maximum" value is the +largest number of hugepages that can be in use at any given time. + +.TP +.B --set-recommended-min_free_kbytes + +Fragmentation avoidance in the kernel depends on avoiding pages of different +mobility types being mixed with a pageblock arena - typically the size of +the default huge page size. The more mixing that occurs, the less likely +the huge page pool will be able to dynamically resize. The easiest means of +avoiding mixing is to increase /proc/sys/vm/min_free_kbytes. This parameter +sets min_free_kbytes to a recommended value to aid fragmentation avoidance. + +.TP +.B --set-recommended-shmmax + +The maximum shared memory segment size should be set to at least the size +of the largest shared memory segment size you want available for applications +using huge pages, via /proc/sys/kernel/shmmax. Optionally, it can be set +automatically to match the maximum possible size of all huge page allocations +and thus the maximum possible shared memory segment size, using this switch. + +.TP +.B --set-shm-group= + +Users in the group specified in /proc/sys/vm/hugetlb_shm_group are granted +full access to huge pages. The sysctl takes a numeric gid, but this hugeadm +option can set it for you, using either a gid or group name. + +.TP +.B --page-sizes + +This displays every page size supported by the system and has a pool +configured. + +.TP +.B --page-sizes-all + +This displays all page sizes supported by the system, even if no pool is +available. + +.TP +.B --list-all-mounts + +This displays all active mount points for hugetlbfs. + +.PP +The following options configure the pool. 
+ +.TP +.B --pool-pages-min=:[+|-]> + +This option sets or adjusts the Minimum number of hugepages in the pool for +pagesize \fBsize\fP. \fBsize\fP may be specified in bytes or in kilobytes, +megabytes, or gigabytes by appending K, M, or G respectively, or as DEFAULT, +which uses the system's default huge page size for \fBsize\fP. The pool size +adjustment can be specified by \fBpagecount\fP pages or by \fBmemsize\fP, if +postfixed with G, M, or K, for gigabytes, megabytes, or kilobytes, +respectively. If the adjustment is specified via \fBmemsize\fP, then the +\fBpagecount\fP will be calculated for you, based on page size \fBsize\fP. +The pool is set to \fBpagecount\fP pages if + or - are not specified. If ++ or - are specified, then the size of the pool will adjust by that amount. +Note that there is no guarantee that the system can allocate the hugepages +requested for the Minimum pool. The size of the pools should be checked after +executing this command to ensure they were successful. + +.TP +.B --obey-numa-mempol + +This option requests that allocation of huge pages to the static pool with +\fB--pool-pages-min\fP obey the NUMA memory policy of the current process. This +policy can be explicitly specified using numactl or inherited from a parent +process. + +.TP +.B --pool-pages-max=:[+|-]> + +This option sets or adjusts the Maximum number of hugepages. Note that while +the Minimum number of pages are guaranteed to be available to applications, +there is not guarantee that the system can allocate the pages on demand when +the number of huge pages requested by applications is between the Minimum and +Maximum pool sizes. See --pool-pages-min for usage syntax. + +.TP +.B --enable-zone-movable + +This option enables the use of the MOVABLE zone for the allocation of +hugepages. 
This zone is created when kernelcore= or movablecore= are specified +on the kernel command line but the zone is not used for the allocation of +huge pages by default as the intended use for the zone may be to guarantee +that memory can be off-lined and hot-removed. The kernel guarantees that +the pages within this zone can be reclaimed unlike some kernel buffers +for example. Unless pages are locked with mlock(), the hugepage pool can +grow to at least the size of the movable zone once this option is set. Use +sysctl to permanently enable the use of the MOVABLE zone for the allocation +of huge pages. + +.TP +.B --disable-zone-movable + +This option disables the use of the MOVABLE zone for the future allocation of +huge pages. Note that existing huge pages are not reclaimed from the zone. +Use sysctl to permanently disable the use of the MOVABLE zone for the +allocation of huge pages. + +.TP +.B --hard + + +This option is specified with --pool-pages-min to retry allocations multiple +times on failure to allocate the desired count of pages. It initially tries +to resize the pool up to 5 times and continues to try if progress is being +made towards the resize. + +.TP +.B --add-temp-swap<=count> + +This options is specified with --pool-pages-min to initialize a temporary +swap file for the duration of the pool resize. When increasing the size of +the pool, it can be necessary to reclaim pages so that contiguous memory is +freed and this often requires swap to be successful. Swap is only created for +a positive resize, and is then removed once the resize operation is completed. +The default swap size is 5 huge pages, the optional argument sets +the swap size to huge pages. + +.TP +.B --add-ramdisk-swap + +This option is specified with --pool-pages-min to initialize swap in memory +on ram disks. When increasing the size of the pool, it can be necessary to +reclaim pages so that contiguous memory is freed and this often requires swap +to be successful. 
If there isn't enough free disk space, swap can be +initialized in RAM using this option. If the size of one ramdisk is not +greater than the huge page size, then swap is initialized on multiple ramdisks. +Swap is only created for a positive resize, and by default is removed once +the resize operation is completed. + +.TP +.B --persist + +This option is specified with the --add-temp-swap or --add-ramdisk-swap to +make the swap space persist after the resize operation is completed. The swap +spaces can later be removed manually using the swapoff command. + +.PP +The following options tune the transparent huge page usage + +.TP +.B --thp-always + +Enable transparent huge pages always + +.TP +.B --thp-madvise + +Enable transparent huge pages only on madvised regions + +.TP +.B --thp-never + +Disable transparent huge pages + +.TP +.B --thp-khugepaged-pages + +Configure the number of pages that khugepaged should scan on each pass + +.TP +.B --thp-khugepaged-scan-sleep + +Configure how many milliseconds khugepaged should wait between passes + +.TP +.B --thp-khugepages-alloc-sleep + +Configure how many milliseconds khugepaged should wait after failing to +allocate a huge page to throttle the next attempt. + +.PP +The following options affect the verbosity of libhugetlbfs. + +.TP +.B --verbose , -v + +The default value for the verbosity level is 1 and the range of the value can +be set with --verbose from 0 to 99. The higher the value, the more verbose the +library will be. 0 is quiet and 3 will output much debugging information. The +verbosity level is increased by one each time -v is specified. + +.SH SEE ALSO +.I oprofile(1), +.I pagesize(1), +.I libhugetlbfs(7), +.I hugectl(8), +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. 
+ diff --git a/man/hugectl.8 b/man/hugectl.8 new file mode 100644 index 0000000..6ee70f2 --- /dev/null +++ b/man/hugectl.8 @@ -0,0 +1,141 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGECTL 8 "October 10, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugectl \- Control policy for backing text, data and malloc() with hugepages +.SH SYNOPSIS +.B hugectl [options] command {arguments} +.SH DESCRIPTION + +\fBhugectl\fP runs processes with a specific policy for backing memory regions +with hugepages. The use of hugepages benefit applications that use large +amounts of address space and suffer a performance hit due to TLB misses. +Policy is enforced by \fBlibhugetlbfs\fP and \fBhugectl\fP configures the +environment based on the options provided. Wall-clock time or oprofile can +be used to determine if there is a performance benefit from using hugepages +or not. + +To effectively back text/data, the target process must be relinked to align +the ELF segments on a hugepage boundary. The library also supports more options +for the control of memory regions than are exposed by the \fBhugectl\fP +utility. See the \fBlibhugetlbfs\fP manual page for more details. + +The following options affect what memory regions are backed by hugepages. + +.TP +.B --text[=],--data[=],--bss[=] +Back the text, data or BSS segments with hugepages, optionally with pages +of the specified size. 
To be effective, the process must be relinked +as described in the HOWTO to align the ELF segments. It is possible to +partially back segments using the HUGETLB_FORCE_ELMAP environment variable +as described in the \fBlibhugetlbfs\fP manual page. + +.TP +.B --heap[=] +Use the glibc morecore hook to back malloc() with hugepages, optionally +with pages of the specified size. Note that this does not affect brk() +segments and applications that use custom allocators potentially do not +use hugepages for their heap even with this option specified. + +.TP +.B --shm +This option overrides shmget() to back shared memory regions with hugepages +if possible. Segment size requests will be aligned to fit to the default +hugepage size region. + +.TP +.B --share-text +Request that multiple application instances share text segments that are +backed with huge pages. This option sets the environment variable +HUGETLB_SHARE to 1. + +.TP +.B --thp +Align heap regions to huge page size for promotion by khugepaged. For more +information on transparent huge pages see linux-2.6/Documentation/transhuge.txt + +.PP +The following options affect how \fBhugectl\fP behaves. + +.TP +.B --no-preload +Disable any pre-loading of the \fBlibhugetlbfs\fP library. This may be necessary +if only the heap is being backed by hugepages and the application is already +linked against the library. \fBhugectl\fP may pre-load the library by mistake +and this option prevents that. + +.TP +.B --force-preload +Force pre-loading of the \fBlibhugetlbfs\fP library. This option is used when +the segments of the binary are aligned to the hugepage boundary of interest +but the binary is not linked against libhugetlbfs. This is useful on PPC64 +where binaries are aligned to 64K as required by the ABI and the kernel is +using a 4K base pagesize. + +.TP +.B --no-reserve +By default, huge pages are reserved at mmap() time so future faults will +succeed. 
This avoids unexpected application but some applications depend +on memory overcommit to create large sparse mappings. For this type of +application, this switch will create huge page backed mappings without a +reservation if the kernel is recent enough to make this operation safe. +Use this option with extreme care as in the event huge pages are not +available when the mapping is faulted, the application will be killed. + +.TP +.B --dry-run +Instead of running the process, the \fBhugectl\fP utility will describe what +environment variables it set for \fBlibhugetlbfs\fP. This is useful if +additional environment variables are to be set and a launcher shell script is +being developed. + +.TP +.B --library-use-path +By default, \fBhugectl\fP will use the version of \fBlibhugetlbfs\fP it was +installed with, even if this is not in the LD_LIBRARY_PATH environment. Using +this option forces \fBhugectl\fP to use the version of \fBlibhugetlbfs\fP +installed in the library system path. + +.TP +.B --library-path +This option forces \fBhugectl\fP to use the \fBlibhugetlbfs\fP libraries within +the given prefix. + +.PP +The following options affect the verbosity of libhugetlbfs. + +.TP +.B --verbose , -v +The default value for the verbosity level is 1 and the range of the value can +be set with --verbose from 0 to 99. The higher the value, the more verbose the +library will be. 0 is quiet and 3 will output much debugging information. The +verbosity level is increased by one each time -v is specified. + +.TP +.B -q +The -q option will drecease the verbosity level by 1 each time it is specified +to a minimum of 0. + +.SH SEE ALSO +.I oprofile(1), +.I hugeadm(7), +.I libhugetlbfs(7) +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. 
+ diff --git a/man/hugeedit.8 b/man/hugeedit.8 new file mode 100644 index 0000000..4bcfdfc --- /dev/null +++ b/man/hugeedit.8 @@ -0,0 +1,57 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGEEDIT 7 "October 8, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugeedit \- Set default policy for backing text and data with hugepages +.SH SYNOPSIS +.B hugeedit binary +.br +.B hugeedit [options] binary +.SH DESCRIPTION + +\fBhugectl\fP runs processes with a specific policy for backing memory +regions with hugepages. Ordinarily when processes are relinked with +\fBlibhugetlbfs\fP using the recommended linking method, either \fBhugectl\fP +is required on each execution or environment variables must be set for +each execution. \fBhugeedit\fP can be used to set bits in the ELF header +that determine if the text or data segments are backed by default without +further intervention. + +If no arguments are specified, \fBhugeedit\fP will display what the current +defaults for each segment in a binary are. The available options are + +.TP +.B --text +Back the text segments of the binary by default. + +.TP +.B --data +Back the data segments of the binary by default + +.TP +.B --disable +Back all segments using small pages by default + +.SH SEE ALSO +.I oprofile(1), +.I libhugetlbfs(7), +.I hugectl(8) +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. 
+ diff --git a/man/hugetlbfs_find_path.3 b/man/hugetlbfs_find_path.3 new file mode 100644 index 0000000..b2d1171 --- /dev/null +++ b/man/hugetlbfs_find_path.3 @@ -0,0 +1,52 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" Copyright 2012 Cray Inc. +.\" All rights reserved. +.\" Licensed under LGPL 2.1 by Cray Inc. +.\" +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGETLBFS_FIND_PATH 3 "March 7, 2012" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugetlbfs_find_path, hugetlbfs_find_path_for_size - Locate an appropriate hugetlbfs mount point +.SH SYNOPSIS +.B #include + +.br +const char *hugetlbfs_find_path(void); +.br +const char *hugetlbfs_find_path_for_size(long page_size); + +.SH DESCRIPTION + +These functions return a pathname for a mounted hugetlbfs filesystem for +the appropriate huge page size. For hugetlbfs_find_path, the default +huge page size is used (see gethugepagesize(3)). For +hugetlbfs_find_path_for_size, a valid huge page size must be specified +(see gethugepagesizes(3)). + +.SH RETURN VALUE + +On success, a non-NULL value is returned. +On failure, NULL is returned. + +.SH SEE ALSO +.I libhugetlbfs(7), +.I gethugepagesize(3), +.I gethugepagesizes(3) + +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. diff --git a/man/hugetlbfs_test_path.3 b/man/hugetlbfs_test_path.3 new file mode 100644 index 0000000..67295ea --- /dev/null +++ b/man/hugetlbfs_test_path.3 @@ -0,0 +1,53 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" Copyright 2012 Cray Inc. 
+.\" All rights reserved. +.\" Licensed under LGPL 2.1 by Cray Inc. +.\" +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGETLBFS_TEST_PATH 3 "March 7, 2012" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugetlbfs_test_path - Determine whether a mount point is hugetlbfs +.SH SYNOPSIS +.B #include + +.br +int hugetlbfs_test_path(const char *mount); + +.SH DESCRIPTION + +The hugetlbfs_test_path() function determines whether a given pathname +is in a hugetlbfs filesystem. + +.SH RETURN VALUE + +On success, 1 is returned for a hugetlbfs filesystem, or 0 for a +non-hugetlbfs filesystem. On failure, -1 is returned and errno is set +appropriately. + +.SH ERRORS + +.PP +Please see statfs(3) for possible values for errno. + + +.SH SEE ALSO +.I statfs(3), +.I libhugetlbfs(7) + +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. diff --git a/man/hugetlbfs_unlinked_fd.3 b/man/hugetlbfs_unlinked_fd.3 new file mode 100644 index 0000000..ffeb716 --- /dev/null +++ b/man/hugetlbfs_unlinked_fd.3 @@ -0,0 +1,55 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" Copyright 2012 Cray Inc. +.\" All rights reserved. +.\" Licensed under LGPL 2.1 by Cray Inc. +.\" +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGETLBFS_UNLINKED_FD 3 "March 7, 2012" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugetlbfs_unlinked_fd, hugetlbfs_unlinked_fd_for_size - Obtain a file descriptor for a new unlinked file in hugetlbfs +.SH SYNOPSIS +.B #include + +.br +.B int hugetlbfs_unlinked_fd(void); +.br +.B int hugetlbfs_unlinked_fd_for_size(long page_size); + +.SH DESCRIPTION + +These functions return an open file descriptor for a unique, newly-created +file in a hugetlbfs filesystem. To avoid leaking hugepages, the file +is unlinked automatically before the function returns. + +For hugetlbfs_unlinked_fd, the default huge page size is used (see +gethugepagesize(3)). For hugetlbfs_unlinked_fd_for_size, a valid huge +page size must be specified (see gethugepagesizes(3)). + +.SH RETURN VALUE + +On success, a valid open file descriptor is returned. On failure, +-1 is returned and errno may be set appropriately. + +.SH SEE ALSO +.I gethugepagesize(3), +.I gethugepagesizes(3), +.I mkstemp(3), +.I libhugetlbfs(7) + +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. diff --git a/man/ld.hugetlbfs.1 b/man/ld.hugetlbfs.1 new file mode 100644 index 0000000..60487e2 --- /dev/null +++ b/man/ld.hugetlbfs.1 @@ -0,0 +1,72 @@ +\" Hey, EMACS: -*- nroff -*- +.\" Copyright 2012 Cray Inc. +.\" All rights reserved. +.\" Licensed under LGPL 2.1 by Cray Inc. +.\" +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH LD.HUGETLBFS 1 "March 12, 2012" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +ld.hugetlbfs \- link a program for huge pages +.SH SYNOPSIS +.B ld.hugetlbfs [options] +.SH DESCRIPTION + +\fBld.hugetlbfs\fP replaces the normal \fBld\fP command for linking programs +to use hugepages. Under gcc, you should use the option +\fB-B /usr/share/libhugetlbfs\fP which tells gcc to look in a non-standard +location for the linker. This could be set in the \fBCFLAGS\fP environment +variable. + +.TP +.B -Wl,--hugetlbfs-align + +This method of linking an application permits greater flexibility at runtime. +Using HUGETLB_ELFMAP, it is possible to control which program segments are +placed in hugepages. The following four settings will cause the indicated +segments to be placed in hugepages: + + HUGETLB_ELFMAP=R Read-only segments (text) + HUGETLB_ELFMAP=W Writable segments (data/BSS) + HUGETLB_ELFMAP=RW All segments (text/data/BSS) + HUGETLB_ELFMAP=no No segments + +It is possible to select specific huge page sizes for read-only and writable +segments by using the following advanced syntax: + + HUGETLB_ELFMAP=[R[=]:[W[=]] + +.B -Wl,--hugetlbfs-link=B + +Under binutils 2.16 or older, this option will link the application to store +BSS data (only) into hugepages. + +.B -Wl,--hugetlbfs-link=BDT + +Under binutils 2.16 or older, this option will link the application to store +text, initialized data and BSS data into hugepages. + +.SH FILES +[DESTDIR|/usr/share]/doc/libhugetlbfs/HOWTO + +.SH SEE ALSO +.I libhugetlbfs(7), +.I hugectl(8), +.I hugeedit(8) +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. 
diff --git a/man/libhugetlbfs.7 b/man/libhugetlbfs.7 new file mode 100644 index 0000000..7131ca9 --- /dev/null +++ b/man/libhugetlbfs.7 @@ -0,0 +1,230 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH LIBHUGETLBFS 7 "September 27, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +libhugetlbfs \- preload library to back text, data, malloc() or shared memory with hugepages +.SH SYNOPSIS +.B export [environment options] +.br +.B [LD_PRELOAD=libhugetlbfs.so] target_application +.SH DESCRIPTION + +\fBlibhugetlbfs\fP is a library that can back application text, data, malloc() +and shared memory with hugepages. This is of benefit to applications that +use large amounts of address space and suffer a performance hit due to TLB +misses. Wall-clock time or oprofile can be used to determine if there is +a performance benefit from using \fBlibhugetlbfs\fP or not. In all cases +but shared memory, a hugetlbfs mount must exist and a hugepage pool defined +for hugepages to be used. + +Some limited functionality is available for unmodified dynamically linked +applications. By preloading the library, the library can back malloc() +and shared memory, and text and data segments can be partially backed if +they are large enough. + +For the effective backing of text and data with huge pages, the application +must be linked to the library and the ELF segments correctly aligned using +the ld helpers. Once linked, malloc or shared memory can still be backed +but no pre-loading is required. 
See /usr/share/doc/libhugetlbfs/HOWTO and
+ld.hugetlbfs(1) for detailed instructions on relinking applications.
+
+For applications that are hugepage-aware and linked to the library
+\fBget_huge_pages()\fP can be used for the direct allocation of
+hugepage-backed regions.
+
+Unless otherwise specified, \fBlibhugetlbfs\fP will use the default hugepage
+size to back memory regions. The default size is the value of Hugepagesize
+displayed in /proc/meminfo. The size can be specified in bytes or in
+kilobytes, megabytes, or gigabytes by appending K, M, or G respectively. It
+is an error to specify an invalid, unsupported, or otherwise unconfigured
+huge page size. Kernel 2.6.27 or later is required to specify any pagesize
+other than the default.
+
+See /usr/share/doc/libhugetlbfs/HOWTO for detailed instructions on how
+the library should be used, particularly when relinking the application.
+This manual page provides a brief synopsis of the environment variables
+as a quick reference.
+
+The following variables affect what memory regions are backed by hugepages. In
+all cases, the environment being unset implies the feature should remain
+disabled.
+
+.TP
+.B HUGETLB_DEFAULT_PAGE_SIZE=<pagesize>
+This sets the default hugepage size to be used by libhugetlbfs. If not
+set, libhugetlbfs will use the kernel's default hugepage size.
+
+.TP
+.B HUGETLB_MORECORE=[yes|<pagesize>]
+This enables the hugepage malloc() feature, instructing libhugetlbfs to
+override glibc's normal morecore() function with a hugepage version and use
+it for malloc(). All application malloc() memory should come from hugepage
+memory until it runs out; it will then fall back to base pages. Note that
+applications that use custom allocators may not be able to back their heaps
+using hugepages and this environment variable. It may be necessary to modify
+the custom allocator to use \fBget_huge_pages()\fP.
+ +.TP +.B HUGETLB_SHM=yes +When this environment variable is set, the SHM_HUGETLB flag is added to +the shmget() call and the size parameter is aligned to back the shared +memory segment with hugepages. In the event hugepages cannot be used, base +pages will be used instead and a warning will be printed to explain the +failure. The pagesize cannot be specified with this parameter. To change +the kernels default hugepage size, use the pagesize= kernel boot parameter +(2.6.26 or later required). + +.TP +.B HUGETLB_ELFMAP=[no|[R[<=pagesize>]:[W[<=pagesize>]]] +If the application has been relinked (see the HOWTO for instructions), +this environment variable determines whether read-only, read-write, both +or no segments are backed by hugepages and what pagesize should be used. If +the recommended relinking method has been used, then \fBhugeedit\fP can be +used to automatically back the text or data by default. + +.TP +.B HUGETLB_FORCE_ELFMAP=yes +Force the use of hugepages for text and data segments even if the application +has not been relinked to align the ELF segments on a hugepage boundary. +Partial segment remapping is not guaranteed to work and the segments must be +large enough to contain at least one hugepage for the remapping to occur. + +.PP +The following options affect how libhugetlbfs behaves. + +.TP +.B HUGETLB_RESTRICT_EXE=e1:e2:...:eN +By default, libhugetlbfs will act on any program that it +is loaded with, either via LD_PRELOAD or by explicitly +linking with -lhugetlbfs. + +There are situations in which it is desirable to restrict +libhugetlbfs' actions to specific programs. For example, +some ISV applications are wrapped in a series of scripts +that invoke bash, python, and/or perl. It is more +convenient to set the environment variables related +to libhugetlbfs before invoking the wrapper scripts, +yet this has the unintended and undesirable consequence +of causing the script interpreters to use and consume +hugepages. 
There is no obvious benefit to causing the +script interpreters to use hugepages, and there is a +clear disadvantage: fewer hugepages are available to +the actual application. + +To address this scenario, set HUGETLB_RESTRICT_EXE to a +colon-separated list of programs to which the other +libhugetlbfs environment variables should apply. (If +not set, libhugetlbfs will attempt to apply the requested +actions to all programs.) For example, + + HUGETLB_RESTRICT_EXE=hpcc:long_hpcc + +will restrict libhugetlbfs' actions to programs named +/home/fred/hpcc and /bench/long_hpcc but not /bin/hpcc_no. + + +.TP +.B HUGETLB_MORECORE_SHRINK=yes +By default, the hugepage heap does not shrink. Shrinking is enabled by +setting this environment variable. It is disabled by default as glibc +occasionally exhibits strange behaviour if it mistakes the heap returned +by \fBlibhugetlbfs\fP as a foreign brk(). + +.TP +.B HUGETLB_NO_PREFAULT +By default \fBlibhugetlbfs\fP will prefault regions it creates to ensure they +can be referenced without receiving a SIGKILL. On kernels older than 2.6.27, +this was necessary as the system did not guarantee that future faults would +succeed on regions mapped MAP_PRIVATE. Prefaulting impacts the performance +of malloc() and can result in poor placement on NUMA systems. If it is known +the hugepage pool is large enough to run the application or the kernel is +2.6.27 or later, this environment variable should be set. + +.TP +.B HUGETLB_NO_RESERVE=yes + +By default, the kernel will reserve huge pages at mmap() time to ensure that +future faults will succeed. This avoids unexpected application failure at +fault time but some applications depend on memory overcommit to create +large sparse mappings. For this type of application, setting this environment +variable will create huge page backed mappings without a reservation. 
Use
+this option with extreme care as in the event huge pages are not available
+when the mapping is used, the application will be killed. On older kernels,
+the use of this feature can trigger the OOM killer. Hence, even with this
+variable set, reservations may still be used for safety.
+
+.TP
+.B HUGETLB_MORECORE_HEAPBASE=address
+\fBlibhugetlbfs\fP normally picks an address to use as the base of the heap for
+malloc() automatically. This environment variable fixes which address is used.
+
+.TP
+.B HUGETLB_PATH=<path>
+The path to the hugetlbfs mount is automatically determined at run-time. In the
+event there are multiple mounts and the wrong one is being selected, use this
+option to select the correct one. This may be the case if an
+application-specific mount with a fixed quota has been created for example.
+
+.TP
+.B HUGETLB_SHARE=1
+By default, \fBlibhugetlbfs\fP uses unlinked hugetlbfs files to store remapped
+program segment data. If the same program is started multiple times using
+hugepage segments, multiple hugepages will be used to store the same program
+data. To reduce this wastage, setting this environment variable will share
+read-only segments between multiple invocations of a program at the cost of
+the memory being used whether the applications are running or not. It is
+also possible that a malicious application could interfere with other applications'
+executable code. See the HOWTO for more detailed information on this topic.
+
+.PP
+The following options control the verbosity of \fBlibhugetlbfs\fP.
+
+.TP
+.B HUGETLB_VERBOSE=<level>
+The default value for this is 1 and the range of the value is from 0 to
+99. The higher the value, the more verbose the output is. 0 is quiet and
+3 will output much debugging information.
+
+.TP
+.B HUGETLB_DEBUG
+Once set, this will give very detailed output on what is happening in the
+library and run extra diagnostics.
+ +.SH FILES +[DESTDIR|/usr/share]/doc/libhugetlbfs/HOWTO + +.SH SEE ALSO +.I oprofile(1), +.I ld.hugetlbfs(1), +.I hugectl(8), +.I hugeedit(8), +.I gethugepagesize(3), +.I gethugepagesizes(3), +.I getpagesizes(3), +.I hugetlbfs_test_path(3), +.I hugetlbfs_find_path(3), +.I hugetlbfs_find_path_for_size(3), +.I hugetlbfs_test_path(3), +.I hugetlbfs_test_path_for_size(3), +.I hugetlbfs_unlinked_fd(3), +.I hugetlbfs_unlinked_fd_for_size(3), +.I get_huge_pages(3), +.I free_huge_pages(3) +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. diff --git a/man/pagesize.1 b/man/pagesize.1 new file mode 100644 index 0000000..7e6efce --- /dev/null +++ b/man/pagesize.1 @@ -0,0 +1,57 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH PAGESIZE 1 "October 10, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +pagesize \- Print supported system page sizes +.SH SYNOPSIS +.B pagesize [options] +.SH DESCRIPTION + +The pagesize utility prints the page sizes of a page of memory in bytes, +as returned by getpagesizes(3). This is useful when creating portable shell +scripts, configuring huge page pools with hugeadm or launching applications +to use huge pages with hugectl. + +If no parameters are specified, \fBpagesize\fP prints the system base page +size as returned by \fBgetpagesize()\fP. The following parameters affect +what other pagesizes are displayed. 
+ +.TP +.B --huge-only, -H + +Display all huge pages supported by the system as returned by +\fBgethugepagesizes()\fP. + +.TP +.B --all, -a + +Display all page sizes supported by the system. + +.SH SEE ALSO +.I oprofile(1), +.I getpagesize(2), +.I getpagesizes(3), +.I gethugepagesizes(3), +.I hugectl(7), +.I hugeadm(7), +.I libhugetlbfs(7) + +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/mktarball b/mktarball new file mode 100755 index 0000000..8855204 --- /dev/null +++ b/mktarball @@ -0,0 +1,32 @@ +#!/bin/sh +# +# +P='mktarball' + +if [ "$#" -ne 1 ]; then + echo 1>&2 "Usage: $P " + exit 1 +fi +commit="$1" + +if [ ! .git ]; then + echo 1>&2 "$P: not in the source tree" + exit 1 +fi + +# Get the official name for this commit. +name=`git describe --tags "$commit"` + +# Build a version file to add to the tarball, we know it is not +# modified as we just took a commit which is unmodified by definition. +tmp="/tmp/tarball-$$" +mkdir -p "$tmp/libhugetlbfs-$name" +echo "$name" >"$tmp/libhugetlbfs-$name/version" + +git archive --format=tar --prefix="libhugetlbfs-$name/" "$commit" \ + >"libhugetlbfs-$name.tar" +tar -C "$tmp" -rf "libhugetlbfs-$name.tar" "libhugetlbfs-$name/version" +gzip -9 "libhugetlbfs-$name.tar" + +# Clean up the version. +[ -d "$tmp/libhugetlbfs-$name" ] && rm -rf "$tmp" diff --git a/morecore.c b/morecore.c new file mode 100644 index 0000000..6563bbd --- /dev/null +++ b/morecore.c @@ -0,0 +1,383 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hugetlbfs.h" + +#include "libhugetlbfs_internal.h" + +static int heap_fd; + +static void *heapbase; +static void *heaptop; +static long mapsize; +static long hpage_size; + +static long hugetlbfs_next_addr(long addr) +{ +#if defined(__powerpc64__) + return ALIGN(addr, 1L << SLICE_HIGH_SHIFT); +#elif defined(__powerpc__) && !defined(PPC_NO_SEGMENTS) + return ALIGN(addr, 1L << SLICE_LOW_SHIFT); +#elif defined(__ia64__) + if (addr < (1UL << SLICE_HIGH_SHIFT)) + return ALIGN(addr, 1UL << SLICE_HIGH_SHIFT); + else + return ALIGN(addr, hpage_size); +#else + return ALIGN(addr, hpage_size); +#endif +} + +/* + * Our plan is to ask for pages 'roughly' at the BASE. We expect and + * require the kernel to offer us sequential pages from wherever it + * first gave us a page. If it does not do so, we return the page and + * pretend there are none this covers us for the case where another + * map is in the way. This is required because 'morecore' must have + * 'sbrk' semantics, ie. return sequential, contigious memory blocks. + * Luckily, if it does not do so and we error out malloc will happily + * go back to small pages and use mmap to get them. Hurrah. + */ +static void *hugetlbfs_morecore(ptrdiff_t increment) +{ + int ret; + void *p; + long delta; + int mmap_reserve = __hugetlb_opts.no_reserve ? 
MAP_NORESERVE : 0; + int mmap_hugetlb = 0; + int using_default_pagesize = + (hpage_size == kernel_default_hugepage_size()); + + INFO("hugetlbfs_morecore(%ld) = ...\n", (long)increment); + + /* + * how much to grow the heap by = + * (size of heap) + malloc request - mmap'd space + */ + delta = (heaptop-heapbase) + increment - mapsize; + + INFO("heapbase = %p, heaptop = %p, mapsize = %lx, delta=%ld\n", + heapbase, heaptop, mapsize, delta); + + /* align to multiple of hugepagesize. */ + delta = ALIGN(delta, hpage_size); + +#ifdef MAP_HUGETLB + mmap_hugetlb = MAP_HUGETLB; +#endif + + if (delta > 0) { + /* growing the heap */ + + INFO("Attempting to map %ld bytes\n", delta); + + /* map in (extend) more of the file at the end of our last map */ + if (__hugetlb_opts.map_hugetlb && using_default_pagesize) + p = mmap(heapbase + mapsize, delta, PROT_READ|PROT_WRITE, + mmap_hugetlb|MAP_ANONYMOUS|MAP_PRIVATE|mmap_reserve, + heap_fd, mapsize); + else + p = mmap(heapbase + mapsize, delta, PROT_READ|PROT_WRITE, + MAP_PRIVATE|mmap_reserve, heap_fd, mapsize); + + if (p == MAP_FAILED) { + WARNING("New heap segment map at %p failed: %s\n", + heapbase+mapsize, strerror(errno)); + return NULL; + } + + /* if this is the first map */ + if (! 
mapsize) { + if (heapbase && (heapbase != p)) { + WARNING("Heap originates at %p instead of %p\n", + p, heapbase); + if (__hugetlbfs_debug) + dump_proc_pid_maps(); + } + /* then setup the heap variables */ + heapbase = heaptop = p; + } else if (p != (heapbase + mapsize)) { + /* Couldn't get the mapping where we wanted */ + munmap(p, delta); + WARNING("New heap segment mapped at %p instead of %p\n", + p, heapbase + mapsize); + if (__hugetlbfs_debug) + dump_proc_pid_maps(); + return NULL; + } + + /* Fault the region to ensure accesses succeed */ + if (hugetlbfs_prefault(p, delta) != 0) { + munmap(p, delta); + return NULL; + } + + /* we now have mmap'd further */ + mapsize += delta; + } else if (delta < 0) { + /* shrinking the heap */ + + if (!__hugetlb_opts.shrink_ok) { + /* shouldn't ever get here */ + WARNING("Heap shrinking is turned off\n"); + return NULL; + } + + if (!mapsize) { + WARNING("Can't shrink empty heap!\n"); + return NULL; + } + + /* + * If we are forced to change the heapaddr from the + * original brk() value we have violated brk semantics + * (which we are not supposed to do). This shouldn't + * pose a problem until glibc tries to trim the heap to an + * address lower than what we aligned heapaddr to. At that + * point the alignment "gap" causes heap corruption. + * So we don't allow the heap to shrink below heapbase. + */ + if (mapsize + delta < 0) { /* remember: delta is negative */ + WARNING("Unable to shrink heap below %p\n", heapbase); + /* unmap just what is currently mapped */ + delta = -mapsize; + /* we need heaptop + increment == heapbase, so: */ + increment = heapbase - heaptop; + } + INFO("Attempting to unmap %ld bytes @ %p\n", -delta, + heapbase + mapsize + delta); + ret = munmap(heapbase + mapsize + delta, -delta); + if (ret) { + WARNING("Unmapping failed while shrinking heap: " + "%s\n", strerror(errno)); + } else { + mapsize += delta; + /* + * the glibc assumes by default that newly allocated + * memory by morecore() will be zeroed. 
It would be + * wasteful to do it for allocation so we only shrink + * the top by the size of a page. + */ + increment = heapbase - heaptop + mapsize; + + if (!__hugetlb_opts.map_hugetlb && !using_default_pagesize){ + + /* + * Now shrink the hugetlbfs file. + */ + ret = ftruncate(heap_fd, mapsize); + if (ret) { + WARNING("Could not truncate hugetlbfs file to " + "shrink heap: %s\n", strerror(errno)); + } + } + } + + } + else if (increment < 0) { + /* Don't shrink by less than a page to avoid having to zero + * the memory. There is no point in lying to glibc since + * we're not freeing any memory. + */ + increment = 0; + } + + /* heap is continuous */ + p = heaptop; + /* and we now have added this much more space to the heap */ + heaptop = heaptop + increment; + + INFO("... = %p\n", p); + return p; +} + +static void *thp_morecore(ptrdiff_t increment) +{ + void *p; + long delta; + + INFO("thp_morecore(%ld) = ...\n", (long)increment); + + delta = (heaptop - heapbase) + increment - mapsize; + delta = ALIGN(delta, hpage_size); + + if (delta > 0) { + /* + * This first time we expand the mapping we need to account for + * the initial heap mapping not necessarily being huge page + * aligned + */ + if (!mapsize) + delta = hugetlbfs_next_addr((long)heapbase + delta) - + (unsigned long)heapbase; + + INFO("Adding %ld bytes to heap\n", delta); + + p = sbrk(delta); + if (p == (void *)-1) { + WARNING("sbrk returned ENOMEM\n"); + return NULL; + } + + if (!mapsize) { + if (heapbase && (heapbase != p)) { + WARNING("Heap was expected at %p instead of %p, " + "heap has been modified by someone else!\n", + heapbase, p); + if (__hugetlbfs_debug) + dump_proc_pid_maps(); + } + heapbase = heaptop = p; + } + + mapsize += delta; +#ifdef MADV_HUGEPAGE + madvise(p, delta, MADV_HUGEPAGE); +#endif + } else if (delta < 0) { + /* shrinking the heap */ + if (!mapsize) { + WARNING("Can't shrink an empty heap\n"); + return NULL; + } + + INFO("Attempting to shrink heap by %ld bytes with sbrk\n", + 
-delta); + p = sbrk(delta); + if (p == (void *)-1) { + WARNING("Unable to shrink heap\n"); + return heaptop; + } + + mapsize += delta; + } + + p = heaptop; + heaptop += increment; + INFO("... = %p\n", p); + return p; +} + +void hugetlbfs_setup_morecore(void) +{ + char *ep; + unsigned long heapaddr; + + if (! __hugetlb_opts.morecore) + return; + if (strcasecmp(__hugetlb_opts.morecore, "no") == 0) { + INFO("HUGETLB_MORECORE=%s, not setting up morecore\n", + __hugetlb_opts.morecore); + return; + } + + /* + * Determine the page size that will be used for the heap. + * This can be set explicitly by setting HUGETLB_MORECORE to a valid + * page size string or by setting HUGETLB_DEFAULT_PAGE_SIZE. + */ + if (strncasecmp(__hugetlb_opts.morecore, "y", 1) == 0) + hpage_size = gethugepagesize(); + else if (__hugetlb_opts.thp_morecore) + hpage_size = kernel_default_hugepage_size(); + else + hpage_size = parse_page_size(__hugetlb_opts.morecore); + + if (hpage_size <= 0) { + if (errno == ENOSYS) + WARNING("Hugepages unavailable\n"); + else if (errno == EOVERFLOW || errno == ERANGE) + WARNING("Hugepage size too large\n"); + else if (errno == EINVAL) + WARNING("Invalid huge page size\n"); + else + WARNING("Hugepage size (%s)\n", strerror(errno)); + return; + } + + /* + * We won't need an fd for the heap mmaps if we are using MAP_HUGETLB + * or we are depending on transparent huge pages + */ + if(__hugetlb_opts.thp_morecore || (__hugetlb_opts.map_hugetlb && + hpage_size == kernel_default_hugepage_size())) { + heap_fd = -1; + } else { + if (!hugetlbfs_find_path_for_size(hpage_size)) { + WARNING("Hugepage size %li unavailable", hpage_size); + return; + } + + heap_fd = hugetlbfs_unlinked_fd_for_size(hpage_size); + if (heap_fd < 0) { + WARNING("Couldn't open hugetlbfs file for morecore\n"); + return; + } + } + + /* + * THP morecore uses sbrk to allocate more heap space, counting on the + * kernel to back the area with THP. So setting heapbase is + * meaningless if thp_morecore is used. 
+ */ + if (!__hugetlb_opts.thp_morecore && __hugetlb_opts.heapbase) { + heapaddr = strtoul(__hugetlb_opts.heapbase, &ep, 16); + if (*ep != '\0') { + WARNING("Can't parse HUGETLB_MORECORE_HEAPBASE: %s\n", + __hugetlb_opts.heapbase); + return; + } + } else { + heapaddr = (unsigned long)sbrk(0); + if (!__hugetlb_opts.thp_morecore) + heapaddr = hugetlbfs_next_addr(heapaddr); + } + + INFO("setup_morecore(): heapaddr = 0x%lx\n", heapaddr); + + heaptop = heapbase = (void *)heapaddr; + if (__hugetlb_opts.thp_morecore) + __morecore = &thp_morecore; + else + __morecore = &hugetlbfs_morecore; + + /* Set some allocator options more appropriate for hugepages */ + + if (__hugetlb_opts.shrink_ok) + mallopt(M_TRIM_THRESHOLD, hpage_size + hpage_size / 2); + else + mallopt(M_TRIM_THRESHOLD, -1); + mallopt(M_TOP_PAD, hpage_size / 2); + /* we always want to use our morecore, not ordinary mmap(). + * This doesn't appear to prohibit malloc() from falling back + * to mmap() if we run out of hugepages. */ + mallopt(M_MMAP_MAX, 0); +} diff --git a/pagesize.c b/pagesize.c new file mode 100644 index 0000000..659eb27 --- /dev/null +++ b/pagesize.c @@ -0,0 +1,140 @@ +/*************************************************************************** + * User front end for using huge pages Copyright (C) 2008, IBM * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the Lesser GNU General Public License as * + * published by the Free Software Foundation; either version 2.1 of the * + * License, or at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. 
* + * * + * You should have received a copy of the Lesser GNU General Public * + * License along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +/* + * pagesize exposes the available and hardware supported page sizes on + * the system. + * + * This program should be treated as an ABI for using libhugetlbfs. + */ + +#include +#include +#include +#include +#include + +#define _GNU_SOURCE /* for getopt_long */ +#include +#include + +#define REPORT_UTIL "pagesize" +#include "libhugetlbfs_internal.h" +#include "hugetlbfs.h" + +extern int errno; +extern int optind; +extern char *optarg; + +#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text) +#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text) + +void print_usage() +{ + fprintf(stderr, "pagesize [options] target\n"); + fprintf(stderr, "options:\n"); + + OPTION("--help, -h", "Prints this message"); + + OPTION("--all, -a", "show all supported page sizes"); + OPTION("--huge-only, -H", "show only huge page sizes"); +} + +static int cmpsizes(const void *p1, const void *p2) +{ + return *((long *)p1) > *((long *)p2); +} + +#define MAX_PAGESIZES 32 + +int main(int argc, char** argv) +{ + int opt_all = 0; + int opt_huge = 0; + + char opts[] = "+haH"; + int ret = 0, index = 0; + struct option long_opts[] = { + {"all", no_argument, NULL, 'a'}, + {"huge-only", no_argument, NULL, 'H'}, + + {0}, + }; + + long pagesizes[MAX_PAGESIZES]; + int i; + + hugetlbfs_setup_debug(); + + while (ret != -1) { + ret = getopt_long(argc, argv, opts, long_opts, &index); + switch (ret) { + case '?': + print_usage(); + exit(EXIT_FAILURE); + + case 'h': + print_usage(); + exit(EXIT_SUCCESS); + + case 'a': + opt_all = 1; + INFO("selecting all page sizes\n"); + break; + + case 'H': + opt_huge = 1; + opt_all = 1; + INFO("selecting only huge page sizes\n"); + 
break; + + case -1: + break; + + default: + WARNING("unparsed option %08x\n", ret); + ret = -1; + break; + } + } + index = optind; + if ((argc - index) != 0) { + print_usage(); + exit(EXIT_FAILURE); + } + + if (!opt_all) { + pagesizes[0] = sysconf(_SC_PAGESIZE); + ret = 1; + } else if (opt_huge) + ret = gethugepagesizes(pagesizes, MAX_PAGESIZES); + else + ret = getpagesizes(pagesizes, MAX_PAGESIZES); + if (ret < 0) { + ERROR("failed to get list of supported page sizes\n"); + exit(EXIT_FAILURE); + } + + qsort(pagesizes, ret, sizeof(long), cmpsizes); + for (i = 0; i < ret; i++) { + printf("%ld\n", pagesizes[i]); + } + + exit(EXIT_SUCCESS); +} diff --git a/privutils.lds b/privutils.lds new file mode 100644 index 0000000..5d481e2 --- /dev/null +++ b/privutils.lds @@ -0,0 +1,6 @@ +VERS_1.0 { + global: + __pu_*; + local: + *; +}; diff --git a/shm.c b/shm.c new file mode 100644 index 0000000..1f82cab --- /dev/null +++ b/shm.c @@ -0,0 +1,143 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libhugetlbfs_internal.h" +#include "hugetlbfs.h" +#include + +#if defined(SYS_shmget) || defined(SYS_ipc) +#define HAVE_SHMGET_SYSCALL +#endif + +#ifdef HAVE_SHMGET_SYSCALL +/* + * The calls to dlsym() and dlerror() in the shmget() wrapper below force + * a dependency on libdl.so. This does not work for static executables + * as the glibc dynamic library implementation does not automatically + * have static dl* function stubs linked into static executables. + * + * Work around this problem by adding a weak attribute to the declarations + * of dlsym() and dlerror(). (The declaration is otherwise the same as in + * ). This allows a static executable to be linked without -ldl. + * If &dlsym is NULL then this is a static executable and a call to the + * system shmget() may be performed without worry as there is no dynamic + * call chain. + */ +extern void *dlsym (void *__restrict __handle, __const char *__restrict __name) + __attribute__((weak)) __THROW __nonnull ((2)); +extern char *dlerror (void) __attribute__((weak)) __THROW; + + +/* call syscall shmget through the generic syscall mechanism */ +static int syscall_shmget(key_t key, size_t size, int shmflg) +{ +#ifdef SYS_shmget + return syscall(SYS_shmget, key, size, shmflg); +#else + /* + * Some platforms do not have have a direct shmget syscall. Instead, + * all SysV IPC calls are funneled through the ipc() system call. + * + * ipc() is expected to only be used by libc implementors, so using + * it has not been smoothed out. There is no function declaration. 
+ * The needed define for SHMGET is in linux/ipc.h, but that file + * also includes a conflicting definition of ipc_perm. So, + * just define the needed items here. + * + * When compiling -m32 on x86_64, the ipc glibc wrapper does not + * exist. Instead, just use SYS_ipc. + * + * The ipc system call below does not set the IPC_64 version flag + * with SHMGET because that would have required more private defines + * and the version number is not used for the SHMGET call. + */ + #define SHMGET 23 + + return syscall(SYS_ipc, SHMGET, key, size, shmflg, (void *)NULL, 0L); +#endif +} + +#endif /* HAVE_SHMGET_SYSCALL */ + +int shmget(key_t key, size_t size, int shmflg) +{ + static int (*real_shmget)(key_t key, size_t size, int shmflg) = NULL; + char *error; + int retval; + size_t aligned_size = size; + + DEBUG("hugetlb_shmem: entering overridden shmget() call\n"); + + /* Get a handle to the "real" shmget system call */ + if (!real_shmget) { +#ifdef HAVE_SHMGET_SYSCALL + if (&dlsym == NULL) { + /* in a static executable, call shmget directly */ + real_shmget = syscall_shmget; + } else +#endif /* HAVE_SHMGET_SYSCALL */ + { + real_shmget = dlsym(RTLD_NEXT, "shmget"); + if ((error = dlerror()) != NULL) { + ERROR("%s", error); + return -1; + } + } + } + + /* Align the size and set SHM_HUGETLB on request */ + if (__hugetlb_opts.shm_enabled) { + /* + * Use /proc/meminfo because shm always uses the system + * default huge page size. + */ + long hpage_size = kernel_default_hugepage_size(); + aligned_size = ALIGN(size, hpage_size); + if (size != aligned_size) { + DEBUG("hugetlb_shmem: size growth align %zd -> %zd\n", + size, aligned_size); + } + + INFO("hugetlb_shmem: Adding SHM_HUGETLB flag\n"); + shmflg |= SHM_HUGETLB; + } else { + DEBUG("hugetlb_shmem: shmget override not requested\n"); + } + + /* Call the "real" shmget. 
If hugepages fail, use small pages */ + retval = real_shmget(key, aligned_size, shmflg); + if (retval == -1 && __hugetlb_opts.shm_enabled) { + WARNING("While overriding shmget(%zd) to add SHM_HUGETLB: %s\n", + aligned_size, strerror(errno)); + shmflg &= ~SHM_HUGETLB; + retval = real_shmget(key, size, shmflg); + WARNING("Using small pages for shmget despite HUGETLB_SHM\n"); + } + + return retval; +} diff --git a/sys-aarch64elf.S b/sys-aarch64elf.S new file mode 100644 index 0000000..210558b --- /dev/null +++ b/sys-aarch64elf.S @@ -0,0 +1,38 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2013 Linaro Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + .text + + .globl direct_syscall + + +direct_syscall: + uxtw x8, w0 + mov x0, x1 + mov x1, x2 + mov x2, x3 + mov x3, x4 + mov x4, x5 + mov x5, x6 + mov x6, x7 + svc 0x0 + ret + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif diff --git a/sys-armelf_linux_eabi.S b/sys-armelf_linux_eabi.S new file mode 100644 index 0000000..265b75d --- /dev/null +++ b/sys-armelf_linux_eabi.S @@ -0,0 +1,37 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2012 ARM Ltd. 
	.text

	.globl direct_syscall
	.type direct_syscall,%function
/*
 * direct_syscall(nr, a1, a2, a3, a4, a5, a6)
 *
 * Issue a Linux system call directly, bypassing the C library.
 * On ARM EABI the kernel expects the syscall number in r7 and the
 * arguments in r0-r5, so the C arguments (nr in r0, a1-a3 in r1-r3,
 * a4-a6 on the stack) must be shifted down by one register.
 */
direct_syscall:
	mov ip, sp			@ remember where the stacked args (a4..a6) live
	stmfd sp!, {r4, r5, r6, r7}	@ r4-r7 are callee-saved; we clobber them below
	mov r7, r0			@ syscall number -> r7 (EABI convention)
	mov r0, r1			@ shift C args down into syscall arg registers
	mov r1, r2
	mov r2, r3
	ldmfd ip, {r3, r4, r5, r6}	@ load the stack-passed args into r3-r6
	swi 0x0				@ trap into the kernel
	ldmfd sp!, {r4, r5, r6, r7}	@ restore callee-saved registers
	bx lr

/* Mark the stack non-executable so the linker does not force an
 * executable stack on account of this assembly file. */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
	.text

	.globl direct_syscall
# direct_syscall(nr, a1, ..., a6)
#
# Issue a Linux system call directly, bypassing the C library.
# The 32-bit PowerPC kernel ABI takes the syscall number in r0 and
# arguments in r3-r8, so every incoming C argument (nr in r3,
# a1-a6 in r4-r9) is shifted down by one register before 'sc'.
direct_syscall:
	mr 0,3		# syscall number -> r0
	mr 3,4		# shift args down: a1 -> r3
	mr 4,5
	mr 5,6
	mr 6,7
	mr 7,8
	mr 8,9
	sc		# enter the kernel
	blr

# Mark the stack non-executable for the linker.
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
	.text
/*
 * direct_syscall(nr, a1, ..., a6): issue a Linux system call directly.
 *
 * Under the ELFv1 ABI (_CALL_ELF != 2) a function's global symbol refers
 * to a function descriptor in the ".opd" section (entry point, TOC base,
 * environment), while the actual code lives at the dot-prefixed local
 * entry symbol.  Under ELFv2 the symbol refers to the code directly, so
 * only the plain label is needed.
 */
#if _CALL_ELF != 2
	.align 2
	.globl direct_syscall
	.globl .direct_syscall
	.section ".opd","aw"
direct_syscall:
	.quad .direct_syscall	/* entry point */
	.quad .TOC.@tocbase	/* TOC pointer for this function */
	.quad 0			/* environment pointer (unused) */
	.previous
	.type .direct_syscall,@function
.direct_syscall:
#else
	.globl direct_syscall
	.type direct_syscall,@function
direct_syscall:
#endif
	/* Kernel ABI: syscall number in r0, arguments in r3-r8, so shift
	 * the C arguments (nr in r3, a1-a6 in r4-r9) down one register. */
	mr 0,3
	mr 3,4
	mr 4,5
	mr 5,6
	mr 6,7
	mr 7,8
	mr 8,9
	sc		/* enter the kernel */
	blr

/* Mark the stack non-executable for the linker. */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
	.text

	.globl direct_syscall
/*
 * direct_syscall(nr, a1, ..., a6): issue a Linux system call directly.
 *
 * i386 kernel ABI: syscall number in eax, arguments in ebx, ecx, edx,
 * esi, edi, ebp.  All seven C arguments arrive on the stack; after the
 * four saves below they sit at 0x14(%esp) through 0x2c(%esp).
 */
direct_syscall:
	push %ebp		/* save the callee-saved registers we clobber */
	push %edi
	push %esi
	push %ebx
	mov 0x2c(%esp),%ebp	/* a6 */
	mov 0x28(%esp),%edi	/* a5 */
	mov 0x24(%esp),%esi	/* a4 */
	mov 0x20(%esp),%edx	/* a3 */
	mov 0x1c(%esp),%ecx	/* a2 */
	mov 0x18(%esp),%ebx	/* a1 */
	mov 0x14(%esp),%eax	/* syscall number */
	int $0x80		/* enter the kernel */
	pop %ebx		/* restore callee-saved registers */
	pop %esi
	pop %edi
	pop %ebp
	ret

/* Mark the stack non-executable for the linker. */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
	.text

	.globl direct_syscall
/*
 * direct_syscall(nr, a1, ..., a6): issue a Linux system call directly.
 *
 * x86_64 kernel ABI: syscall number in rax, arguments in rdi, rsi, rdx,
 * r10, r8, r9 -- almost the C calling convention, except that the
 * fourth argument goes in r10 rather than rcx (the syscall instruction
 * clobbers rcx with the return address).  The seventh C argument (a6)
 * arrives on the stack at 0x8(%rsp).
 */
direct_syscall:
	mov %rdi,%rax		/* syscall number */
	mov %rsi,%rdi		/* shift C args down into syscall arg registers */
	mov %rdx,%rsi
	mov %rcx,%rdx
	mov %r8,%r10		/* 4th syscall arg goes in r10, not rcx */
	mov %r9,%r8
	mov 0x8(%rsp),%r9	/* a6 was passed on the stack */
	syscall
	retq

/* Mark the stack non-executable for the linker. */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
straddle_4GB huge_at_4GB_normal_below \ + huge_below_4GB_normal_above +LIB_TESTS_64_ALL = $(LIB_TESTS_64) $(LIB_TESTS_64_STATIC) +NOLIB_TESTS = malloc malloc_manysmall dummy heapshrink shmoverride_unlinked +LDSCRIPT_TESTS = zero_filesize_segment +HUGELINK_TESTS = linkhuge linkhuge_nofd linkshare +HUGELINK_RW_TESTS = linkhuge_rw +STRESS_TESTS = mmap-gettest mmap-cow shm-gettest shm-getraw shm-fork +# NOTE: all named tests in WRAPPERS must also be named in TESTS +WRAPPERS = quota counters madvise_reserve fadvise_reserve \ + readahead_reserve mremap-expand-slice-collision \ + mremap-fixed-normal-near-huge mremap-fixed-huge-near-normal \ + fallocate_basic fallocate_align fallocate_stress +HELPERS = get_hugetlbfs_path compare_kvers +HELPER_LIBS = libheapshrink.so +BADTOOLCHAIN = bad-toolchain.sh + +CFLAGS = -O2 -Wall -g +CPPFLAGS = -I.. +STATIC_LIBHUGE = -Wl,--whole-archive -lhugetlbfs -Wl,--no-whole-archive +STATIC_LDLIBS = -Wl,--no-as-needed -lpthread +LDLIBS = $(STATIC_LDLIBS) -ldl -lhugetlbfs_privutils +LDFLAGS32 = -L../obj32 +LDFLAGS64 = -L../obj64 +INSTALL = install + +TESTS = $(LIB_TESTS) $(NOLIB_TESTS) $(STRESS_TESTS) dummy.ldscript +ifdef ELF32 +ifeq ($(CUSTOM_LDSCRIPTS),yes) +TESTS += $(LDSCRIPT_TESTS) $(HUGELINK_TESTS) $(HUGELINK_TESTS:%=xB.%) \ + $(HUGELINK_TESTS:%=xBDT.%) $(HUGELINK_RW_TESTS) +else +TESTS += $(LDSCRIPT_TESTS) $(HUGELINK_TESTS) $(HUGELINK_RW_TESTS) +endif + +else +ifdef ELF64 +ifeq ($(CUSTOM_LDSCRIPTS),yes) +TESTS += $(LDSCRIPT_TESTS) $(HUGELINK_TESTS) $(HUGELINK_TESTS:%=xB.%) \ + $(HUGELINK_TESTS:%=xBDT.%) $(HUGELINK_RW_TESTS) +else +TESTS += $(LDSCRIPT_TESTS) $(HUGELINK_TESTS) $(HUGELINK_RW_TESTS) +endif + +endif +endif + +ifneq ($(ARCH),ia64) +TESTS_64 = $(LIB_TESTS_64) +TESTS_64_STATIC = $(LIB_TESTS_64_STATIC) +endif + +SCRIPTS=../ldscripts +SCRIPTS32 = $(SCRIPTS)/$(ELF32) +SCRIPTS64 = $(SCRIPTS)/$(ELF64) +HUGETLBFS_LD=../ld.hugetlbfs +INST_TESTSDIR32 = $(LIBDIR32)/libhugetlbfs/tests +INST_TESTSDIR64 = $(LIBDIR64)/libhugetlbfs/tests + 
+ifdef V +VECHO = : +else +VECHO = echo " " +.SILENT: +endif + +DEPFILES = $(LIB_TESTS:%=%.d) $(NOLIB_TESTS:%=%.d) $(HUGELINK_TESTS:%=%.d) \ + $(HELPERS:%=%.d) testutils.d + +ALLTESTS = $(foreach DIR,$(OBJDIRS),$(TESTS:%=$(DIR)/%)) +ALLHELPERS = $(foreach DIR,$(OBJDIRS),$(HELPERS:%=$(DIR)/%)) +ALLHELPERLIBS = $(foreach DIR,$(OBJDIRS),$(HELPER_LIBS:%=$(DIR)/%)) +ifdef CC64 +ALLTESTS += $(TESTS_64:%=obj64/%) +ALLTESTS += $(TESTS_64_STATIC:%=obj64/%_static) +endif + +# For now, build only one test as a static binary. +# Can be changed once libhugetlbfs has better support for static linking. +# Also, some tests should be changed to use syscall() instead of +# dlsym() / rtld_next(). +ifdef CC32 +#ALLTESTS += $(LIB_TESTS:%=obj32/%_static) $(STRESS_TESTS:%=obj32/%_static) +ALLTESTS += obj32/shmoverride_linked_static +endif +ifdef CC64 +#ALLTESTS += $(LIB_TESTS:%=obj64/%_static) $(STRESS_TESTS:%=obj64/%_static) +ALLTESTS += obj64/shmoverride_linked_static +endif + +objs_needing_wrappers = \ + $(foreach W,$(WRAPPERS:%.sh=%),$(filter $(1)/$(W),$(ALLTESTS))) +WRAPPERS32 = $(addsuffix .sh,$(call objs_needing_wrappers,obj32)) +WRAPPERS64 = $(addsuffix .sh,$(call objs_needing_wrappers,obj64)) +ALLWRAPPERS = $(WRAPPERS32) $(WRAPPERS64) + +all: $(ALLTESTS) $(ALLHELPERS) $(ALLHELPERLIBS) $(ALLWRAPPERS) + +shmoverride_linked.c: shmoverride_unlinked.c + ln -s shmoverride_unlinked.c shmoverride_linked.c + +obj32/%.o: %.c + @$(VECHO) CC32 $@ + @mkdir -p obj32 + $(CC32) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +obj64/%.o: %.c + @$(VECHO) CC64 $@ + @mkdir -p obj64 + $(CC64) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +obj32/%-pic.o: %.c + @$(VECHO) CC32 $@ + @mkdir -p obj32 + $(CC32) $(CPPFLAGS) $(CFLAGS) -fPIC -o $@ -c $< + +obj64/%-pic.o: %.c + @$(VECHO) CC64 $@ + @mkdir -p obj64 + $(CC64) $(CPPFLAGS) $(CFLAGS) -fPIC -o $@ -c $< + +obj32/libheapshrink.so: obj32/heapshrink-helper-pic.o + @$(VECHO) LD32 "(shared)" $@ + @mkdir -p obj32 + $(CC32) -Wl,-soname,$(notdir $@) -shared -o $@ $^ + 
+obj64/libheapshrink.so: obj64/heapshrink-helper-pic.o + @$(VECHO) LD64 "(shared)" $@ + @mkdir -p obj64 + $(CC64) -Wl,-soname,$(notdir $@) -shared -o $@ $^ + +$(LIB_TESTS:%=obj32/%): %: %.o obj32/testutils.o obj32/libtestutils.o + @$(VECHO) LD32 "(lib test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(LIB_TESTS:%=obj64/%) $(LIB_TESTS_64_ALL:%=obj64/%): %: %.o obj64/testutils.o obj64/libtestutils.o + @$(VECHO) LD64 "(lib test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(LIB_TESTS:%=obj32/%_static): %_static: %.o obj32/testutils.o obj32/libtestutils.o + @$(VECHO) LD32 "(lib test)" $@ + $(CC32) -static $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(STATIC_LDLIBS) $(STATIC_LIBHUGE) + +$(LIB_TESTS:%=obj64/%_static) $(LIB_TESTS_64_ALL:%=obj64/%_static): %_static: %.o obj64/testutils.o obj64/libtestutils.o + @$(VECHO) LD64 "(lib test)" $@ + $(CC64) -static $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(STATIC_LDLIBS) $(STATIC_LIBHUGE) + +$(NOLIB_TESTS:%=obj32/%): %: %.o obj32/testutils.o + @$(VECHO) LD32 "(nolib test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) + +$(NOLIB_TESTS:%=obj64/%): %: %.o obj64/testutils.o + @$(VECHO) LD64 "(nolib test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) + +obj32/%.ldscript: obj32/%.o obj32/testutils.o + @$(VECHO) SCRIPT32 $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -Wl,--verbose -o/dev/null $^ $(LDLIBS) > $@ + +obj64/%.ldscript: obj64/%.o obj64/testutils.o + @$(VECHO) SCRIPT64 $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -Wl,--verbose -o/dev/null $^ $(LDLIBS) > $@ + +$(LDSCRIPT_TESTS:%=obj32/%): obj32/%: %.ld obj32/%.o obj32/testutils.o + @$(VECHO) LD32 "(preload test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ -Lobj32 $^ $(LDLIBS) || cp $(BADTOOLCHAIN) $@ + +$(LDSCRIPT_TESTS:%=obj64/%): obj64/%: %.ld obj64/%.o obj64/testutils.o + @$(VECHO) LD64 "(preload test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ -Lobj64 $^ $(LDLIBS) || cp $(BADTOOLCHAIN) $@ + +$(HUGELINK_TESTS:%=obj32/%): %: %.o 
obj32/testutils.o + @$(VECHO) LD32 "(hugelink test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) + +$(HUGELINK_TESTS:%=obj64/%): %: %.o obj64/testutils.o + @$(VECHO) LD64 "(hugelink test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) + +$(HUGELINK_RW_TESTS:%=obj32/%): %: %.o $(HUGETLBFS_LD) obj32/testutils.o + @$(VECHO) LD32 "(hugelink_rw test)" $@ + @mkdir -p obj32 + @ln -sf ../$(HUGETLBFS_LD) obj32/ld + $(CC32) -B./obj32 $(LDFLAGS) $(LDFLAGS32) -o $@ $(LDLIBS) -Wl,--hugetlbfs-align $(filter %.o,$^) + +$(HUGELINK_RW_TESTS:%=obj64/%): %: %.o $(HUGETLBFS_LD) obj64/testutils.o + @$(VECHO) LD64 "(hugelink_rw test)" $@ + @mkdir -p obj64 + @ln -sf ../$(HUGETLBFS_LD) obj64/ld + $(CC64) -B./obj64 $(LDFLAGS) $(LDFLAGS64) -o $@ $(LDLIBS) -Wl,--hugetlbfs-align $(filter %.o,$^) + +$(STRESS_TESTS:%=obj32/%): %: %.o obj32/testutils.o + @$(VECHO) LD32 "(lib test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(STRESS_TESTS:%=obj64/%): %: %.o obj64/testutils.o + @$(VECHO) LD64 "(lib test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(STRESS_TESTS:%=obj32/%_static): %_static: %.o obj32/testutils.o + @$(VECHO) LD32 "(lib test)" $@ + $(CC32) -static $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(STATIC_LDLIBS) $(STATIC_LIBHUGE) + +$(STRESS_TESTS:%=obj64/%_static): %_static: %.o obj64/testutils.o + @$(VECHO) LD64 "(lib test)" $@ + $(CC64) -static $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(STATIC_LDLIBS) $(STATIC_LIBHUGE) + +obj32/xB.%: $(SCRIPTS32).xB $(HUGETLBFS_LD) obj32/%.o obj32/testutils.o + @$(VECHO) LD32 "(xB test)" $@ + @mkdir -p obj32 + @ln -sf ../$(HUGETLBFS_LD) obj32/ld + HUGETLB_DEPRECATED_LINK=1 $(CC32) -B./obj32 $(LDFLAGS) $(LDFLAGS32) -o $@ $(LDLIBS) -Wl,--hugetlbfs-link=B $(filter %.o,$^) + +obj64/xB.%: $(SCRIPTS64).xB $(HUGETLBFS_LD) obj64/%.o obj64/testutils.o + @$(VECHO) LD64 "(xB test)" $@ + @mkdir -p obj64 + @ln -sf ../$(HUGETLBFS_LD) obj64/ld + HUGETLB_DEPRECATED_LINK=1 $(CC64) -B./obj64 $(LDFLAGS) 
$(LDFLAGS64) -o $@ $(LDLIBS) -Wl,--hugetlbfs-link=B $(filter %.o,$^) + +obj32/xBDT.%: $(SCRIPTS32).xBDT $(HUGETLBFS_LD) obj32/%.o obj32/testutils.o + @$(VECHO) LD32 "(xBDT test)" $@ + @mkdir -p obj32 + @ln -sf ../$(HUGETLBFS_LD) obj32/ld + HUGETLB_DEPRECATED_LINK=1 $(CC32) -B./obj32 $(LDFLAGS) $(LDFLAGS32) -o $@ $(LDLIBS) -Wl,--hugetlbfs-link=BDT $(filter %.o,$^) + +obj64/xBDT.%: $(SCRIPTS64).xBDT $(HUGETLBFS_LD) obj64/%.o obj64/testutils.o + @$(VECHO) LD64 "(xBDT test)" $@ + @mkdir -p obj64 + @ln -sf ../$(HUGETLBFS_LD) obj64/ld + HUGETLB_DEPRECATED_LINK=1 $(CC64) -B./obj64 $(LDFLAGS) $(LDFLAGS64) -o $@ $(LDLIBS) -Wl,--hugetlbfs-link=BDT $(filter %.o,$^) + +$(HELPERS:%=obj32/%): %: %.o + @$(VECHO) LD32 "(helper)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(HELPERS:%=obj64/%): %: %.o + @$(VECHO) LD64 "(helper)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(WRAPPERS32): obj32/%.sh: %.sh obj32/% + @$(VECHO) COPY "(wrapped test)" $@ + @cp -f $< $@ + +$(WRAPPERS64): obj64/%.sh: %.sh obj64/% + @$(VECHO) COPY "(wrapped test)" $@ + @cp -f $< $@ + +clean: + @$(VECHO) CLEAN "(tests)" + rm -f *~ *.o *.so *.a *.d core a.out + rm -rf obj* + rm -f shmoverride_linked.c # Autogenerated file + rm -f $(TESTS) + +%.d: %.c + @$(CC) $(CPPFLAGS) -MM -MT "$(foreach DIR,$(OBJDIRS),$(DIR)/$*.o) $@" $< > $@ + +-include $(DEPFILES) + +obj32/install: + @$(VECHO) INSTALL32 $(INST_TESTSDIR32) + $(INSTALL) -d $(DESTDIR)$(INST_TESTSDIR32) + $(INSTALL) -d $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 $(TESTS:%=obj32/%) $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 $(WRAPPERS32) $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 wrapper-utils.sh $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 $(HELPERS:%=obj32/%) $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 $(HELPER_LIBS:%=obj32/%) $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 run_tests.py $(DESTDIR)$(INST_TESTSDIR32) + +obj64/install: + 
@$(VECHO) INSTALL64 $(INST_TESTSDIR64) + $(INSTALL) -d $(DESTDIR)$(INST_TESTSDIR64) + $(INSTALL) -d $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(TESTS:%=obj64/%) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(WRAPPERS64) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 wrapper-utils.sh $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(HELPERS:%=obj64/%) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(HELPER_LIBS:%=obj64/%) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(TESTS_64:%=obj64/%) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 run_tests.py $(DESTDIR)$(INST_TESTSDIR64) + +install: $(OBJDIRS:%=%/install) diff --git a/tests/alloc-instantiate-race.c b/tests/alloc-instantiate-race.c new file mode 100644 index 0000000..7f84e8a --- /dev/null +++ b/tests/alloc-instantiate-race.c @@ -0,0 +1,273 @@ +/* + * Test rationale: + * + * This test is designed to detect a kernel allocation race introduced + * with hugepage demand-faulting. The problem is that no lock is held + * between allocating a hugepage and instantiating it in the + * pagetables or page cache index. In between the two, the (huge) + * page is cleared, so there's substantial time. Thus two processes + * can race instantiating the (same) last available hugepage - one + * will fail on the allocation, and thus cause an OOM fault even + * though the page it actually wants is being instantiated by the + * other racing process. + * + * + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +pid_t gettid(void) +{ + return syscall(__NR_gettid); +} + +static long hpage_size; +static pid_t child1, child2; +static pthread_t thread1, thread2; + +void cleanup(void) +{ + if (child1) + kill(child1, SIGKILL); + if (child2) + kill(child2, SIGKILL); +} + +static int one_racer(void *p, int cpu, + volatile int *mytrigger, volatile int *othertrigger) +{ + volatile int *pi = p; + cpu_set_t cpuset; + int err; + + /* Split onto different cpus to encourage the race */ + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + + err = sched_setaffinity(gettid(), CPU_SETSIZE/8, &cpuset); + if (err != 0) + CONFIG("sched_setaffinity(cpu%d): %s", cpu, strerror(errno)); + + /* Ready.. */ + *mytrigger = 1; + /* Set.. */ + while (! *othertrigger) + ; + + /* Instantiate! 
/*
 * Race two instantiators at the same hugepage mapping.
 *
 * For MAP_SHARED the racers are two forked child processes pinned to
 * different CPUs; for MAP_PRIVATE they are two pthreads in this process.
 * Each racer signals readiness through its trigger word in the shared
 * syncarea, spins until the other is ready, then touches the final
 * available hugepage.  On a buggy kernel one racer is OOM-killed; that
 * surfaces here as a signal death (processes) or a bad join result
 * (threads), and either case FAILs the test.
 */
static void run_race(void *syncarea, int race_type)
{
	volatile int *trigger1, *trigger2;
	int fd;
	void *p;
	int status1, status2;
	int ret;

	/* Reset both ready-flags before arming the racers */
	memset(syncarea, 0, sizeof(*trigger1) + sizeof(*trigger2));
	trigger1 = syncarea;
	trigger2 = trigger1 + 1;

	/* Get a new file for the final page */
	fd = hugetlbfs_unlinked_fd();
	if (fd < 0)
		FAIL("hugetlbfs_unlinked_fd()");

	verbose_printf("Mapping final page.. ");
	p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, race_type, fd, 0);
	if (p == MAP_FAILED)
		FAIL("mmap(): %s", strerror(errno));
	verbose_printf("%p\n", p);

	if (race_type == MAP_SHARED) {
		/* Process-based race: fork two children, each racing on
		 * the same shared mapping. */
		child1 = fork();
		if (child1 < 0)
			FAIL("fork(): %s", strerror(errno));
		if (child1 == 0)
			proc_racer(p, 0, trigger1, trigger2);

		child2 = fork();
		if (child2 < 0)
			FAIL("fork(): %s", strerror(errno));
		if (child2 == 0)
			proc_racer(p, 1, trigger2, trigger1);

		/* wait() calls */
		ret = waitpid(child1, &status1, 0);
		if (ret < 0)
			FAIL("waitpid() child 1: %s", strerror(errno));
		verbose_printf("Child 1 status: %x\n", status1);


		ret = waitpid(child2, &status2, 0);
		if (ret < 0)
			FAIL("waitpid() child 2: %s", strerror(errno));
		verbose_printf("Child 2 status: %x\n", status2);

		/* A signal death (typically SIGKILL from the OOM killer)
		 * is exactly the bug this test hunts. */
		if (WIFSIGNALED(status1))
			FAIL("Child 1 killed by signal %s",
			     strsignal(WTERMSIG(status1)));
		if (WIFSIGNALED(status2))
			FAIL("Child 2 killed by signal %s",
			     strsignal(WTERMSIG(status2)));

		status1 = WEXITSTATUS(status1);
		status2 = WEXITSTATUS(status2);
	} else {
		/* Thread-based race for MAP_PRIVATE mappings.  Unset
		 * members of the designated initializers default to 0. */
		struct racer_info ri1 = {
			.p = p,
			.cpu = 0,
			.mytrigger = trigger1,
			.othertrigger = trigger2,
		};
		struct racer_info ri2 = {
			.p = p,
			.cpu = 1,
			.mytrigger = trigger2,
			.othertrigger = trigger1,
		};
		void *tret1, *tret2;

		ret = pthread_create(&thread1, NULL, thread_racer, &ri1);
		if (ret != 0)
			FAIL("pthread_create() 1: %s\n", strerror(errno));

		ret = pthread_create(&thread2, NULL, thread_racer, &ri2);
		if (ret != 0)
			FAIL("pthread_create() 2: %s\n", strerror(errno));

		/* Each thread returns its own racer_info pointer; anything
		 * else means the thread died abnormally. */
		ret = pthread_join(thread1, &tret1);
		if (ret != 0)
			FAIL("pthread_join() 1: %s\n", strerror(errno));
		if (tret1 != &ri1)
			FAIL("Thread 1 returned %p not %p, killed?\n",
			     tret1, &ri1);
		ret = pthread_join(thread2, &tret2);
		if (ret != 0)
			FAIL("pthread_join() 2: %s\n", strerror(errno));
		if (tret2 != &ri2)
			FAIL("Thread 2 returned %p not %p, killed?\n",
			     tret2, &ri2);

		/* NOTE(review): thread_racer never assigns ri->status, so
		 * these are always the zero-initialized value; racer errors
		 * are reported via FAIL/CONFIG inside one_racer instead. */
		status1 = ri1.status;
		status2 = ri2.status;
	}

	if (status1 != 0)
		FAIL("Racer 1 terminated with code %d", status1);

	if (status2 != 0)
		FAIL("Racer 2 terminated with code %d", status2);
}
verbose_printf("done\n"); + + verbose_printf("Mapping %ld/%ld pages.. ", totpages-1, totpages); + p = mmap(NULL, (totpages-1)*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + /* Allocate all save one of the pages up front */ + verbose_printf("instantiating.. "); + for (i = 0; i < (totpages - 1); i++) + memset(p + (i * hpage_size), 0, sizeof(int)); + verbose_printf("done\n"); + + run_race(q, race_type); + + PASS(); +} diff --git a/tests/bad-toolchain.sh b/tests/bad-toolchain.sh new file mode 100755 index 0000000..2535aa0 --- /dev/null +++ b/tests/bad-toolchain.sh @@ -0,0 +1,5 @@ +#! /bin/sh + +echo "Bad toolchain: can't build this testcase" + +exit 1 diff --git a/tests/brk_near_huge.c b/tests/brk_near_huge.c new file mode 100644 index 0000000..f6d1e07 --- /dev/null +++ b/tests/brk_near_huge.c @@ -0,0 +1,114 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/*
 * brk_near_huge test driver.
 *
 * Maps a hugepage just above the current program break, then tries to
 * brk() the break into the hugepage-reserved region.  A correct kernel
 * either rejects the brk() (like a MAP_FIXED mmap would) or extends the
 * heap with properly accessible normal pages; a buggy kernel may create
 * a small-page VMA inside a hugepage-only region and blow up when it is
 * touched.
 */
int main(int argc, char *argv[])
{
	long hpage_size;
	int fd;
	void *brk0, *hugemap_addr, *newbrk;
	char *p;
	int err;

	test_init(argc, argv);

	hpage_size = check_hugepagesize();

	fd = hugetlbfs_unlinked_fd();
	if (fd < 0)
		FAIL("hugetlbfs_unlinked_fd()");

	brk0 = sbrk(0);
	verbose_printf("Initial break at %p\n", brk0);

	/* Place the hugepage mapping in the next address chunk above the
	 * break, leaving a one-hugepage gap for the heap to grow into. */
	hugemap_addr = next_chunk(brk0) + hpage_size;

	p = mmap(hugemap_addr, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED,
		 fd, 0);
	if (p == MAP_FAILED)
		FAIL("mmap(): %s", strerror(errno));
	if (p != hugemap_addr)
		FAIL("mmap() at unexpected address %p instead of %p\n", p,
		     hugemap_addr);

	verbose_printf("Hugepage mapped at %p-%p\n", p, p+hpage_size-1);

	err = test_addr_huge((void *)p);
	if (err != 1)
		FAIL("Mapped address is not hugepage");

	/* Push the break into the chunk containing the hugepage mapping */
	newbrk = next_chunk(brk0) + getpagesize();
	err = brk((void *)newbrk);
	if (err == -1)
		/* Failing the brk() is an acceptable kernel response */
		PASS();

	/* Succeeding the brk() is acceptable iff the new memory is
	 * properly accessible and we don't have a kernel blow up when
	 * we touch it. */
	memset(brk0, 0, newbrk-brk0);

	PASS();
}
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +int main(int argc, char *argv[]) +{ + long hpage_size; + unsigned long totpages, chunk1, chunk2; + int fd; + void *p, *q; + pid_t child, ret; + int status; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + totpages = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + chunk1 = (totpages / 2) + 1; + chunk2 = totpages - chunk1 + 1; + + verbose_printf("overcommit: %ld hugepages available: " + "chunk1=%ld chunk2=%ld\n", totpages, chunk1, chunk2); + + p = mmap(NULL, chunk1*hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() chunk1: %s", strerror(errno)); + + q = mmap(NULL, chunk2*hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, chunk1*hpage_size); + if (q == MAP_FAILED) { + if (errno != ENOMEM) + FAIL("mmap() chunk2: %s", strerror(errno)); + else + PASS(); + } + + verbose_printf("Looks like we've overcommitted, testing...\n"); + + /* Looks like we're overcommited, but we need to confirm that + * this is bad. We touch it all in a child process because an + * overcommit will generally lead to a SIGKILL which we can't + * handle, of course. 
*/ + child = fork(); + if (child < 0) + FAIL("fork(): %s", strerror(errno)); + + if (child == 0) { + memset(p, 0, chunk1*hpage_size); + memset(q, 0, chunk2*hpage_size); + exit(0); + } + + ret = waitpid(child, &status, 0); + if (ret < 0) + FAIL("waitpid(): %s", strerror(errno)); + + if (WIFSIGNALED(status)) + FAIL("Killed by signal \"%s\" due to overcommit", + strsignal(WTERMSIG(status))); + + PASS(); +} diff --git a/tests/compare_kvers.c b/tests/compare_kvers.c new file mode 100644 index 0000000..e2ef62a --- /dev/null +++ b/tests/compare_kvers.c @@ -0,0 +1,41 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include "libhugetlbfs_privutils.h" + +int main (int argc, char **argv) +{ + if (argc != 3) { + printf("Usage: %s \n", argv[0]); + return -1; + } + + switch (test_compare_kver(argv[1], argv[2])) { + case 0: /* Equal to */ + return 0; + case -1: /* Less than */ + return 1; + case 1: /* Greater than */ + return 2; + default: + return -1; + } +} diff --git a/tests/corrupt-by-cow-opt.c b/tests/corrupt-by-cow-opt.c new file mode 100644 index 0000000..1c46ecc --- /dev/null +++ b/tests/corrupt-by-cow-opt.c @@ -0,0 +1,78 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2013 Joonsoo Kim, LG Electronics. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include "hugetests.h" + +#define P "corrupt-by-cow-opt" +#define DESC \ + "* Test sanity of cow optimization on page cache. 
If a page *\n"\ + "* in page cache has only 1 ref count, it is mapped for a private *\n"\ + "* mapping directly and is overwritten freely, so next time we *\n"\ + "* access the page, we can see corrupt data. *\n"\ + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + char *p; + char c; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + check_free_huge_pages(2); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + *p = 's'; + verbose_printf("Write %c to %p via shared mapping\n", *p, p); + munmap(p, hpage_size); + + p = mmap(NULL, hpage_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + *p = 'p'; + verbose_printf("Write %c to %p via private mapping\n", *p, p); + munmap(p, hpage_size); + + p = mmap(NULL, hpage_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 3: %s", strerror(errno)); + + c = *p; + verbose_printf("Read %c from %p via shared mapping\n", *p, p); + munmap(p, hpage_size); + + if (c != 's') + FAIL("data corrupt"); + + PASS(); +} diff --git a/tests/counters.c b/tests/counters.c new file mode 100644 index 0000000..0284809 --- /dev/null +++ b/tests/counters.c @@ -0,0 +1,414 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2007 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +/* + * Test Rationale: + * + * The hugetlb pool maintains 4 global counters to track pages as they + * transition between various states. Due to the complex relationships between + * the counters, regressions are likely to occur in the future. This test + * performs operations that change the counters in known ways. It emulates the + * expected kernel behavior and compares the expected result to the actual + * values after each operation. + */ + +extern int errno; + +/* Global test configuration */ +#define DYNAMIC_SYSCTL "/proc/sys/vm/nr_overcommit_hugepages" +static long saved_nr_hugepages = -1; +static long saved_oc_hugepages = -1; +static long hpage_size; +static int private_resv; + +/* State arrays for our mmaps */ +#define NR_SLOTS 2 +#define SL_SETUP 0 +#define SL_TEST 1 +static int map_fd[NR_SLOTS]; +static char *map_addr[NR_SLOTS]; +static unsigned long map_size[NR_SLOTS]; +static unsigned int touched[NR_SLOTS]; + +/* Keep track of expected counter values */ +static long prev_total; +static long prev_free; +static long prev_resv; +static long prev_surp; + +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#define max(a,b) (((a) > (b)) ? 
(a) : (b)) + +/* Restore original nr_hugepages */ +void cleanup(void) { + if (hpage_size <= 0) + return; + if (saved_nr_hugepages >= 0) + set_nr_hugepages(hpage_size, saved_nr_hugepages); + if (saved_oc_hugepages >= 0) + set_nr_overcommit_hugepages(hpage_size, saved_oc_hugepages); +} + +void verify_dynamic_pool_support(void) +{ + saved_oc_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_OC); + if (saved_oc_hugepages < 0) + FAIL("Kernel appears to lack dynamic hugetlb pool support"); + set_nr_overcommit_hugepages(hpage_size, 10); +} + +void bad_value(int line, const char *name, long expect, long actual) +{ + if (actual == -1) + ERROR("%s not found in /proc/meminfo", name); + else + FAIL("Line %i: Bad %s: expected %li, actual %li", + line, name, expect, actual); +} + +void verify_counters(int line, long et, long ef, long er, long es) +{ + long t, f, r, s; + + t = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + f = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + r = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + s = get_huge_page_counter(hpage_size, HUGEPAGES_SURP); + + /* Invariant checks */ + if (t < 0 || f < 0 || r < 0 || s < 0) + ERROR("Negative counter value"); + if (f < r) + ERROR("HugePages_Free < HugePages_Rsvd"); + + /* Check actual values against expected values */ + if (t != et) + bad_value(line, "HugePages_Total", et, t); + + if (f != ef) + bad_value(line, "HugePages_Free", ef, f); + + if (r != er) + bad_value(line, "HugePages_Rsvd", er, r); + + if (s != es) + bad_value(line, "HugePages_Surp", es, s); + + /* Everything's good. 
Update counters */ + prev_total = t; + prev_free = f; + prev_resv = r; + prev_surp = s; +} + +/* Memory operations: + * Each of these has a predefined effect on the counters + */ +#define persistent_huge_pages (et - es) +void _set_nr_hugepages(unsigned long count, int line) +{ + long min_size; + long et, ef, er, es; + + if (set_nr_hugepages(hpage_size, count)) + FAIL("Cannot set nr_hugepages"); + + /* The code below is based on set_max_huge_pages in mm/hugetlb.c */ + es = prev_surp; + et = prev_total; + ef = prev_free; + er = prev_resv; + + /* + * Increase the pool size + * First take pages out of surplus state. Then make up the + * remaining difference by allocating fresh huge pages. + */ + while (es && count > persistent_huge_pages) + es--; + while (count > persistent_huge_pages) { + et++; + ef++; + } + if (count >= persistent_huge_pages) + goto out; + + /* + * Decrease the pool size + * First return free pages to the buddy allocator (being careful + * to keep enough around to satisfy reservations). Then place + * pages into surplus state as needed so the pool will shrink + * to the desired size as pages become free. 
+ */ + min_size = max(count, er + et - ef); + while (min_size < persistent_huge_pages) { + ef--; + et--; + } + while (count < persistent_huge_pages) { + es++; + } + +out: + verify_counters(line, et, ef, er, es); +} +#undef set_nr_hugepages +#define set_nr_hugepages(c) _set_nr_hugepages(c, __LINE__) + +void _map(int s, int hpages, int flags, int line) +{ + long et, ef, er, es; + + map_fd[s] = hugetlbfs_unlinked_fd(); + if (map_fd[s] < 0) + CONFIG("Unable to open hugetlbfs file: %s", strerror(errno)); + map_size[s] = hpages * hpage_size; + map_addr[s] = mmap(NULL, map_size[s], PROT_READ|PROT_WRITE, flags, + map_fd[s], 0); + if (map_addr[s] == MAP_FAILED) + FAIL("mmap failed: %s", strerror(errno)); + touched[s] = 0; + + et = prev_total; + ef = prev_free; + er = prev_resv; + es = prev_surp; + + /* + * When using MAP_SHARED, a reservation will be created to guarantee + * pages to the process. If not enough pages are available to + * satisfy the reservation, surplus pages are added to the pool. + * NOTE: This code assumes that the whole mapping needs to be + * reserved and hence, will not work with partial reservations. + * + * If the kernel supports private reservations, then MAP_PRIVATE + * mappings behave like MAP_SHARED at mmap time. Otherwise, + * no counter updates will occur. 
+ */ + if ((flags & MAP_SHARED) || private_resv) { + unsigned long shortfall = 0; + if (hpages + prev_resv > prev_free) + shortfall = hpages - prev_free + prev_resv; + et += shortfall; + ef = prev_free + shortfall; + er = prev_resv + hpages; + es = prev_surp + shortfall; + } + + verify_counters(line, et, ef, er, es); +} +#define map(s, h, f) _map(s, h, f, __LINE__) + +void _unmap(int s, int hpages, int flags, int line) +{ + long et, ef, er, es; + unsigned long i; + + munmap(map_addr[s], map_size[s]); + close(map_fd[s]); + map_fd[s] = -1; + map_addr[s] = NULL; + map_size[s] = 0; + + et = prev_total; + ef = prev_free; + er = prev_resv; + es = prev_surp; + + /* + * When a VMA is unmapped, the instantiated (touched) pages are + * freed. If the pool is in a surplus state, pages are freed to the + * buddy allocator, otherwise they go back into the hugetlb pool. + * NOTE: This code assumes touched pages have only one user. + */ + for (i = 0; i < touched[s]; i++) { + if (es) { + et--; + es--; + } else + ef++; + } + + /* + * mmap may have created some surplus pages to accomodate a + * reservation. If those pages were not touched, then they will + * not have been freed by the code above. Free them here. + */ + if ((flags & MAP_SHARED) || private_resv) { + int unused_surplus = min(hpages - touched[s], es); + et -= unused_surplus; + ef -= unused_surplus; + er -= hpages - touched[s]; + es -= unused_surplus; + } + + verify_counters(line, et, ef, er, es); +} +#define unmap(s, h, f) _unmap(s, h, f, __LINE__) + +void _touch(int s, int hpages, int flags, int line) +{ + long et, ef, er, es; + int nr; + char *c; + + for (c = map_addr[s], nr = hpages; + hpages && c < map_addr[s] + map_size[s]; + c += hpage_size, nr--) + *c = (char) (nr % 2); + /* + * Keep track of how many pages were touched since we can't easily + * detect that from user space. + * NOTE: Calling this function more than once for a mmap may yield + * results you don't expect. 
Be careful :) + */ + touched[s] = max(touched[s], hpages); + + /* + * Shared (and private when supported) mappings and consume resv pages + * that were previously allocated. Also deduct them from the free count. + * + * Unreserved private mappings may need to allocate surplus pages to + * satisfy the fault. The surplus pages become part of the pool + * which could elevate total, free, and surplus counts. resv is + * unchanged but free must be decreased. + */ + if (flags & MAP_SHARED || private_resv) { + et = prev_total; + ef = prev_free - hpages; + er = prev_resv - hpages; + es = prev_surp; + } else { + if (hpages + prev_resv > prev_free) + et = prev_total + (hpages - prev_free + prev_resv); + else + et = prev_total; + er = prev_resv; + es = prev_surp + et - prev_total; + ef = prev_free - hpages + et - prev_total; + } + verify_counters(line, et, ef, er, es); +} +#define touch(s, h, f) _touch(s, h, f, __LINE__) + +void run_test(char *desc, int base_nr) +{ + verbose_printf("%s...\n", desc); + set_nr_hugepages(base_nr); + + /* untouched, shared mmap */ + map(SL_TEST, 1, MAP_SHARED); + unmap(SL_TEST, 1, MAP_SHARED); + + /* untouched, private mmap */ + map(SL_TEST, 1, MAP_PRIVATE); + unmap(SL_TEST, 1, MAP_PRIVATE); + + /* touched, shared mmap */ + map(SL_TEST, 1, MAP_SHARED); + touch(SL_TEST, 1, MAP_SHARED); + unmap(SL_TEST, 1, MAP_SHARED); + + /* touched, private mmap */ + map(SL_TEST, 1, MAP_PRIVATE); + touch(SL_TEST, 1, MAP_PRIVATE); + unmap(SL_TEST, 1, MAP_PRIVATE); + + /* Explicit resizing during outstanding surplus */ + /* Consume surplus when growing pool */ + map(SL_TEST, 2, MAP_SHARED); + set_nr_hugepages(max(base_nr, 1)); + + /* Add pages once surplus is consumed */ + set_nr_hugepages(max(base_nr, 3)); + + /* Release free huge pages first */ + set_nr_hugepages(max(base_nr, 2)); + + /* When shrinking beyond committed level, increase surplus */ + set_nr_hugepages(base_nr); + + /* Upon releasing the reservation, reduce surplus counts */ + unmap(SL_TEST, 2, 
MAP_SHARED); + + verbose_printf("OK.\n"); +} + +int main(int argc, char ** argv) +{ + int base_nr; + + test_init(argc, argv); + hpage_size = check_hugepagesize(); + saved_nr_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + verify_dynamic_pool_support(); + check_must_be_root(); + + if ((private_resv = kernel_has_private_reservations()) == -1) + FAIL("kernel_has_private_reservations() failed\n"); + + /* + * This test case should require a maximum of 3 huge pages. + * Run through the battery of tests multiple times, with an increasing + * base pool size. This alters the circumstances under which surplus + * pages need to be allocated and increases the corner cases tested. + */ + for (base_nr = 0; base_nr <= 3; base_nr++) { + verbose_printf("Base pool size: %i\n", base_nr); + /* Run the tests with a clean slate */ + run_test("Clean", base_nr); + + /* Now with a pre-existing untouched, shared mmap */ + map(SL_SETUP, 1, MAP_SHARED); + run_test("Untouched, shared", base_nr); + unmap(SL_SETUP, 1, MAP_SHARED); + + /* Now with a pre-existing untouched, private mmap */ + map(SL_SETUP, 1, MAP_PRIVATE); + run_test("Untouched, private", base_nr); + unmap(SL_SETUP, 1, MAP_PRIVATE); + + /* Now with a pre-existing touched, shared mmap */ + map(SL_SETUP, 1, MAP_SHARED); + touch(SL_SETUP, 1, MAP_SHARED); + run_test("Touched, shared", base_nr); + unmap(SL_SETUP, 1, MAP_SHARED); + + /* Now with a pre-existing touched, private mmap */ + map(SL_SETUP, 1, MAP_PRIVATE); + touch(SL_SETUP, 1, MAP_PRIVATE); + run_test("Touched, private", base_nr); + unmap(SL_SETUP, 1, MAP_PRIVATE); + } + + PASS(); +} diff --git a/tests/counters.sh b/tests/counters.sh new file mode 100755 index 0000000..e3ffabe --- /dev/null +++ b/tests/counters.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +. wrapper-utils.sh + +# Huge page overcommit was not available until 2.6.24 +compare_kvers `uname -r` "2.6.24" +if [ $? 
-eq 1 ]; then + EXP_RC=$RC_FAIL +else + EXP_RC=$RC_PASS +fi + +exec_and_check $EXP_RC counters "$@" diff --git a/tests/direct.c b/tests/direct.c new file mode 100644 index 0000000..3418422 --- /dev/null +++ b/tests/direct.c @@ -0,0 +1,101 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define P0 "ffffffff" +#define IOSZ 4096 +char buf[IOSZ] __attribute__ ((aligned (IOSZ))); +#define TMPFILE "/tmp/direct" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd, dfd; + void *p; + size_t ret; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + dfd = open(TMPFILE, O_CREAT|O_EXCL|O_DIRECT|O_RDWR, 0600); + if (dfd < 0) + CONFIG("Failed to open direct-IO file: %s", strerror(errno)); + unlink(TMPFILE); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap hugetlbfs file: %s", strerror(errno)); + + memcpy(p, P0, 8); + + /* Direct write from huge 
page */ + ret = write(dfd, p, IOSZ); + if (ret == -1) + FAIL("Direct-IO write from huge page: %s", strerror(errno)); + if (ret != IOSZ) + FAIL("Short direct-IO write from huge page"); + if (lseek(dfd, 0, SEEK_SET) == -1) + FAIL("lseek: %s", strerror(errno)); + + /* Check for accuracy */ + ret = read(dfd, buf, IOSZ); + if (ret == -1) + FAIL("Direct-IO read to normal memory: %s", strerror(errno)); + if (ret != IOSZ) + FAIL("Short direct-IO read to normal memory"); + if (memcmp(P0, buf, 8)) + FAIL("Memory mismatch after Direct-IO write"); + if (lseek(dfd, 0, SEEK_SET) == -1) + FAIL("lseek: %s", strerror(errno)); + + /* Direct read to huge page */ + memset(p, 0, IOSZ); + ret = read(dfd, p, IOSZ); + if (ret == -1) + FAIL("Direct-IO read to huge page: %s\n", strerror(errno)); + if (ret != IOSZ) + FAIL("Short direct-IO read to huge page"); + + /* Check for accuracy */ + if (memcmp(p, P0, 8)) + FAIL("Memory mismatch after Direct-IO read"); + + close(dfd); + unlink(TMPFILE); + + PASS(); +} diff --git a/tests/dummy.c b/tests/dummy.c new file mode 100644 index 0000000..0c02ff1 --- /dev/null +++ b/tests/dummy.c @@ -0,0 +1,31 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + + /* If we're even able to load, that's enough */ + PASS(); +} diff --git a/tests/empty_mounts.c b/tests/empty_mounts.c new file mode 100644 index 0000000..ea818ae --- /dev/null +++ b/tests/empty_mounts.c @@ -0,0 +1,69 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* We override the normal open, so libhugetlbfs gets an apparently + * empty /proc/mounts or /etc/mtab */ +int open(const char *path, int flags, ...) 
+{ + int (*old_open)(const char *, int, ...); + int fd; + + if ((strcmp(path, "/proc/mounts") == 0) + || (strcmp(path, "/etc/mtab") == 0)) + path = "/dev/null"; + + old_open = dlsym(RTLD_NEXT, "open"); + if (flags & O_CREAT) { + va_list ap; + + va_start(ap, flags); + fd = (*old_open)(path, flags, va_arg(ap, mode_t)); + va_end(ap); + return fd; + } else { + return (*old_open)(path, flags); + } +} + +int main(int argc, char *argv[]) +{ + int fd; + + test_init(argc, argv); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + PASS(); + + FAIL("Mysteriously found a mount"); +} diff --git a/tests/fadvise_reserve.c b/tests/fadvise_reserve.c new file mode 100644 index 0000000..9d72677 --- /dev/null +++ b/tests/fadvise_reserve.c @@ -0,0 +1,86 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _XOPEN_SOURCE 600 +#include +#include +#include +#include +#include +#include +#include + +#include +#include "hugetests.h" + +/* + * Test rationale: + * + * fadvise() on some kernels can cause the reservation counter to get + * corrupted. The problem is that the patches are allocated for the + * reservation but not faulted in at the time of allocation. 
The + * counters do not get updated and effectively "leak". This test + * identifies whether the kernel is vulnerable to the problem or not. + * It's fixed in kernel by commit f2deae9d4e70793568ef9e85d227abb7bef5b622. + */ +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long initial_rsvd, map_rsvd, fadvise_rsvd, end_rsvd; + + test_init(argc, argv); + + /* Setup */ + hpage_size = check_hugepagesize(); + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + initial_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count before map: %lu\n", initial_rsvd); + + /* mmap a region and record reservations */ + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + map_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after map: %lu\n", map_rsvd); + + /* fadvise the region and record reservations */ + if (posix_fadvise(fd, 0, hpage_size, POSIX_FADV_WILLNEED) == -1) + FAIL("fadvise(): %s", strerror(errno)); + fadvise_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after fadvise: %lu\n", fadvise_rsvd); + + /* Write the region */ + memset(p, 1, hpage_size); + + /* Free region */ + munmap(p, hpage_size); + close(fd); + end_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after close(): %lu\n", end_rsvd); + + /* Reserve count should match initial reserve count */ + if (end_rsvd != initial_rsvd) + FAIL("Reserve leaked: %lu != %lu\n", end_rsvd, initial_rsvd); + + PASS(); +} diff --git a/tests/fadvise_reserve.sh b/tests/fadvise_reserve.sh new file mode 100755 index 0000000..74496ec --- /dev/null +++ b/tests/fadvise_reserve.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# fadvise is known broken before 2.6.30 +compare_kvers `uname -r` "2.6.30" +if [ $? 
-eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC fadvise_reserve "$@" +fi + diff --git a/tests/fallocate_align.c b/tests/fallocate_align.c new file mode 100644 index 0000000..1ab2e94 --- /dev/null +++ b/tests/fallocate_align.c @@ -0,0 +1,143 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 20015 Mike Kravetz, Oracle Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define P "fallocate-align" +#define DESC \ + "* Test alignment of fallocate arguments. fallocate will take *\n"\ + "* non-huge page aligned offsets and addresses. However, *\n"\ + "* operations are only performed on huge pages. This is different *\n"\ + "* that than fallocate behavior in "normal" filesystems. 
*" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + int err; + unsigned long free_before, free_after; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + free_before = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + + /* + * First preallocate file with with just 1 byte. Allocation sizes + * are rounded up, so we should get an entire huge page. + */ + err = fallocate(fd, 0, 0, 1); + if (err) { + if (errno == EOPNOTSUPP) + IRRELEVANT(); + if (err) + FAIL("fallocate(): %s", strerror(errno)); + } + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_before - free_after != 1) + FAIL("fallocate 1 byte did not preallocate entire huge page\n"); + + /* + * Now punch a hole with just 1 byte. On hole punch, sizes are + * rounded down. So, this operation should not create a hole. + */ + err = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, 1); + if (err) + FAIL("fallocate(FALLOC_FL_PUNCH_HOLE): %s", strerror(errno)); + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_after == free_before) + FAIL("fallocate hole punch 1 byte free'ed a huge page\n"); + + /* + * Now punch a hole with of 2 * hpage_size - 1 byte. This size + * should be rounded down to a single huge page and the hole created. + */ + err = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, (2 * hpage_size) - 1); + if (err) + FAIL("fallocate(FALLOC_FL_PUNCH_HOLE): %s", strerror(errno)); + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_after != free_before) + FAIL("fallocate hole punch 2 * hpage_size - 1 byte did not free huge page\n"); + + /* + * Perform a preallocate operation with offset 1 and size of + * hpage_size. The offset should be rounded down and the + * size rounded up to preallocate two huge pages. 
+ */ + err = fallocate(fd, 0, 1, hpage_size); + if (err) + FAIL("fallocate(): %s", strerror(errno)); + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_before - free_after != 2) + FAIL("fallocate 1 byte offset, huge page size did not preallocate two huge pages\n"); + + /* + * The hole punch code will only delete 'whole' huge pags that are + * in the specified range. The offset is rounded up, and (offset + * + size) is rounded down to determine the huge pages to be deleted. + * In this case, after rounding the range is (hpage_size, hpage_size). + * So, no pages should be deleted. + */ + err = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 1, hpage_size); + if (err) + FAIL("fallocate(FALLOC_FL_PUNCH_HOLE): %s", strerror(errno)); + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_before - free_after != 2) + FAIL("fallocate hole punch 1 byte offset, huge page size incorrectly deleted a huge page\n"); + + /* + * To delete both huge pages, the range passed to hole punch must + * overlap the allocated pages + */ + err = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, 2 * hpage_size); + if (err) + FAIL("fallocate(FALLOC_FL_PUNCH_HOLE): %s", strerror(errno)); + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_after != free_before) + FAIL("fallocate hole punch did not delete two huge pages\n"); + + PASS(); +} diff --git a/tests/fallocate_align.sh b/tests/fallocate_align.sh new file mode 100755 index 0000000..5105151 --- /dev/null +++ b/tests/fallocate_align.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +. wrapper-utils.sh + +# +# hugetlbfs fallocate support was not available until 4.3 +# +compare_kvers `uname -r` "4.3.0" +if [ $? 
-eq 1 ]; then + echo "FAIL no fallocate support in kernels before 4.3.0" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC fallocate_basic "$@" +fi diff --git a/tests/fallocate_basic.c b/tests/fallocate_basic.c new file mode 100644 index 0000000..3e7bb7b --- /dev/null +++ b/tests/fallocate_basic.c @@ -0,0 +1,91 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2015 Mike Kravetz, Oracle Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define P "fallocate-basic" +#define DESC \ + "* Test basic fallocate functionality in hugetlbfs. Preallocate *\n"\ + "* huge pages to a file in hugetlbfs, and then remove the pages *\n"\ + "* via hole punch. *" + +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) + +#define MAX_PAGES_TO_USE 5 + +int main(int argc, char *argv[]) +{ + long hpage_size; + long nr_hpages_free; + int fd; + int err; + int max_iterations; + unsigned long free_before, free_after; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + nr_hpages_free = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + max_iterations = min(nr_hpages_free, MAX_PAGES_TO_USE); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + free_before = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + + /* First preallocate file with max_iterations pages */ + err = fallocate(fd, 0, 0, hpage_size * max_iterations); + if (err) { + if (errno == EOPNOTSUPP) + IRRELEVANT(); + if (err) + FAIL("fallocate(): %s", strerror(errno)); + } + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_before - free_after != max_iterations) + FAIL("fallocate did not preallocate %u huge pages\n", + max_iterations); + + /* Now punch a hole of the same size */ + err = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, hpage_size * max_iterations); + if (err) + FAIL("fallocate(FALLOC_FL_PUNCH_HOLE): %s", strerror(errno)); + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_after != free_before) + FAIL("fallocate hole punch did not release %u huge pages\n", + max_iterations); + + PASS(); +} diff --git a/tests/fallocate_basic.sh b/tests/fallocate_basic.sh new file mode 100755 index 0000000..904dfd6 --- /dev/null +++ b/tests/fallocate_basic.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +. wrapper-utils.sh + +# +# hugetlbfs fallocate support was not available until 4.3 +# +compare_kvers `uname -r` "4.3.0" +if [ $? 
-eq 1 ]; then + echo "FAIL no fallocate support in kernels before 4.3.0" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC fallocate_basic "$@" +fi diff --git a/tests/fallocate_stress.c b/tests/fallocate_stress.c new file mode 100644 index 0000000..121a2ca --- /dev/null +++ b/tests/fallocate_stress.c @@ -0,0 +1,225 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2015 Mike Kravetz, Oracle Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define P "fallocate-stress" +#define DESC \ + "* Stress test fallocate. This test starts three threads. Thread *\n"\ + "* one will continually punch/fill holes via falloc. Thread two *\n"\ + "* will continually fault in those same pages. Thread three will *\n"\ + "* continually mmap/munmap that page range. *" + +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) + +#define MAX_PAGES_TO_USE 100 + +static int htlb_fd; +static long max_hpages; +static long hpage_size; + +#define FALLOCATE_ITERATIONS 100000 +static void *thread_fallocate(void *arg) +{ + int i, err; + long tpage; + + for (i=0; i < FALLOCATE_ITERATIONS; i++) { + tpage = ((long long)random()) % (max_hpages); + err = fallocate(htlb_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + tpage * hpage_size, hpage_size); + if (err) + FAIL("fallocate(): %s", strerror(errno)); + err = fallocate(htlb_fd, 0, tpage * hpage_size, hpage_size); + if (err) + FAIL("fallocate(FALLOC_FL_PUNCH_HOLE): %s", + strerror(errno)); + } + + return NULL; +} + +static void *fault_mmap_addr = NULL; + +static void thread_fault_cleanup(void *arg) +{ + if (fault_mmap_addr) + munmap(fault_mmap_addr, max_hpages * hpage_size); +} + +static void *thread_fault(void *arg) +{ + long tpage; + char foo; + struct timespec ts; + + fault_mmap_addr = mmap(NULL, max_hpages * hpage_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + htlb_fd, 0); + if (fault_mmap_addr == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + pthread_cleanup_push(thread_fault_cleanup, NULL); + + ts.tv_sec = 0; + ts.tv_nsec = 0; + + while (1) { + tpage = ((long long)random()) % (max_hpages); + + foo = *((char *)(fault_mmap_addr + (tpage * hpage_size))); + *((char *)(fault_mmap_addr + (tpage * hpage_size))) = foo; + + nanosleep(&ts, NULL); /* thread cancellation point */ + } + + pthread_cleanup_pop(1); + + return NULL; +} + +static void *mmap_munmap_addr = NULL; + +static void thread_mmap_munmap_cleanup(void *arg) +{ + if (mmap_munmap_addr) + munmap(mmap_munmap_addr, max_hpages * hpage_size); +} + +static void *thread_mmap_munmap(void *arg) +{ + int err; + struct timespec ts; + + pthread_cleanup_push(thread_mmap_munmap_cleanup, NULL); + + ts.tv_sec = 0; + ts.tv_nsec = 0; + + while (1) { + mmap_munmap_addr = mmap(NULL, max_hpages * hpage_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, htlb_fd, 0); + if (mmap_munmap_addr == 
MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + err = munmap(mmap_munmap_addr, max_hpages * hpage_size); + if (err) + FAIL("munmap(): %s", strerror(errno)); + mmap_munmap_addr = NULL; + + nanosleep(&ts, NULL); /* thread cancellation point */ + } + + pthread_cleanup_pop(1); + + return NULL; +} + +int main(int argc, char *argv[]) +{ + long nr_hpages_free; + int err; + unsigned long free_before, free_after; + unsigned long rsvd_before, rsvd_after; + pthread_t falloc_th, fault_th, mmap_munmap_th; + void *falloc_th_ret, *fault_th_ret, *mmap_munmap_th_ret; + + test_init(argc, argv); + + srandom((int)getpid() * time(NULL)); + hpage_size = check_hugepagesize(); + nr_hpages_free = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + max_hpages = min(nr_hpages_free, MAX_PAGES_TO_USE); + + htlb_fd = hugetlbfs_unlinked_fd(); + if (htlb_fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + free_before = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + rsvd_before = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + + /* First preallocate file with max_hpages pages */ + err = fallocate(htlb_fd, 0, 0, hpage_size * max_hpages); + if (err) { + if (errno == EOPNOTSUPP) + IRRELEVANT(); + if (err) + FAIL("fallocate(): %s", strerror(errno)); + } + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (free_before - free_after != max_hpages) + FAIL("fallocate did not preallocate %ld huge pages\n", + max_hpages); + + err = pthread_create(&falloc_th, NULL, thread_fallocate, NULL); + if (err != 0) + FAIL("pthread_create(): %s\n", strerror(errno)); + err = pthread_create(&fault_th, NULL, thread_fault, NULL); + if (err != 0) + FAIL("pthread_create(): %s\n", strerror(errno)); + err = pthread_create(&mmap_munmap_th, NULL, thread_mmap_munmap, NULL); + if (err != 0) + FAIL("pthread_create(): %s\n", strerror(errno)); + + err = pthread_join(falloc_th, &falloc_th_ret); + if (err != 0) + FAIL("pthread_join(): %s\n", strerror(errno)); + if (falloc_th_ret) + 
FAIL("thread_fallocate unexpected exit code\n"); + + err = pthread_cancel(fault_th); + if (err != 0) + FAIL("pthread_cancel(): %s\n", strerror(errno)); + err = pthread_join(fault_th, &fault_th_ret); + if (err != 0) + FAIL("pthread_join(): %s\n", strerror(errno)); + + err = pthread_cancel(mmap_munmap_th); + if (err != 0) + FAIL("pthread_cancel(): %s\n", strerror(errno)); + err = pthread_join(mmap_munmap_th, &mmap_munmap_th_ret); + if (err != 0) + FAIL("pthread_join(): %s\n", strerror(errno)); + + if (close(htlb_fd)) + FAIL("close(): %s", strerror(errno)); + + free_after = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + rsvd_after = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + if (free_after != free_before || rsvd_after != rsvd_before) + FAIL("free or reserve counts not correct after fallocate stress testing\n"); + + PASS(); +} diff --git a/tests/fallocate_stress.sh b/tests/fallocate_stress.sh new file mode 100755 index 0000000..622084f --- /dev/null +++ b/tests/fallocate_stress.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +. wrapper-utils.sh + +# +# hugetlbfs fallocate support was not available until 4.3 +# +compare_kvers `uname -r` "4.3.0" +if [ $? -eq 1 ]; then + echo "FAIL no fallocate support in kernels before 4.3.0" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC fallocate_stress "$@" +fi diff --git a/tests/find_path.c b/tests/find_path.c new file mode 100644 index 0000000..86019da --- /dev/null +++ b/tests/find_path.c @@ -0,0 +1,44 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + const char *dir; + + test_init(argc, argv); + + dir = hugetlbfs_find_path(); + + if (! dir) + CONFIG("No hugepage mount"); + + verbose_printf("Found hugetlbfs path at %s\n", dir); + + if (hugetlbfs_test_path(dir) == 1) + PASS(); + + FAIL(""); +} diff --git a/tests/fork-cow.c b/tests/fork-cow.c new file mode 100644 index 0000000..70d3904 --- /dev/null +++ b/tests/fork-cow.c @@ -0,0 +1,176 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 David Gibson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +/* + * Test rationale: + * + * This checks copy-on-write semantics, specifically the semantics of + * a MAP_PRIVATE mapping across a fork(). Some versions of the + * powerpc kernel had a bug in huge_ptep_set_wrprotect() which would + * fail to flush the hash table after setting the write protect bit in + * the parent's page tables, thus allowing the parent to pollute the + * child's mapping. + */ + +#define RANDOM_CONSTANT 0x1234ABCD +#define OTHER_CONSTANT 0xfeef5678 + +/* + * The parent uses this to check if the child terminated badly. + */ +static void sigchld_handler(int signum, siginfo_t *si, void *uc) +{ + if (WEXITSTATUS(si->si_status) != 0) + FAIL("Child failed: %d", WEXITSTATUS(si->si_status)); + if (WIFSIGNALED(si->si_status)) + FAIL("Child recived signal %s", + strsignal(WTERMSIG(si->si_status))); +} + +int main(int argc, char ** argv) +{ + int fd, ret, status; + void *syncarea; + volatile unsigned int *p; + volatile unsigned int *trigger, *child_readback; + unsigned int parent_readback; + long hpage_size; + pid_t pid; + struct sigaction sa = { + .sa_sigaction = sigchld_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + check_free_huge_pages(2); + + if (argc != 1) + CONFIG("Usage: fork-cow\n"); + + /* Get a shared normal page for synchronization */ + verbose_printf("Mapping synchronization area.."); + syncarea = mmap(NULL, getpagesize(), PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS, -1, 0); + if (syncarea == MAP_FAILED) + FAIL("mmap() sync area: %s", strerror(errno)); + verbose_printf("done\n"); + + trigger = syncarea; + *trigger = 
0; + + child_readback = trigger + 1; + *child_readback = 0; + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + CONFIG("hugetlbfs_unlinked_fd() failed\n"); + + verbose_printf("Mapping hugepage area..."); + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + verbose_printf("mapped at %p\n", p); + + /* Touch the page for write in parent */ + verbose_printf("Parent writes pre-fork..."); + *p = RANDOM_CONSTANT; + verbose_printf("%x\n", RANDOM_CONSTANT); + + ret = sigaction(SIGCHLD, &sa, NULL); + if (ret) + FAIL("sigaction(): %s", strerror(errno)); + + if ((pid = fork()) < 0) + FAIL("fork(): %s", strerror(errno)); + + if (pid != 0) { + /* Parent */ + verbose_printf("Parent writes post-fork..."); + *p = ~RANDOM_CONSTANT; + verbose_printf("%x\n", ~RANDOM_CONSTANT); + + *trigger = 1; + + while (*trigger != 2) + ; + + verbose_printf("Parent reads.."); + parent_readback = *p; + verbose_printf("%x\n", parent_readback); + + *trigger = 3; + } else { + /* Child */ + verbose_printf("Child starts..\n"); + + while (*trigger != 1) + ; + + verbose_printf("Child reads..."); + *child_readback = *p; + verbose_printf("%x\n", *child_readback); + + verbose_printf("Child writes..."); + *p = OTHER_CONSTANT; + verbose_printf("%x\n", OTHER_CONSTANT); + + *trigger = 2; + + while (*trigger != 3) + ; + + verbose_printf("Child exits...\n"); + exit(0); + } + + verbose_printf("child_readback = 0x%x, parent_readback = 0x%x\n", + *child_readback, parent_readback); + + if (*child_readback != RANDOM_CONSTANT) + FAIL("Child read back 0x%x instead of 0x%x", + *child_readback, RANDOM_CONSTANT); + if (parent_readback != ~RANDOM_CONSTANT) + FAIL("Parent read back 0x%x instead of 0x%x", + parent_readback, RANDOM_CONSTANT); + + ret = waitpid(pid, &status, 0); + if (ret < 0) + FAIL("waitpid(): %s", strerror(errno)); + + PASS(); +} diff --git a/tests/get_huge_pages.c b/tests/get_huge_pages.c new 
file mode 100644 index 0000000..2dc4e3d --- /dev/null +++ b/tests/get_huge_pages.c @@ -0,0 +1,76 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +long hpage_size; +long oc_hugepages = -1; + +/* Restore nr_overcommit_hugepages */ +void cleanup(void) +{ + if (oc_hugepages != -1) + set_nr_overcommit_hugepages(hpage_size, oc_hugepages); +} + +/* Confirm a region really frees, only really important for GHP_FALLBACK */ +void free_and_confirm_region_free(void *p, int line) { + unsigned char vec = 0; + free_huge_pages(p); + if (mincore(p, 4, &vec) == 0 || vec) + FAIL("free_huge_pages did not free region at line %d", line); +} + +void test_get_huge_pages(int num_hugepages) +{ + unsigned long long mapping_size; + void *p = get_huge_pages(num_hugepages * hpage_size, GHP_DEFAULT); + if (p == NULL) + FAIL("get_huge_pages() for %d hugepages", num_hugepages); + + memset(p, 1, hpage_size); + + mapping_size = get_mapping_page_size( + (void *)p + (num_hugepages -1) * hpage_size); + if (mapping_size != hpage_size) + FAIL("Returned page is not hugepage"); + + 
free_and_confirm_region_free(p, __LINE__); + mapping_size = get_mapping_page_size( + (void *)p + (num_hugepages -1) * hpage_size); + if (mapping_size) + FAIL("hugepage was not correctly freed"); +} + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + hpage_size = gethugepagesize(); + check_free_huge_pages(4); + test_get_huge_pages(1); + test_get_huge_pages(4); + + PASS(); +} diff --git a/tests/get_hugepage_region.c b/tests/get_hugepage_region.c new file mode 100644 index 0000000..292d201 --- /dev/null +++ b/tests/get_hugepage_region.c @@ -0,0 +1,137 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +long hpage_size; +long oc_hugepages = -1; + +/* Restore nr_overcommit_hugepages */ +void cleanup(void) +{ + if (oc_hugepages != -1) + set_nr_overcommit_hugepages(hpage_size, oc_hugepages); +} + +/* Confirm a region really frees, only really important for GHR_FALLBACK */ +void free_and_confirm_region_free(void *p, int line) { + unsigned char vec = 0; + free_hugepage_region(p); + if (mincore(p, 4, &vec) == 0 || vec) + FAIL("free_hugepage_region did not free region at line %d", line); +} + +int test_unaligned_addr_huge(void *p) +{ + unsigned long long mapping_size; + p = (void *)((unsigned long)p & ~((gethugepagesize()) - 1)); + mapping_size = get_mapping_page_size(p); + return (mapping_size == hpage_size); +} + +#define TESTLEN ((num_hugepages - 1) * hpage_size + hpage_size / 2) + +void test_GHR_STRICT(int num_hugepages) +{ + int err; + void *p = get_hugepage_region(TESTLEN, GHR_DEFAULT); + if (p == NULL) + FAIL("get_hugepage_region() for %d hugepages", num_hugepages); + + memset(p, 1, TESTLEN); + + err = test_unaligned_addr_huge(p + (num_hugepages - 1) * hpage_size); + if (err != 1) + FAIL("Returned page is not hugepage"); + + free_and_confirm_region_free(p, __LINE__); + err = test_unaligned_addr_huge(p); + if (err == 1) + FAIL("hugepage was not correctly freed"); +} + +void test_GHR_FALLBACK(void) +{ + int err; + long rsvd_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + long num_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL) + - rsvd_hugepages; + + /* We must disable overcommitted huge pages to test this */ + oc_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_OC); + set_nr_overcommit_hugepages(hpage_size, 0); + + /* We 
should be able to allocate the whole pool */ + void *p = get_hugepage_region(TESTLEN, GHR_DEFAULT); + if (p == NULL) + FAIL("test_GHR_FALLBACK(GHR_DEFAULT) failed for %ld hugepages", + num_hugepages); + memset(p, 1, TESTLEN); + err = test_unaligned_addr_huge(p + (num_hugepages - 1) * hpage_size); + if (err != 1) + FAIL("Returned page is not hugepage"); + free_and_confirm_region_free(p, __LINE__); + + /* We should fail allocating too much */ + num_hugepages++; + p = get_hugepage_region(TESTLEN, GHR_STRICT); + if (p != NULL) + FAIL("test_GHR_FALLBACK() for %ld expected fail, got success", num_hugepages); + + /* GHR_FALLBACK should succeed by allocating base pages */ + p = get_hugepage_region(TESTLEN, GHR_FALLBACK); + if (p == NULL) + FAIL("test_GHR_FALLBACK(GHR_FALLBACK) failed for %ld hugepages", + num_hugepages); + memset(p, 1, TESTLEN); + err = test_unaligned_addr_huge(p + (num_hugepages - 1) * hpage_size); + if (err == 1) + FAIL("Returned page is not a base page"); + + /* + * We allocate a second fallback region to see can they be told apart + * on free. Merging VMAs would cause problems + */ + void *pb = get_hugepage_region(TESTLEN, GHR_FALLBACK); + if (pb == NULL) + FAIL("test_GHR_FALLBACK(GHR_FALLBACK) x2 failed for %ld hugepages", + num_hugepages); + memset(pb, 1, TESTLEN); + + free_and_confirm_region_free(pb, __LINE__); + free_and_confirm_region_free(p, __LINE__); +} + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + hpage_size = gethugepagesize(); + check_free_huge_pages(4); + test_GHR_STRICT(1); + test_GHR_STRICT(4); + test_GHR_FALLBACK(); + + PASS(); +} diff --git a/tests/get_hugetlbfs_path.c b/tests/get_hugetlbfs_path.c new file mode 100644 index 0000000..a3de22a --- /dev/null +++ b/tests/get_hugetlbfs_path.c @@ -0,0 +1,40 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * Copyright (C) 2006 Nishanth Aravamudan, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + const char *dir; + + dir = hugetlbfs_find_path(); + + if (!dir) + return -1; + + printf("%s\n", dir); + + return 0; +} diff --git a/tests/gethugepagesize.c b/tests/gethugepagesize.c new file mode 100644 index 0000000..7668b04 --- /dev/null +++ b/tests/gethugepagesize.c @@ -0,0 +1,44 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + long hpage_size; + + test_init(argc, argv); + + hpage_size = gethugepagesize(); + + if (hpage_size > 0) { + verbose_printf("Huge page size is %ld bytes\n", hpage_size); + PASS(); + } + + if (hpage_size < 0) + CONFIG("No hugepage kernel support"); + + FAIL(""); +} diff --git a/tests/gethugepagesizes.c b/tests/gethugepagesizes.c new file mode 100644 index 0000000..9551b38 --- /dev/null +++ b/tests/gethugepagesizes.c @@ -0,0 +1,420 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hugetests.h" + +int faked_data = 0; +char fake_sysfs[] = "/tmp/sysfs-XXXXXX"; +char fake_meminfo[] = "/tmp/meminfo-XXXXXX"; + +#define REAL_SYSFS_DIR "/sys/kernel/mm/hugepages/" +DIR *(*real_opendir)(const char *name); + +int (*real_open)(const char *name, int flags, int mode); + +enum { + OVERRIDE_OFF, /* Pass-through to real function */ + OVERRIDE_ON, /* Ovewrride with local function */ + OVERRIDE_MISSING, /* Emulate missing support */ +}; +int meminfo_state = OVERRIDE_OFF; +int sysfs_state = OVERRIDE_OFF; + +/* + * Override opendir so we'll open the fake sysfs dir if intended + */ +DIR *opendir(const char *name) +{ + if (!real_opendir) + real_opendir = dlsym(RTLD_NEXT, "opendir"); + + /* Only override calls to the sysfs dir */ + if (strcmp(name, REAL_SYSFS_DIR)) + return real_opendir(name); + + switch (sysfs_state) { + case OVERRIDE_OFF: + return real_opendir(name); + case OVERRIDE_ON: + /* Only safe to override of fake_sysfs was set up */ + if (faked_data) + return real_opendir(fake_sysfs); + else + FAIL("Trying to override opendir before initializing " + "fake_sysfs directory\n"); + default: + errno = ENOENT; + return NULL; + } +} + +#define HPAGE_KB 2048 +#define __HPAGE_STR_QUOTE(val) #val +#define __HPAGE_STR(val) __HPAGE_STR_QUOTE(val) +#define HPAGE_STR __HPAGE_STR(HPAGE_KB) + +/* + * Override open to simulate various contents for meminfo + */ +int open(const char *file, int flags, ...) 
+{ + int mode = 0; + if (flags & O_CREAT) { + va_list arg; + va_start(arg, flags); + mode = va_arg(arg, int); + va_end(arg); + } + + if (!real_open) + real_open = dlsym(RTLD_NEXT, "open"); + + switch (meminfo_state) { + case OVERRIDE_OFF: + break; + case OVERRIDE_ON: { + char fname[PATH_MAX]; + sprintf(fname, "%s/meminfo-hugepages", fake_meminfo); + file = fname; + break; + } + case OVERRIDE_MISSING: { + char fname[PATH_MAX]; + sprintf(fname, "%s/meminfo-none", fake_meminfo); + file = fname; + break; + } + default: + return -1; + } + return real_open(file, flags, mode); +} + +void cleanup_fake_data(void) +{ + DIR *dir; + struct dirent *ent; + char fname[PATH_MAX+1]; + + meminfo_state = OVERRIDE_OFF; + sysfs_state = OVERRIDE_OFF; + + faked_data = 0; + dir = opendir(fake_sysfs); + if (!dir) + FAIL("opendir %s: %s", fake_sysfs, strerror(errno)); + + while ((ent = readdir(dir))) { + if (strncmp(ent->d_name, "hugepages-", 10)) + continue; + snprintf(fname, PATH_MAX, "%s/%s", fake_sysfs, + ent->d_name); + if (rmdir(fname)) + FAIL("rmdir %s: %s", fake_sysfs, strerror(errno)); + } + closedir(dir); + if (rmdir(fake_sysfs)) + FAIL("rmdir %s: %s", fake_sysfs, strerror(errno)); + + sprintf(fname, "%s/meminfo-none", fake_meminfo); + if (unlink(fname) < 0) + FAIL("unlink %s: %s", fname, strerror(errno)); + sprintf(fname, "%s/meminfo-hugepages", fake_meminfo); + if (unlink(fname) < 0) + FAIL("unlink %s: %s", fname, strerror(errno)); + if (rmdir(fake_meminfo)) + FAIL("rmdir %s: %s", fake_meminfo, strerror(errno)); +} + +char *meminfo_base = "\ +MemTotal: 4004132 kB\n\ +MemFree: 3563748 kB\n\ +Buffers: 34804 kB\n\ +Cached: 252544 kB\n\ +SwapCached: 0 kB\n\ +Active: 108912 kB\n\ +Inactive: 187420 kB\n\ +SwapTotal: 8008392 kB\n\ +SwapFree: 8008392 kB\n\ +Dirty: 4 kB\n\ +Writeback: 0 kB\n\ +AnonPages: 9100 kB\n\ +Mapped: 7908 kB\n\ +Slab: 40212 kB\n\ +SReclaimable: 33312 kB\n\ +SUnreclaim: 6900 kB\n\ +PageTables: 1016 kB\n\ +NFS_Unstable: 0 kB\n\ +Bounce: 0 kB\n\ +WritebackTmp: 0 
kB\n\ +CommitLimit: 9974616 kB\n\ +Committed_AS: 29616 kB\n\ +VmallocTotal: 34359738367 kB\n\ +VmallocUsed: 23760 kB\n\ +VmallocChunk: 34359714543 kB\n\ +"; + +char *meminfo_huge = "\ +HugePages_Total: 35\n\ +HugePages_Free: 35\n\ +HugePages_Rsvd: 0\n\ +HugePages_Surp: 0\n\ +Hugepagesize: " HPAGE_STR " kB\n\ +"; + +void setup_fake_data(long sizes[], int n_elem) +{ + int old_meminfo_state = meminfo_state; + int old_sysfs_state = sysfs_state; + + int i; + char fname[PATH_MAX+1]; + int fd; + + meminfo_state = OVERRIDE_OFF; + sysfs_state = OVERRIDE_OFF; + + if (faked_data) + cleanup_fake_data(); + + /* Generate some fake sysfs data. */ + if (!mkdtemp(fake_sysfs)) + FAIL("mkdtemp: %s", strerror(errno)); + faked_data = 1; + + for (i = 0; i < n_elem; i++) { + snprintf(fname, PATH_MAX, "%s/hugepages-%lukB", fake_sysfs, + sizes[i] / 1024); + if (mkdir(fname, 0700)) + FAIL("mkdir %s: %s", fname, strerror(errno)); + } + + /* Generate fake meminfo data. */ + if (!mkdtemp(fake_meminfo)) + FAIL("mkdtemp: %s", strerror(errno)); + + sprintf(fname, "%s/meminfo-none", fake_meminfo); + fd = open(fname, O_WRONLY|O_CREAT); + if (fd < 0) + FAIL("open: %s", strerror(errno)); + if (write(fd, meminfo_base, + strlen(meminfo_base)) != strlen(meminfo_base)) + FAIL("write: %s", strerror(errno)); + if (close(fd) < 0) + FAIL("close: %s", strerror(errno)); + + sprintf(fname, "%s/meminfo-hugepages", fake_meminfo); + fd = open(fname, O_WRONLY|O_CREAT); + if (fd < 0) + FAIL("open: %s", strerror(errno)); + if (write(fd, meminfo_base, + strlen(meminfo_base)) != strlen(meminfo_base)) + FAIL("write: %s", strerror(errno)); + if (write(fd, meminfo_huge, + strlen(meminfo_huge)) != strlen(meminfo_huge)) + FAIL("write: %s", strerror(errno)); + if (close(fd) < 0) + FAIL("close: %s", strerror(errno)); + + meminfo_state = old_meminfo_state; + sysfs_state = old_sysfs_state; +} + +void cleanup(void) +{ + if (faked_data) + cleanup_fake_data(); +} + +void validate_sizes(int line, long actual_sizes[], int actual, + 
int max, int maxmax, + long expected_sizes[], int expected) +{ + int i, j; + + verbose_printf("Line %d: Expecting sizes:", line); + for (i = 0; i < expected; i++) + verbose_printf(" %ld", expected_sizes[i]); + verbose_printf("\n"); + verbose_printf("Line %d: Actual sizes are:", line); + for (i = 0; i < actual; i++) + verbose_printf(" %ld", actual_sizes[i]); + verbose_printf("\n"); + + if (((expected <= max) && (expected != actual)) + || ((expected > max) && (actual < max))) + FAIL("Line %i: Wrong number of sizes returned -- expected %i " + "got %i", line, expected, actual); + else if (actual > max) + FAIL("Line %i: %i sizes returned > maximum %i", + line, actual, max); + + for (i = 0; i < actual; i++) { + for (j = 0; j < expected; j++) + if (actual_sizes[i] == expected_sizes[j]) + break; + if (j >= expected) + FAIL("Line %i: Actual size %li not found in expected " + "results", line, expected_sizes[i]); + } + + for (i = 0; i < actual; i++) + for (j = i+1; j < actual; j++) + if (actual_sizes[i] == actual_sizes[j]) + FAIL("Line %i: Duplicate size %li at %i/%i", + line, actual_sizes[i], i, j); + + for (i = actual; i < maxmax; i++) + if (actual_sizes[i] != 42) + FAIL("Line %i: Wrote past official limit at %i", + line, i); +} + +#define MAX 16 +#define EXPECT_SIZES(func, max, count, expected) \ +({ \ + long __a[MAX] = { [0 ... MAX-1] = 42 }; \ + int __na; \ + \ + __na = func(__a, max); \ + \ + validate_sizes(__LINE__, __a, __na, max, MAX, expected, count); \ + \ + __na; \ +}) + +#define INIT_LIST(a, values...) 
\ +({ \ + long __e[] = { values }; \ + memcpy(a, __e, sizeof(__e)); \ +}) + +int main(int argc, char *argv[]) +{ + int i, fakes_no; + long expected_sizes[MAX], actual_sizes[MAX], fake_sizes[MAX]; + long base_size = sysconf(_SC_PAGESIZE); + + test_init(argc, argv); + + /* + * === + * Argment error checking tests + * === + */ + meminfo_state = OVERRIDE_OFF; + sysfs_state = OVERRIDE_OFF; + kernel_default_hugepage_size_reset(); + + if (gethugepagesizes(actual_sizes, -1) != -1 || errno != EINVAL) + FAIL("Mishandled params (n_elem < 0)"); + if (gethugepagesizes(NULL, 1) != -1 || errno != EINVAL) + FAIL("Mishandled params (pagesizes == NULL, n_elem > 0)"); + + if (getpagesizes(actual_sizes, -1) != -1 || errno != EINVAL) + FAIL("Mishandled params (n_elem < 0)"); + if (getpagesizes(NULL, 1) != -1 || errno != EINVAL) + FAIL("Mishandled params (pagesizes == NULL, n_elem > 0)"); + + /* + * === + * Test some corner cases using a fake system configuration + * === + */ + + INIT_LIST(expected_sizes, HPAGE_KB * 1024, 1024 * 1024, 64 * 1024); + fakes_no = 0; + for (i = 0; i < 3; i++) + /* don't include base_size in 'fake' hugepagesizes */ + if (base_size != expected_sizes[i]) { + fake_sizes[fakes_no] = expected_sizes[i]; + fakes_no++; + } + setup_fake_data(fake_sizes, fakes_no); + + /* + * Check handling when /proc/meminfo indicates no huge page support + * and the sysfs heirachy is not present. + */ + meminfo_state = OVERRIDE_MISSING; + sysfs_state = OVERRIDE_MISSING; + kernel_default_hugepage_size_reset(); + + EXPECT_SIZES(gethugepagesizes, MAX, 0, expected_sizes); + + INIT_LIST(expected_sizes, base_size); + EXPECT_SIZES(getpagesizes, MAX, 1, expected_sizes); + + /* ... only the meminfo size is returned. 
*/ + meminfo_state = OVERRIDE_ON; + kernel_default_hugepage_size_reset(); + + INIT_LIST(expected_sizes, HPAGE_KB * 1024); + EXPECT_SIZES(gethugepagesizes, MAX, 1, expected_sizes); + + INIT_LIST(expected_sizes, base_size, HPAGE_KB * 1024); + EXPECT_SIZES(getpagesizes, MAX, 2, expected_sizes); + + /* + * When sysfs defines additional sizes ... + */ + sysfs_state = OVERRIDE_ON; + kernel_default_hugepage_size_reset(); + + memcpy(expected_sizes, fake_sizes, sizeof(fake_sizes)); + + /* ... make sure all sizes are returned without duplicates */ + /* ... while making sure we do not overstep our limit */ + EXPECT_SIZES(gethugepagesizes, MAX, fakes_no, expected_sizes); + EXPECT_SIZES(gethugepagesizes, 1, fakes_no, expected_sizes); + EXPECT_SIZES(gethugepagesizes, 2, fakes_no, expected_sizes); + EXPECT_SIZES(gethugepagesizes, 3, fakes_no, expected_sizes); + EXPECT_SIZES(gethugepagesizes, 4, fakes_no, expected_sizes); + + memcpy(expected_sizes, fake_sizes, sizeof(fake_sizes)); + expected_sizes[fakes_no] = base_size; + EXPECT_SIZES(getpagesizes, MAX, fakes_no + 1, expected_sizes); + EXPECT_SIZES(getpagesizes, 1, fakes_no + 1, expected_sizes); + EXPECT_SIZES(getpagesizes, 2, fakes_no + 1, expected_sizes); + EXPECT_SIZES(getpagesizes, 3, fakes_no + 1, expected_sizes); + EXPECT_SIZES(getpagesizes, 4, fakes_no + 1, expected_sizes); + EXPECT_SIZES(getpagesizes, 5, fakes_no + 1, expected_sizes); + + /* ... we can check how many sizes are supported. */ + if (gethugepagesizes(NULL, 0) != fakes_no) + FAIL("Unable to check the number of supported sizes"); + + if (getpagesizes(NULL, 0) != fakes_no + 1) + FAIL("Unable to check the number of supported sizes"); + + PASS(); +} diff --git a/tests/heap-overflow.c b/tests/heap-overflow.c new file mode 100644 index 0000000..044c3fd --- /dev/null +++ b/tests/heap-overflow.c @@ -0,0 +1,110 @@ +/* + * Test heap overflow for libhugetlbfs. + * Copyright 2008 Cray Inc. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, 5th Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +long oc_pool = -1; +long hpagesize; + +void cleanup(void) +{ + if (oc_pool > 0) + restore_overcommit_pages(hpagesize, oc_pool); +} + +int main(int argc, char **argv) +{ + int freepages; + long size1, size2; + void *p1, *p2; + int st, pid, rv; + unsigned long long mapping_size; + + test_init(argc, argv); + + if (!getenv("HUGETLB_MORECORE")) + CONFIG("Must have HUGETLB_MORECORE=yes"); + + hpagesize = check_hugepagesize(); + + /* Must be root because this test modifies the overcommit pool */ + check_must_be_root(); + + oc_pool = read_nr_overcommit(hpagesize); + if (oc_pool > 0) + set_nr_overcommit_hugepages(hpagesize, 0); + + freepages = get_huge_page_counter(hpagesize, HUGEPAGES_FREE); + if (freepages < 3) + CONFIG("Must have at least 3 free hugepages"); + + /* + * Allocation 1: one hugepage. Due to malloc overhead, morecore + * will probably mmap two hugepages. 
+ */ + size1 = hpagesize; + p1 = malloc(size1); + if (!p1) + FAIL("Couldn't malloc %ld bytes", size1); + mapping_size = get_mapping_page_size(p1); + if (mapping_size != hpagesize) + FAIL("First allocation %p not on hugepages", p1); + + /* + * Allocation 2: all free hugepages to ensure we exhaust the free pool. + */ + size2 = freepages * hpagesize; + p2 = malloc(size2); + if (!p2) + FAIL("Couldn't malloc %ld bytes", size2); + mapping_size = get_mapping_page_size(p1); + st = (mapping_size == hpagesize); + verbose_printf("Second allocation %p huge? %s\n", p2, st < 0 ? "??" : + (st ? "yes" : "no")); + + /* + * Touch the pages in a child process. Kernel sends a SIGKILL if + * we run out of hugepages. + */ + pid = fork(); + if (pid < 0) + FAIL("fork: %s", strerror(errno)); + + if (pid == 0) { + memset(p1, 0, size1); + memset(p2, 0, size2); + exit(0); + } + + rv = waitpid(pid, &st, 0); + if (rv < 0) + FAIL("waitpid: %s\n", strerror(errno)); + if (WIFSIGNALED(st)) + FAIL("Child killed by signal %d touching malloc'ed memory", + WTERMSIG(st)); + + PASS(); +} diff --git a/tests/heapshrink-helper.c b/tests/heapshrink-helper.c new file mode 100644 index 0000000..e793ff6 --- /dev/null +++ b/tests/heapshrink-helper.c @@ -0,0 +1,25 @@ +/* + * Test heap shrinking for libhugetlbfs. + * Copyright 2008 Cray Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, 5th Floor, Boston, MA 02110-1301, USA. + */ + +#include + +static void __attribute__((constructor)) setup_heapshrink_helper(void) +{ + (void) malloc(1); +} diff --git a/tests/heapshrink.c b/tests/heapshrink.c new file mode 100644 index 0000000..d2934aa --- /dev/null +++ b/tests/heapshrink.c @@ -0,0 +1,96 @@ +/* + * Test heap shrinking for libhugetlbfs. + * Copyright 2007 Cray Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, 5th Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include "hugetests.h" + +/* + * We cannot test mapping size against huge page size because we are not linked + * against libhugetlbfs so gethugepagesize() won't work. So instead we define + * our MIN_PAGE_SIZE as 64 kB (the largest base page available) and make sure + * the mapping page size is larger than this. + */ +#define MIN_PAGE_SIZE 65536 +#define MAX(a, b) a > b ? 
a : b + +int main(int argc, char **argv) +{ + int is_huge, have_env, shrink_ok, have_helper, tcache_enabled; + unsigned long long mapping_size; + void *p; + long size = MAX(32*1024*1024, kernel_default_hugepage_size()); + + test_init(argc, argv); + + have_env = getenv("HUGETLB_MORECORE") != NULL; + shrink_ok = getenv("HUGETLB_MORECORE_SHRINK") != NULL; + p = getenv("LD_PRELOAD"); + have_helper = p != NULL && strstr(p, "heapshrink") != NULL; + + /* + * After upstream commit: (glibc-2.25.90-688-gd5c3fafc43) glibc has a + * new per-thread caching mechanism that will NOT allow this test to + * successfully measure if heap has shrunk or not due to the fact that + * heap won't have its sized reduced right away. + * + * In order to disable it you need to have the tunable GLIBC in place. + * Unfortunately, it requires to be set before program is loaded, as an + * environment variable, since we can't re-initialize malloc() from the + * program context (not even with a constructor function), and the + * tunable is only evaluated during malloc() initialization. + * + * GLIBC_TUNABLES=glibc.malloc.tcache_count=0 + */ + p = getenv("GLIBC_TUNABLES"); + tcache_enabled = p != NULL && strstr(p, "malloc.tcache_count=0"); + + p = malloc(size); + if (!p) { + if (shrink_ok && have_helper) { + /* Hitting unexpected behavior in malloc() */ + PASS_INCONCLUSIVE(); + } else + FAIL("malloc(%ld) failed\n", size); + } + memset(p, 0, size); + mapping_size = get_mapping_page_size(p); + is_huge = (mapping_size > MIN_PAGE_SIZE); + if (have_env && !is_huge) { + if (shrink_ok && have_helper) { + /* Hitting unexpected behavior in malloc() */ + PASS_INCONCLUSIVE(); + } else + FAIL("Heap not on hugepages"); + } + if (!have_env && is_huge) + FAIL("Heap unexpectedly on hugepages"); + + free(p); + mapping_size = get_mapping_page_size(p+size-1); + if (shrink_ok && mapping_size > MIN_PAGE_SIZE) { + if (tcache_enabled) + FAIL("Heap did not shrink"); + else + FAIL("Heap didn't shrink. 
Check malloc.tcache_count=0"); + } + + PASS(); +} diff --git a/tests/huge_at_4GB_normal_below.c b/tests/huge_at_4GB_normal_below.c new file mode 100644 index 0000000..0f5d8b7 --- /dev/null +++ b/tests/huge_at_4GB_normal_below.c @@ -0,0 +1,111 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* + * Test rationale: + * + * Designed to pick up a bug on ppc64 where + * touches_hugepage_high_range() falsely reported true for ranges + * reaching below 4GB + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. 
+ */ + +int main(int argc, char *argv[]) +{ + int page_size; + long hpage_size; + int fd; + void *p, *q; + unsigned long lowaddr; + int err; + + test_init(argc, argv); + + page_size = getpagesize(); + + hpage_size = check_hugepagesize(); + + if (sizeof(void *) <= 4) + IRRELEVANT(); + + if (hpage_size > FOURGB) + CONFIG("Huge page size is too large"); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap((void *)FOURGB, hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (p == MAP_FAILED) { + /* slice 0 (high) spans from 4G-1T */ + unsigned long below_start = FOURGB; + unsigned long above_end = 1024L*1024*1024*1024; + if (range_is_mapped(below_start, above_end) == 1) { + verbose_printf("region 4G-1T is not free\n"); + verbose_printf("mmap() failed: %s\n", strerror(errno)); + PASS_INCONCLUSIVE(); + } else + FAIL("mmap() huge: %s\n", strerror(errno)); + } + if (p != (void *)FOURGB) + FAIL("Wrong address with MAP_FIXED huge"); + + verbose_printf("Mapped hugetlb at %p\n", p); + + memset(p, 0, hpage_size); + + err = test_addr_huge(p); + if (err != 1) + FAIL("Mapped address is not hugepage"); + + /* Test just below 4GB to check for off-by-one errors */ + lowaddr = FOURGB - page_size; + q = mmap((void *)lowaddr, page_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED|MAP_ANONYMOUS, 0, 0); + if (p == MAP_FAILED) { + unsigned long below_start = FOURGB - page_size; + unsigned long above_end = FOURGB; + if (range_is_mapped(below_start, above_end) == 1) { + verbose_printf("region (4G-page)-4G is not free\n"); + verbose_printf("mmap() failed: %s\n", strerror(errno)); + PASS_INCONCLUSIVE(); + } else + FAIL("mmap() normal: %s\n", strerror(errno)); + } + if (q != (void *)lowaddr) + FAIL("Wrong address with MAP_FIXED normal"); + + memset(q, 0, page_size); + + PASS(); +} diff --git a/tests/huge_below_4GB_normal_above.c b/tests/huge_below_4GB_normal_above.c new file mode 100644 index 0000000..b78bee8 --- /dev/null +++ 
b/tests/huge_below_4GB_normal_above.c @@ -0,0 +1,142 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* + * Test rationale: + * + * Designed to pick up a bug on ppc64 where + * touches_hugepage_low_range() could give false positives because of + * the peculiar (undefined) behaviour of << for large shifts + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. 
+ */ + +int main(int argc, char *argv[]) +{ + int page_size; + long hpage_size; + int fd; + void *p, *q; + unsigned long lowaddr, highaddr; + int err; + + test_init(argc, argv); + + page_size = getpagesize(); + + hpage_size = check_hugepagesize(); + + if (sizeof(void *) <= 4) + IRRELEVANT(); + + if (hpage_size > FOURGB) + CONFIG("Huge page size is too large"); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + + + /* We use a low address right below 4GB so we can test for + * off-by-one errors */ + lowaddr = FOURGB - hpage_size; + verbose_printf("Mapping hugepage at at %lx...", lowaddr); + p = mmap((void *)lowaddr, hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) { + /* This is last low slice - 256M just before 4G */ + unsigned long below_start = FOURGB - 256L*1024*1024; + unsigned long above_end = FOURGB; + if (range_is_mapped(below_start, above_end) == 1) { + verbose_printf("region (4G-256M)-4G is not free\n"); + verbose_printf("mmap() failed: %s\n", strerror(errno)); + PASS_INCONCLUSIVE(); + } else + FAIL("mmap() huge: %s\n", strerror(errno)); + } + if (p != (void *)lowaddr) + FAIL("Wrong address with MAP_FIXED huge"); + verbose_printf("done\n"); + + memset(p, 0, hpage_size); + + err = test_addr_huge(p); + if (err != 1) + FAIL("Mapped address is not hugepage"); + + /* Test for off by one errors */ + highaddr = FOURGB; + verbose_printf("Mapping normal page at %lx...", highaddr); + q = mmap((void *)highaddr, page_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED|MAP_ANONYMOUS, 0, 0); + if (p == MAP_FAILED) { + unsigned long below_start = FOURGB; + unsigned long above_end = FOURGB + page_size; + if (range_is_mapped(below_start, above_end) == 1) { + verbose_printf("region 4G-(4G+page) is not free\n"); + verbose_printf("mmap() failed: %s\n", strerror(errno)); + PASS_INCONCLUSIVE(); + } else + FAIL("mmap() normal 1: %s\n", strerror(errno)); + } + if (q != (void *)highaddr) + FAIL("Wrong 
address with MAP_FIXED normal 2"); + verbose_printf("done\n"); + + memset(q, 0, page_size); + + /* Why this address? Well on ppc64, we're working with 256MB + * segment numbers, hence >>28. In practice the shift + * instructions only start wrapping around with shifts 128 or + * greater. */ + highaddr = ((lowaddr >> 28) + 128) << 28; + verbose_printf("Mapping normal page at %lx...", highaddr); + q = mmap((void *)highaddr, page_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED|MAP_ANONYMOUS, 0, 0); + if (p == MAP_FAILED) { + unsigned long below_start = highaddr; + unsigned long above_end = highaddr + page_size; + if (range_is_mapped(below_start, above_end) == 1) { + verbose_printf("region haddr-(haddr+page) not free\n"); + verbose_printf("mmap() failed: %s\n", strerror(errno)); + PASS_INCONCLUSIVE(); + } else + FAIL("mmap() normal 2: %s\n", strerror(errno)); + } + if (q != (void *)highaddr) + FAIL("Wrong address with MAP_FIXED normal 2"); + verbose_printf("done\n"); + + memset(q, 0, page_size); + + PASS(); +} diff --git a/tests/hugetests.h b/tests/hugetests.h new file mode 100644 index 0000000..8b1d8d9 --- /dev/null +++ b/tests/hugetests.h @@ -0,0 +1,143 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MECHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HUGETESTS_H +#define _HUGETESTS_H + +#include +#include + +#include "libhugetlbfs_privutils.h" +#include "libhugetlbfs_testprobes.h" + +#define DEBUG + +/* Test return codes */ +#define RC_PASS 0 +#define RC_CONFIG 1 +#define RC_FAIL 2 +#define RC_XFAIL 3 /* Expected Failure */ +#define RC_XPASS 4 /* Unexpected Pass */ +#define RC_BUG 99 + +#define FOURGB (1UL << 32) + +extern int verbose_test; +extern char *test_name; +void check_free_huge_pages(int nr_pages_needed); +void check_must_be_root(void); +void check_hugetlb_shm_group(void); +void test_init(int argc, char *argv[]); +int test_addr_huge(void *p); +unsigned long long get_mapping_page_size(void *p); +long read_meminfo(const char *tag); +ino_t get_addr_inode(void *p); +int range_is_mapped(unsigned long low, unsigned long high); + +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) +#define PALIGN(p, a) ((void *)ALIGN((unsigned long)(p), (a))) + +#ifndef barrier +# ifdef mb +# define barrier() mb() +# else +# define barrier() __asm__ __volatile__ ("" : : : "memory") +# endif +#endif + +/* Each test case must define this function */ +void cleanup(void); + +#define verbose_printf(...) \ + if (verbose_test) { \ + printf(__VA_ARGS__); \ + fflush(stdout); \ + } +#define ERR "ERR: " +#define ERROR(fmt, args...) fprintf(stderr, ERR fmt, ## args) + + +#define PASS() \ + do { \ + cleanup(); \ + printf("PASS\n"); \ + exit(RC_PASS); \ + } while (0) + +#define PASS_INCONCLUSIVE() \ + do { \ + cleanup(); \ + printf("PASS (inconclusive)\n"); \ + exit(RC_PASS); \ + } while (0) + +#define IRRELEVANT() \ + do { \ + cleanup(); \ + printf("PASS (irrelevant)\n"); \ + exit(RC_PASS); \ + } while (0) + +/* Look out, gcc extension below... */ +#define FAIL(fmt, ...) 
\ + do { \ + cleanup(); \ + printf("FAIL\t" fmt "\n", ##__VA_ARGS__); \ + exit(RC_FAIL); \ + } while (0) + +#define CONFIG(fmt, ...) \ + do { \ + cleanup(); \ + printf("Bad configuration: " fmt "\n", ##__VA_ARGS__); \ + exit(RC_CONFIG); \ + } while (0) + +#define TEST_BUG(fmt, ...) \ + do { \ + cleanup(); \ + printf("BUG in testsuite: " fmt "\n", ##__VA_ARGS__); \ + exit(RC_BUG); \ + } while (0) + +/* stressutils.c stuff */ +int remove_shmid(int shmid); + +extern long gethugepagesize (void) __attribute__ ((weak)); + +static inline long check_hugepagesize() +{ + long __hpage_size = gethugepagesize(); + if (__hpage_size < 0) { + if (errno == ENOSYS) + CONFIG("No hugepage kernel support\n"); + else if (errno == EOVERFLOW) + CONFIG("Hugepage size too large"); + else + CONFIG("Hugepage size (%s)", strerror(errno)); + } + return __hpage_size; +} + +int using_system_hpage_size(const char *mount); + +/* WARNING: Racy -- use for test cases only! */ +int kernel_has_private_reservations(void); + +#endif /* _HUGETESTS_H */ diff --git a/tests/icache-hygiene.c b/tests/icache-hygiene.c new file mode 100644 index 0000000..0e344d9 --- /dev/null +++ b/tests/icache-hygiene.c @@ -0,0 +1,226 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Test rationale: + * + * Older ppc64 kernels don't properly flush dcache to icache before + * giving a cleared page to userspace. With some exceedingly hairy + * code, this attempts to test for this bug. + * + * This test will never trigger (obviously) on machines with coherent + * icache and dcache (including x86 and POWER5). On any given run, + * even on a buggy kernel there's a chance the bug won't trigger - + * either because we don't get the same physical page back when we + * remap, or because the icache happens to get flushed in the interim. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define COPY_SIZE 128 +#define NUM_REPETITIONS 64 /* Seems to be enough to trigger reliably */ + +static long hpage_size; + +static void cacheflush(void *p) +{ +#if defined(__powerpc__) + asm volatile("dcbst 0,%0; sync; icbi 0,%0; isync" : : "r"(p)); +#elif defined(__arm__) || defined(__aarch64__) + __clear_cache(p, p + COPY_SIZE); +#endif +} + +static void jumpfunc(int copy, void *p) +{ + /* gcc bug workaround: if there is exactly one &&label + * construct in the function, gcc assumes the computed goto + * goes there, leading to the complete elision of the goto in + * this case */ + void *l = &&dummy; + l = &&jumplabel; + + if (copy) { + memcpy(p, l, COPY_SIZE); + cacheflush(p); + } + + goto *p; + dummy: + printf("unreachable?\n"); + + jumplabel: + return; +} + +static sigjmp_buf sig_escape; +static void *sig_expected; + +static void sig_handler(int signum, siginfo_t *si, void *uc) +{ +#if defined(__powerpc__) || defined(__powerpc64__) || defined(__ia64__) || \ + defined(__s390__) || defined(__s390x__) || defined(__sparc__) || \ + 
defined(__aarch64__) + /* On powerpc, ia64, s390 and Aarch64, 0 bytes are an illegal + * instruction, so, if the icache is cleared properly, we SIGILL + * as soon as we jump into the cleared page */ + if (signum == SIGILL) { + verbose_printf("SIGILL at %p (sig_expected=%p)\n", si->si_addr, + sig_expected); + if (si->si_addr == sig_expected) { + siglongjmp(sig_escape, 1); + } + FAIL("SIGILL somewhere unexpected"); + } +#elif defined(__i386__) || defined(__x86_64__) || defined(__arm__) + /* On x86, zero bytes form a valid instruction: + * add %al,(%eax) (i386) + * or add %al,(%rax) (x86_64) + * + * So, behaviour depends on the contents of [ER]AX, which in + * turn depends on the details of code generation. If [ER]AX + * contains a valid pointer, we will execute the instruction + * repeatedly until we run off that hugepage and get a SIGBUS + * on the second, truncated page. If [ER]AX does not contain + * a valid pointer, we will SEGV on the first instruction in + * the cleared page. We check for both possibilities + * below. + * + * On 32 bit ARM, zero bytes are interpreted as follows: + * andeq r0, r0, r0 (ARM state, 4 bytes) + * movs r0, r0 (Thumb state, 2 bytes) + * + * So, we only expect to run off the end of the huge page and + * generate a SIGBUS. 
*/ + if (signum == SIGBUS) { + verbose_printf("SIGBUS at %p (sig_expected=%p)\n", si->si_addr, + sig_expected); + if (sig_expected + && (ALIGN((unsigned long)sig_expected, gethugepagesize()) + == (unsigned long)si->si_addr)) { + siglongjmp(sig_escape, 2); + } + FAIL("SIGBUS somewhere unexpected"); + } +#if defined(__x86_64__) || defined(__i386__) + if (signum == SIGSEGV) { +#ifdef __x86_64__ + void *pc = (void *)((ucontext_t *)uc)->uc_mcontext.gregs[REG_RIP]; +#else + void *pc = (void *)((ucontext_t *)uc)->uc_mcontext.gregs[REG_EIP]; +#endif + + verbose_printf("SIGSEGV at %p, PC=%p (sig_expected=%p)\n", + si->si_addr, pc, sig_expected); + if (sig_expected == pc) { + siglongjmp(sig_escape, 1); + } + FAIL("SIGSEGV somewhere unexpected"); + } +#endif +#else +#error Need to setup signal conditions for this arch +#endif +} + +static void test_once(int fd) +{ + void *p, *q; + + ftruncate(fd, 0); + + if (sigsetjmp(sig_escape, 1)) { + sig_expected = NULL; + ftruncate(fd, 0); + return; + } + + p = mmap(NULL, 2*hpage_size, PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + ftruncate(fd, hpage_size); + + q = p + hpage_size - COPY_SIZE; + + jumpfunc(1, q); + + ftruncate(fd, 0); + p = mmap(p, hpage_size, PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_SHARED|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + q = p + hpage_size - COPY_SIZE; + sig_expected = q; + + jumpfunc(0, q); /* This should blow up */ + + FAIL("icache unclean"); +} + +int main(int argc, char *argv[]) +{ + int fd; + int err; + int i; + + test_init(argc, argv); + + struct sigaction sa = { + .sa_sigaction = sig_handler, + .sa_flags = SA_SIGINFO, + }; + + hpage_size = check_hugepagesize(); + + err = sigaction(SIGILL, &sa, NULL); + if (err) + FAIL("Can't install SIGILL handler: %s", strerror(errno)); + + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("Can't install SIGBUS handler: %s", strerror(errno)); + + err 
= sigaction(SIGSEGV, &sa, NULL); + if (err) + FAIL("Can't install SIGSEGV handler: %s", strerror(errno)); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + CONFIG("Couldn't get hugepage fd"); + + for (i = 0; i < NUM_REPETITIONS; i++) + test_once(fd); + + PASS(); +} diff --git a/tests/large_mounts.c b/tests/large_mounts.c new file mode 100644 index 0000000..14376e9 --- /dev/null +++ b/tests/large_mounts.c @@ -0,0 +1,117 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Eric Munson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define BUF_SIZE 4096 +#define FILLER "tmpfs /var/run tmpfs rw,nosuid,nodev,noexec,mode=755 0 0\n" + +int in_test; /* = 0; */ +int tmp_mounts_fd; /* = 0; */ +FILE *tmp_stream; /* = NULL; */ + +/* + * We override the normal open, so we can remember the fd for the + * mounts file + */ +int open(const char *path, int flags, ...) 
+{ + int (*old_open)(const char *, int, ...); + int fd; + va_list ap; + + old_open = dlsym(RTLD_NEXT, "open"); + if (in_test && strcmp(path, "/proc/mounts") == 0) + return tmp_mounts_fd; + va_start(ap, flags); + fd = (old_open)(path, flags, va_arg(ap, mode_t)); + va_end(ap); + return fd; +} + +void make_test_mounts() +{ + char buf[BUF_SIZE]; + int mounts_fd; + unsigned int written = 0; + int ret; + int filler_sz; + + mounts_fd = open("/proc/mounts", O_RDONLY); + if (mounts_fd < 0) + FAIL("Unable to open /proc/mounts: %s", strerror(errno)); + tmp_stream = tmpfile(); + if (!tmp_stream) + FAIL("Unable to open temporary mounts file: %s", strerror(errno)); + + tmp_mounts_fd = fileno(tmp_stream); + if (tmp_mounts_fd < 0) + FAIL("Unable to get file descriptor from stream."); + + filler_sz = strlen(FILLER); + + while (written < BUF_SIZE) { + if (write(tmp_mounts_fd, FILLER, filler_sz) < 0) + FAIL("Unable to write to temp mounts file: %s", + strerror(errno)); + written += filler_sz; + } + + while ((ret = read(mounts_fd, buf, BUF_SIZE)) > 0) + if (write(tmp_mounts_fd, buf, ret) < 0) + FAIL("Unable to write to temp mounts file: %s", + strerror(errno)); + + close(mounts_fd); + if (lseek(tmp_mounts_fd, 0, SEEK_SET) < 0) + FAIL("Unable to move temp mounts stream to beginning of file: %s", + strerror(errno)); +} + +int main(int argc, char *argv[]) +{ + int fd; + + make_test_mounts(); + test_init(argc, argv); + in_test = 1; + + fd = hugetlbfs_unlinked_fd(); + + fclose(tmp_stream); + if (fd < 0) + FAIL("Unable to find mount point\n"); + + PASS(); +} diff --git a/tests/libtestutils.c b/tests/libtestutils.c new file mode 100644 index 0000000..4eeb880 --- /dev/null +++ b/tests/libtestutils.c @@ -0,0 +1,138 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LARGEFILE64_SOURCE +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hugetlbfs.h" +#include "libhugetlbfs_privutils.h" +#include "hugetests.h" + +void check_free_huge_pages(int nr_pages_needed) +{ + long hpage_size = gethugepagesize(); + int freepages = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (freepages < nr_pages_needed) + CONFIG("Must have at least %i free hugepages", nr_pages_needed); +} + +int using_system_hpage_size(const char *mount) +{ + struct statfs64 sb; + int err; + long meminfo_size, mount_size; + + if (!mount) + FAIL("using_system_hpage_size: hugetlbfs is not mounted\n"); + + err = statfs64(mount, &sb); + if (err) + FAIL("statfs64: %s\n", strerror(errno)); + + meminfo_size = read_meminfo("Hugepagesize:"); + if (meminfo_size < 0) + FAIL("using_system_hpage_size: Failed to read /proc/meminfo\n"); + + mount_size = sb.f_bsize / 1024; /* Compare to meminfo in kB */ + if (mount_size == meminfo_size) + return 1; + else + return 0; +} + +/* WARNING: This function relies on the hugetlb pool counters in a way that + * is known to be racy. 
Due to the expected usage of hugetlbfs test cases, the + * risk of a race is acceptible. This function should NOT be used for real + * applications. + */ +int kernel_has_private_reservations(void) +{ + int fd; + long t, f, r, s; + long nt, nf, nr, ns; + long hpage_size = gethugepagesize(); + void *map; + + /* Read pool counters */ + t = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + f = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + r = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + s = get_huge_page_counter(hpage_size, HUGEPAGES_SURP); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) { + ERROR("kernel_has_private_reservations: hugetlbfs_unlinked_fd: " + "%s\n", strerror(errno)); + return -1; + } + map = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (map == MAP_FAILED) { + ERROR("kernel_has_private_reservations: mmap: %s\n", + strerror(errno)); + return -1; + } + + /* Recheck the counters */ + nt = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + nf = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + nr = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + ns = get_huge_page_counter(hpage_size, HUGEPAGES_SURP); + + munmap(map, hpage_size); + close(fd); + + /* + * There are only three valid cases: + * 1) If a surplus page was allocated to create a reservation, all + * four pool counters increment + * 2) All counters remain the same except for Hugepages_Rsvd, then + * a reservation was created using an existing pool page. 
+ * 3) All counters remain the same, indicates that no reservation has + * been created + */ + if ((nt == t + 1) && (nf == f + 1) && (ns == s + 1) && (nr == r + 1)) { + return 1; + } else if ((nt == t) && (nf == f) && (ns == s)) { + if (nr == r + 1) + return 1; + else if (nr == r) + return 0; + } else { + ERROR("kernel_has_private_reservations: bad counter state - " + "T:%li F:%li R:%li S:%li -> T:%li F:%li R:%li S:%li\n", + t, f, r, s, nt, nf, nr, ns); + } + return -1; +} diff --git a/tests/linkhuge.c b/tests/linkhuge.c new file mode 100644 index 0000000..05d9924 --- /dev/null +++ b/tests/linkhuge.c @@ -0,0 +1,176 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include "hugetests.h" + +#define BLOCK_SIZE 16384 +#define CONST 0xdeadbeef + +#define BIG_INIT { \ + [0] = CONST, [17] = CONST, [BLOCK_SIZE-1] = CONST, \ +} +static int small_data = 1; +static int big_data[BLOCK_SIZE] = BIG_INIT; + +static int small_bss; +static int big_bss[BLOCK_SIZE]; + +const int small_const = CONST; +const int big_const[BLOCK_SIZE] = BIG_INIT; + +static int static_func(int x) +{ + return x; +} + +int global_func(int x) +{ + return x; +} + +static struct test_entry { + const char *name; + void *data; + int size; + char linkchar; + int writable, execable; + int is_huge; +} testtab[] = { +#define RWENT(name, linkchar) { #name, &name, sizeof(name), linkchar, 1, 0, } +#define ROENT(name, linkchar) { #name, (void *)&name, sizeof(name), linkchar, 0, 0, } +#define RXENT(name, linkchar) { #name, &name, sizeof(name), linkchar, 0, 1, } + RWENT(small_data, 'D'), + RWENT(big_data, 'D'), + RWENT(small_bss, 'B'), + RWENT(big_bss, 'B'), + ROENT(small_const, 'T'), + ROENT(big_const, 'T'), + RXENT(static_func, 'T'), + RXENT(global_func, 'T'), +}; + +#define NUM_TESTS (sizeof(testtab) / sizeof(testtab[0])) + +static char link_string[32]; + +static void get_link_string(const char *argv0) +{ + const char *p, *q; + + /* Find program basename */ + p = strrchr(argv0, '/'); + if (p) + p++; + else + p = argv0; + + if (*p != 'x') + return; /* just a plain ordinary link */ + + q = strchr(p, '.'); + if (!q) + /* ERROR? 
*/ + return; + + memcpy(link_string, p, q-p); +} + +static void do_test(struct test_entry *te) +{ + int i; + volatile int *p = te->data; + + if (te->writable) { + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i] = CONST ^ i; + + barrier(); + + for (i = 0; i < (te->size / sizeof(*p)); i++) + if (p[i] != (CONST ^ i)) + FAIL("mismatch on %s", te->name); + } else if (te->execable) { + int (*pf)(int) = te->data; + + if ((*pf)(CONST) != CONST) + FAIL("%s returns incorrect results", te->name); + } else { + /* Otherwise just read touch it */ + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i]; + } + + te->is_huge = (test_addr_huge(te->data) == 1); +} + +int main(int argc, char *argv[]) +{ + int i; + char *env; + int elfmap_inhibited; + + test_init(argc, argv); + + get_link_string(argv[0]); + + env = getenv("HUGETLB_ELFMAP"); + + verbose_printf("Link string is [%s], HUGETLB_ELFMAP=%s\n", + link_string, env); + + elfmap_inhibited = env && (strcasecmp(env, "no") == 0); + + for (i = 0; i < NUM_TESTS; i++) { + do_test(testtab + i); + } + + verbose_printf("Hugepages used for:"); + for (i = 0; i < NUM_TESTS; i++) + if (testtab[i].is_huge) + verbose_printf(" %s", testtab[i].name); + verbose_printf("\n"); + + for (i = 0; i < NUM_TESTS; i++) { + char linkchar = testtab[i].linkchar; + + if (elfmap_inhibited) { + if (testtab[i].is_huge) + FAIL("%s is hugepage despite HUGETLB_ELFMAP=%s\n", + testtab[i].name, env); + } else { + if (linkchar && strchr(link_string, linkchar)) { + if (! 
testtab[i].is_huge) + FAIL("%s is not hugepage\n", + testtab[i].name); + } + if (linkchar && !strchr(link_string, linkchar)) { + if (testtab[i].is_huge) + FAIL("%s is hugepage\n", + testtab[i].name); + } + } + } + PASS(); +} diff --git a/tests/linkhuge_nofd.c b/tests/linkhuge_nofd.c new file mode 100644 index 0000000..f04cd8e --- /dev/null +++ b/tests/linkhuge_nofd.c @@ -0,0 +1,42 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include "hugetests.h" + +/* Override the working version from libhugetlbfs */ +int hugetlbfs_unlinked_fd_for_size(long page_size) +{ + return -1; +} + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + + /* All we're testing is that we survive the library attempting + * and failing to remap us into hugepages */ + + PASS(); +} diff --git a/tests/linkhuge_rw.c b/tests/linkhuge_rw.c new file mode 100644 index 0000000..c1c2e96 --- /dev/null +++ b/tests/linkhuge_rw.c @@ -0,0 +1,264 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2008 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hugetests.h" + +#define BLOCK_SIZE 16384 +#define CONST 0xdeadbeef +#define RETURN_ADDRESS 0x0 + +#define BIG_INIT { \ + [0] = CONST, [17] = CONST, [BLOCK_SIZE-1] = CONST, \ +} +static int small_data = 1; +static int big_data[BLOCK_SIZE] = BIG_INIT; + +static int small_bss; +static int big_bss[BLOCK_SIZE]; + +const int small_const = CONST; +const int big_const[BLOCK_SIZE] = BIG_INIT; + +/* + * Turn function pointer into address from .text. + * + * On some ABIs function pointer may not refer to .text section. For example + * on powerPC 64-bit ABI, function pointer may refer to call stub from + * .opd section. + * + * This function expects that parameter data is a function pointer of type: + * long f(long), and when called with special parameter, it returns an address + * corresponding to actual code of the function. Current implementation relies + * on gcc's __builtin_return_address, see get_pc() below. 
+ */ +static inline void *get_text_addr(void *data) +{ + long (*gettext)(long) = data; + + return (void *)gettext(RETURN_ADDRESS); +} + +static void __attribute__ ((noinline)) *get_pc(void) +{ +#if defined(__s390__) && __WORDSIZE == 32 + /* taken from sysdeps/unix/sysv/linux/s390/s390-32/profil-counter.h + * 31-bit s390 pointers don't use the 32th bit, however integers do, + * so wrap the value around at 31 bits */ + return (void *) + ((unsigned long) __builtin_return_address(0) & 0x7fffffffUL); +#else + return __builtin_return_address(0); +#endif +} + +static long static_func(long x) +{ + if (x == RETURN_ADDRESS) + return (long)get_pc(); + return x; +} + +long global_func(long x) +{ + if (x == RETURN_ADDRESS) + return (long)get_pc(); + return x; +} + +static struct test_entry { + const char *name; + void *data; + int size; + int writable; + int execable; + int is_huge; +} testtab[] = { +#define ENT(entry_name, exec) { \ + .name = #entry_name, \ + .data = (void *)&entry_name, \ + .size = sizeof(entry_name), \ + .writable = 0, \ + .execable = exec } + + ENT(small_data, 0), + ENT(big_data, 0), + ENT(small_bss, 0), + ENT(big_bss, 0), + ENT(small_const, 0), + ENT(big_const, 0), + ENT(static_func, 1), + ENT(global_func, 1), +}; + + +#define NUM_TESTS (sizeof(testtab) / sizeof(testtab[0])) + +static +int parse_elf(struct dl_phdr_info *info, size_t size, void *data) +{ + int i; + unsigned long text_end, data_start; + long *min_align = (long *)data; + long actual_align; + + text_end = data_start = 0; + for (i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type != PT_LOAD) + continue; + + if (info->dlpi_phdr[i].p_flags & PF_X) + text_end = info->dlpi_phdr[i].p_vaddr + + info->dlpi_phdr[i].p_memsz; + else if (info->dlpi_phdr[i].p_flags & PF_W) + data_start = info->dlpi_phdr[i].p_vaddr; + + if (text_end && data_start) + break; + } + + actual_align = (data_start - text_end) / 1024; + if (actual_align < *min_align) + FAIL("Binary not suitably aligned"); + + return 
1; +} + +static void check_if_writable(struct test_entry *te) +{ + int pid, ret, status; + + pid = fork(); + if (pid < 0) + FAIL("fork: %s", strerror(errno)); + else if (pid == 0) { + void *data; + + if (te->execable) + data = get_text_addr(te->data); + else + data = te->data; + + (*(char *)data) = 0; + exit (0); + } else { + ret = waitpid(pid, &status, 0); + if (ret < 0) + FAIL("waitpid(): %s", strerror(errno)); + if (WIFSIGNALED(status)) + te->writable = 0; + else + te->writable = 1; + } +} + +static void do_test(struct test_entry *te) +{ + int i; + void *data = te->data; + + check_if_writable(te); + verbose_printf("entry: %s, data: %p, writable: %d\n", + te->name, data, te->writable); + + if (te->writable) { + volatile int *p = data; + + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i] = CONST ^ i; + + barrier(); + + for (i = 0; i < (te->size / sizeof(*p)); i++) + if (p[i] != (CONST ^ i)) + FAIL("mismatch on %s", te->name); + } else if (te->execable) { + long (*pf)(long) = data; + + data = get_text_addr(data); + + if ((*pf)(CONST) != CONST) + FAIL("%s returns incorrect results", te->name); + } else { + /* Otherwise just read touch it */ + volatile int *p = data; + + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i]; + } + + te->is_huge = (test_addr_huge(data) == 1); + verbose_printf("entry: %s, data: %p, is_huge: %d\n", + te->name, data, te->is_huge); +} + +int main(int argc, char *argv[]) +{ + int i; + char *env; + int elfmap_readonly, elfmap_writable; + long hpage_size = gethugepagesize() / 1024; + + test_init(argc, argv); + + /* Test that the binary has been aligned enough by the linker */ + if ((argc > 1) && !strcmp("--test-alignment", argv[1])) + dl_iterate_phdr(parse_elf, &hpage_size); + + env = getenv("HUGETLB_ELFMAP"); + verbose_printf("HUGETLB_ELFMAP=%s\n", env); + + elfmap_readonly = env && strchr(env, 'R'); + elfmap_writable = env && strchr(env, 'W'); + + for (i = 0; i < NUM_TESTS; i++) { + do_test(testtab + i); + } + + 
verbose_printf("Hugepages used for:"); + for (i = 0; i < NUM_TESTS; i++) + if (testtab[i].is_huge) + verbose_printf(" %s", testtab[i].name); + verbose_printf("\n"); + + for (i = 0; i < NUM_TESTS; i++) { + if (testtab[i].writable) { + if (elfmap_writable && !testtab[i].is_huge) + FAIL("%s is not hugepage", testtab[i].name); + if (!elfmap_writable && testtab[i].is_huge) + FAIL("%s is hugepage", testtab[i].name); + } else if (!testtab[i].writable) { + if (elfmap_readonly && !testtab[i].is_huge) + FAIL("%s is not hugepage", testtab[i].name); + if (!elfmap_readonly && testtab[i].is_huge) + FAIL("%s is hugepage", testtab[i].name); + } + } + PASS(); +} diff --git a/tests/linkshare.c b/tests/linkshare.c new file mode 100644 index 0000000..f86e041 --- /dev/null +++ b/tests/linkshare.c @@ -0,0 +1,373 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2006 Nishanth Aravamudan, IBM Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hugetests.h" + +#define BLOCK_SIZE 16384 +#define CONST 0xdeadbeef +#define SHM_KEY 0xdeadcab +#define NUM_CHILDREN 2 + +#define BIG_INIT { \ + [0] = CONST, [17] = CONST, [BLOCK_SIZE-1] = CONST, \ +} +static int small_data = 1; +static int big_data[BLOCK_SIZE] = BIG_INIT; + +static int small_bss; +static int big_bss[BLOCK_SIZE]; + +const int small_const = CONST; +const int big_const[BLOCK_SIZE] = BIG_INIT; + +static int static_func(int x) +{ + return x; +} + +int global_func(int x) +{ + return x; +} + +static struct test_entry { + const char *name; + void *data; + int size; + char linkchar; + int writable, execable; + int is_huge; +} testtab[] = { +#define RWENT(name, linkchar) { #name, &name, sizeof(name), linkchar, 1, 0, } +#define ROENT(name, linkchar) { #name, (void *)&name, sizeof(name), linkchar, 0, 0, } +#define RXENT(name, linkchar) { #name, &name, sizeof(name), linkchar, 0, 1, } + RWENT(small_data, 'D'), + RWENT(big_data, 'D'), + RWENT(small_bss, 'B'), + RWENT(big_bss, 'B'), + ROENT(small_const, 'T'), + ROENT(big_const, 'T'), + RXENT(static_func, 'T'), + RXENT(global_func, 'T'), +}; + +#define NUM_TESTS (sizeof(testtab) / sizeof(testtab[0])) + +static int sharing; +static int elfmap_off; +static int shmid; +static ino_t *shm; + +static char link_string[32]; + +static void get_link_string(const char *argv0) +{ + const char *p, *q; + + /* Find program basename */ + p = strrchr(argv0, '/'); + if (p) + p++; + else + p = argv0; + + if (*p != 'x') + return; /* just a plain ordinary link */ + + q = strchr(p, '.'); + if (!q) + /* ERROR? 
*/ + return; + + memcpy(link_string, p, q-p); +} + +static ino_t do_test(struct test_entry *te) +{ + int i; + volatile int *p = te->data; + + if (te->writable) { + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i] = CONST ^ i; + + barrier(); + + for (i = 0; i < (te->size / sizeof(*p)); i++) { + if (p[i] != (CONST ^ i)) { + verbose_printf("mismatch on %s", te->name); + exit(RC_FAIL); + } + } + } else if (te->execable) { + int (*pf)(int) = te->data; + + if ((*pf)(CONST) != CONST) { + verbose_printf("%s returns incorrect results", te->name); + exit(RC_FAIL); + } + } else { + /* Otherwise just read touch it */ + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i]; + } + + te->is_huge = (test_addr_huge(te->data) == 1); + + return get_addr_inode(te->data); +} + +static void parse_env(void) +{ + char *env; + + env = getenv("HUGETLB_ELFMAP"); + if (env && (strcasecmp(env, "no") == 0)) { + verbose_printf("Segment remapping disabled\n"); + elfmap_off = 1; + } else { + env = getenv("HUGETLB_SHARE"); + if (env) + sharing = atoi(env); + verbose_printf("Segment remapping enabled, " + "sharing = %d\n", sharing); + } +} + +static pid_t spawn_child(char *self, int index) +{ + int ret; + char execarg1[5]; + + ret = snprintf(execarg1, 5, "%d", index); + if (ret < 0) + FAIL("snprintf failed: %s", strerror(errno)); + + ret = fork(); + if (ret) { + if (ret < 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("fork failed: %s", + strerror(errno)); + } + } else { + ret = execlp(self, self, execarg1, NULL); + if (ret) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("execl(%s, %s, %s failed: %s", + self, self, execarg1, + strerror(errno)); + } + } + + return ret; +} + +static int child_process(char *self, int index) +{ + int i; + ino_t ino; + + get_link_string(self); + + shmid = shmget(SHM_KEY, NUM_CHILDREN * NUM_TESTS * + sizeof(ino_t), 0666); + if (shmid < 0) { + verbose_printf("Child's shmget failed: %s", strerror(errno)); + exit(RC_FAIL); + } + + shm = shmat(shmid, 
NULL, 0); + if (shm == (void *)-1) { + verbose_printf("Child's shmat failed: %s", strerror(errno)); + exit(RC_FAIL); + } + + for (i = 0; i < NUM_TESTS; i++) { + if (!test_addr_huge(testtab + i)) { + /* don't care about non-huge addresses */ + shm[index * NUM_TESTS + i] = 0; + } else { + ino = do_test(testtab + i); + if ((int)ino < 0) { + shmdt(shm); + exit(RC_FAIL); + } + shm[index * NUM_TESTS + i] = ino; + } + } + shmdt(shm); + return 0; +} + +static void verify_inodes() +{ + int i, j; + + for (i = 0; i < NUM_TESTS; i++) { + ino_t base = shm[i]; + for (j = 1; j < NUM_CHILDREN; j++) { + ino_t comp = shm[j * NUM_TESTS + i]; + if (base != comp) { + /* + * we care if we mismatch if + * sharing only read-only + * segments and this is one + */ + if (sharing == 1 && testtab[i].writable == 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("Inodes do not match " + "(%u != %u)", + (int)base, (int)comp); + } + } else { + /* + * we care if we match if + * a) not remapping or + * b) not sharing or + * c) sharing only read-only + * segments and this is not one + * BUT only if the inode is not + * 0 (don't care about the file) + */ + if (base == 0) + continue; + + if (elfmap_off == 1 || sharing == 0 || + (sharing == 1 && testtab[i].writable == 1)) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + if (sharing == 1 && testtab[i].writable == 1) + verbose_printf("Incorrectly sharing a writable segment...\n"); + FAIL("Inodes match, but we should not be " + "sharing this segment (%d == %d)", + (int)base, (int)comp); + } + } + } + } +} + +static void sigsegv_handler(int signum, siginfo_t *si, void *context) +{ + FAIL("Segmentation fault in parent at address %p", si->si_addr); +} + +int main(int argc, char *argv[], char *envp[]) +{ + test_init(argc, argv); + + if (argc == 1) { + /* + * first process + */ + pid_t children_pids[NUM_CHILDREN]; + int ret, i; + int status; + /* + * We catch children's segfaults via waitpid's status, + * but this is to catch the parent itself 
segfaulting. + * This can happen, for instance, if an old (bad) + * segment file is left lying around in the hugetlbfs + * mountpoint + */ + struct sigaction sa_seg = { + .sa_sigaction = sigsegv_handler, + .sa_flags = SA_SIGINFO, + }; + + parse_env(); + + ret = sigaction(SIGSEGV, &sa_seg, NULL); + if (ret < 0) + FAIL("Installing SIGSEGV handler failed: %s", + strerror(errno)); + + shmid = shmget(SHM_KEY, NUM_CHILDREN * NUM_TESTS * + sizeof(ino_t), IPC_CREAT | IPC_EXCL | + 0666); + if (shmid < 0) + FAIL("Parent's shmget failed: %s", strerror(errno)); + + shm = shmat(shmid, NULL, 0); + if (shm == (void *)-1) + FAIL("Parent's shmat failed: %s", strerror(errno)); + + for (i = 0; i < NUM_CHILDREN; i++) + children_pids[i] = spawn_child(argv[0], i); + + for (i = 0; i < NUM_CHILDREN; i++) { + ret = waitpid(children_pids[i], &status, 0); + if (ret < 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("waitpid failed: %s", strerror(errno)); + } + + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("Child %d exited with non-zero status: %d", + i + 1, WEXITSTATUS(status)); + } + + if (WIFSIGNALED(status)) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("Child %d killed by signal: %s", i + 1, + strsignal(WTERMSIG(status))); + } + } + + verify_inodes(); + + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + PASS(); + } else { + if (argc == 2) { + /* + * child process + * arg1 = index + 1 into shared memory array + */ + child_process(argv[0], atoi(argv[1])); + } else { + FAIL("Invalid arguments\n"); + } + } + + return 0; +} diff --git a/tests/madvise_reserve.c b/tests/madvise_reserve.c new file mode 100644 index 0000000..2f7bd67 --- /dev/null +++ b/tests/madvise_reserve.c @@ -0,0 +1,81 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include + +#include +#include "hugetests.h" + +/* + * Test rationale: + * + * madvise() on some kernels can cause the reservation counter to get + * corrupted. The problem is that the patches are allocated for the + * reservation but not faulted in at the time of allocation. The + * counters do not get updated and effectively "leak". This test + * identifies whether the kernel is vunerable to the problem or not. 
+ * It is fixed in kernel by commit f2deae9d4e70793568ef9e85d227abb7bef5b622 + */ +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long initial_rsvd, map_rsvd, madvise_rsvd, end_rsvd; + + test_init(argc, argv); + + /* Setup */ + hpage_size = check_hugepagesize(); + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + initial_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count before map: %lu\n", initial_rsvd); + + /* mmap a region and record reservations */ + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + map_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after map: %lu\n", map_rsvd); + + /* madvise the region and record reservations */ + if (madvise(p, hpage_size, MADV_WILLNEED) == -1) + FAIL("madvise(): %s", strerror(errno)); + madvise_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after madvise: %lu\n", madvise_rsvd); + + /* Free region */ + munmap(p, hpage_size); + close(fd); + end_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after close(): %lu\n", end_rsvd); + + /* Reserve count should match initial reserve count */ + if (end_rsvd != initial_rsvd) + FAIL("Reserve leaked: %lu != %lu\n", end_rsvd, initial_rsvd); + + PASS(); +} diff --git a/tests/madvise_reserve.sh b/tests/madvise_reserve.sh new file mode 100755 index 0000000..cfe582d --- /dev/null +++ b/tests/madvise_reserve.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# madvise is known broken before 2.6.30 +compare_kvers `uname -r` "2.6.30" +if [ $? 
-eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC madvise_reserve "$@" +fi + diff --git a/tests/malloc.c b/tests/malloc.c new file mode 100644 index 0000000..a1af5e1 --- /dev/null +++ b/tests/malloc.c @@ -0,0 +1,87 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include "hugetests.h" + +/* + * We cannot test mapping size against huge page size because we are not linked + * against libhugetlbfs so gethugepagesize() won't work. So instead we define + * our MIN_PAGE_SIZE as 64 kB (the largest base page available) and make sure + * the mapping page size is larger than this. 
+ */ +#define MIN_PAGE_SIZE 65536 + +static int block_sizes[] = { + sizeof(int), 1024, 128*1024, 1024*1024, 16*1024*1024, + 32*1024*1024, +}; +#define NUM_SIZES (sizeof(block_sizes) / sizeof(block_sizes[0])) + +int main(int argc, char *argv[]) +{ + int i; + char *env1, *env2, *exe; + int expect_hugepage = 0; + char *p; + + test_init(argc, argv); + exe = strrchr(test_name, '/'); + if (exe) + exe++; /* skip over "/" */ + else + exe = test_name; + + env1 = getenv("HUGETLB_MORECORE"); + verbose_printf("HUGETLB_MORECORE=%s\n", env1); + env2 = getenv("HUGETLB_RESTRICT_EXE"); + verbose_printf("HUGETLB_RESTRICT_EXE=%s\n", env2); + if (env1 && (!env2 || strstr(env2, exe))) + expect_hugepage = 1; + verbose_printf("expect_hugepage=%d\n", expect_hugepage); + + for (i = 0; i < NUM_SIZES; i++) { + int size = block_sizes[i]; + unsigned long long mapping_size; + + p = malloc(size); + if (! p) + FAIL("malloc()"); + + verbose_printf("malloc(%d) = %p\n", size, p); + + memset(p, 0, size); + + mapping_size = get_mapping_page_size(p); + + if (expect_hugepage && (mapping_size <= MIN_PAGE_SIZE)) + FAIL("Address is not hugepage"); + if (!expect_hugepage && (mapping_size > MIN_PAGE_SIZE)) + FAIL("Address is unexpectedly huge"); + + free(p); + } + + PASS(); +} diff --git a/tests/malloc_manysmall.c b/tests/malloc_manysmall.c new file mode 100644 index 0000000..25086a8 --- /dev/null +++ b/tests/malloc_manysmall.c @@ -0,0 +1,76 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include "hugetests.h" + +/* + * We cannot test mapping size against huge page size because we are not linked + * against libhugetlbfs so gethugepagesize() won't work. So instead we define + * our MIN_PAGE_SIZE as 64 kB (the largest base page available) and make sure + * the mapping page size is larger than this. + */ +#define MIN_PAGE_SIZE 65536 + +#define ALLOC_SIZE (128) +#define NUM_ALLOCS (262144) + +int main(int argc, char *argv[]) +{ + int i; + char *env; + char *p; + int expect_hugepage = 0; + + test_init(argc, argv); + + env = getenv("HUGETLB_MORECORE"); + verbose_printf("HUGETLB_MORECORE=%s\n", env); + if (env) + expect_hugepage = 1; + + for (i = 0; i < NUM_ALLOCS; i++) { + p = malloc(ALLOC_SIZE); + if (! 
p) + FAIL("malloc()"); + + if (i < 16) + verbose_printf("p = %p\n", p); + + memset(p, 0, ALLOC_SIZE); + + if ((i % 157) == 0) { + /* With this many allocs, testing every one + * takes forever */ + unsigned long long mapping_size = + get_mapping_page_size(p); + if (expect_hugepage && (mapping_size <= MIN_PAGE_SIZE)) + FAIL("Address is not hugepage"); + if (!expect_hugepage && (mapping_size > MIN_PAGE_SIZE)) + FAIL("Address is unexpectedly huge"); + } + } + + PASS(); +} diff --git a/tests/map_high_truncate_2.c b/tests/map_high_truncate_2.c new file mode 100644 index 0000000..2a2560b --- /dev/null +++ b/tests/map_high_truncate_2.c @@ -0,0 +1,104 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _LARGEFILE64_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* + * Test rationale: + * + * At one stage, a misconversion of hugetlb_vmtruncate_list to a + * prio_tree meant that on 32-bit machines, certain combinations of + * mapping and truncations could truncate incorrect pages, or + * overwrite pmds from other VMAs, triggering BUG_ON()s or other + * weirdness. 
+ * + * Test adapted to the libhugetlbfs framework from an example by + * Kenneth Chen + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. + * + * The kernel bug in question was fixed with commit + * 856fc29505556cf263f3dcda2533cf3766c14ab6. + */ +#define MAP_LENGTH (4 * hpage_size) +#if defined(__s390__) && __WORDSIZE == 32 +#define TRUNCATE_POINT 0x20000000UL +#else +#define TRUNCATE_POINT 0x60000000UL +#endif +#define HIGH_ADDR 0xa0000000UL + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + char *p, *q; + unsigned long i; + int err; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + check_free_huge_pages(4); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + /* First mapping */ + p = mmap(0, MAP_LENGTH + TRUNCATE_POINT, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_NORESERVE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + munmap(p, 4*hpage_size + TRUNCATE_POINT); + + q = mmap((void *)HIGH_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (q == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + verbose_printf("High map at %p\n", q); + + for (i = 0; i < MAP_LENGTH; i += hpage_size) + q[i] = 1; + + err = ftruncate(fd, TRUNCATE_POINT); + if (err != 0) + FAIL("ftruncate(): %s", strerror(errno)); + + if (q[0] != 1) + FAIL("data mismatch"); + + PASS(); +} diff --git a/tests/meminfo_nohuge.c b/tests/meminfo_nohuge.c new file mode 100644 index 0000000..7cbc624 --- /dev/null +++ b/tests/meminfo_nohuge.c @@ -0,0 +1,79 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* We override the normal open, so libhugetlbfs gets a /proc/meminfo + * which doesn't contain any hugepage information */ +int open(const char *path, int flags, ...) 
+{ + int (*old_open)(const char *, int, ...); + int fd; + + if (strcmp(path, "/proc/meminfo") == 0) { + FILE *f; + + f = popen("/bin/grep -vi ^hugepage /proc/meminfo", "r"); + return fileno(f); + } + + if (strcmp(path, "/proc/mounts") == 0) { + FILE *f; + + f = popen("/bin/grep -vi hugetlbfs /proc/mounts", "r"); + return fileno(f); + } + + old_open = dlsym(RTLD_NEXT, "open"); + if (flags & O_CREAT) { + va_list ap; + + va_start(ap, flags); + fd = (*old_open)(path, flags, va_arg(ap, mode_t)); + va_end(ap); + return fd; + } else { + return (*old_open)(path, flags); + } +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + + test_init(argc, argv); + + hpage_size = gethugepagesize(); + if (hpage_size == -1) + PASS(); + + FAIL("Mysteriously found a hugepage size of %ld\n", hpage_size); +} diff --git a/tests/misalign.c b/tests/misalign.c new file mode 100644 index 0000000..de1bf98 --- /dev/null +++ b/tests/misalign.c @@ -0,0 +1,145 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2007 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* + * Test rationale: + * + * Just as normal mmap()s can't have an address, length or offset + * which is not page aligned, so hugepage mmap()s can't have an + * address, length or offset which is not hugepage aligned. + * + * However, from time to time when the various mmap() / + * get_unmapped_area() paths are updated, somebody misses one of the + * necessary checks for the hugepage paths. This testcase ensures + * that attempted hugepage mappings with parameters which are not + * correctly hugepage aligned are rejected. + * + * However starting with 3.10-rc1, length passed in mmap() doesn't need + * to be aligned because commit af73e4d9506d3b797509f3c030e7dcd554f7d9c4 + * added ALIGN() to kernel side, in mmap_pgoff(), when mapping huge page + * files. 
+ */ +int main(int argc, char *argv[]) +{ + long page_size, hpage_size; + int fd; + void *p, *q; + int err; + struct utsname buf; + + test_init(argc, argv); + + if (uname(&buf) != 0) + FAIL("uname failed %s", strerror(errno)); + + page_size = getpagesize(); + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + /* First see what an ok mapping looks like, as a basis for our + * bad addresses and so forth */ + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() without hint failed: %s", strerror(errno)); + if (((unsigned long)p % hpage_size) != 0) + FAIL("mmap() without hint at misaligned address"); + + verbose_printf("Mapped at %p, length 0x%lx\n", p, hpage_size); + + err = munmap(p, hpage_size); + if (err != 0) + FAIL("munmap() without hint failed: %s", strerror(errno)); + + /* 1) Try a misaligned hint address */ + q = mmap(p + page_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (q == MAP_FAILED) + /* Bad hint shouldn't fail, just ignore the hint */ + FAIL("mmap() with hint failed: %s", strerror(errno)); + if (((unsigned long)q % hpage_size) != 0) + FAIL("mmap() with hint at misaligned address"); + + err = munmap(q, hpage_size); + if (err != 0) + FAIL("munmap() with hint failed: %s", strerror(errno)); + + /* 2) Try a misaligned address with MAP_FIXED */ + q = mmap(p + page_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_FIXED, fd, 0); + if (q != MAP_FAILED) + FAIL("mmap() MAP_FIXED at misaligned address succeeded"); + + /* 3) Try a misaligned length */ + q = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + + if (test_compare_kver(buf.release, "3.10.0") < 0) { + if (q != MAP_FAILED) + FAIL("mmap() with misaligned length 0x%lx succeeded", + page_size); + } else { + if (q == MAP_FAILED) + FAIL("mmap() with misaligned length 0x%lx failed", + page_size); + } + + /* 4) Try a misaligned length with 
MAP_FIXED */ + q = mmap(p, page_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_FIXED, fd, 0); + + if (test_compare_kver(buf.release, "3.10.0") < 0) { + if (q != MAP_FAILED) + FAIL("mmap() MAP_FIXED with misaligned length 0x%lx " + "succeeded", page_size); + } else { + if (q == MAP_FAILED) + FAIL("mmap() MAP_FIXED with misaligned length 0x%lx " + "failed", page_size); + } + + /* 5) Try a misaligned offset */ + q = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, page_size); + if (q != MAP_FAILED) + FAIL("mmap() with misaligned offset 0x%lx succeeded", + page_size); + + /* 6) Try a misaligned offset with MAP_FIXED*/ + q = mmap(p, hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_FIXED, fd, page_size); + if (q != MAP_FAILED) + FAIL("mmap() MAP_FIXED with misaligned offset 0x%lx succeeded", + page_size); + + PASS(); +} diff --git a/tests/misaligned_offset.c b/tests/misaligned_offset.c new file mode 100644 index 0000000..e82ffe1 --- /dev/null +++ b/tests/misaligned_offset.c @@ -0,0 +1,140 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * Copyright (C) 2006 Hugh Dickins + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* + * Test rationale: + * + * At one stage, a misconversion of hugetlb_vmtruncate_list to a + * prio_tree meant that on 32-bit machines, truncates at or above 4GB + * could truncate lower pages, resulting in BUG_ON()s. + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. + * + * The kernel bug in question was fixed with commit + * 856fc29505556cf263f3dcda2533cf3766c14ab6. + */ + +#define RANDOM_CONSTANT 0x1234ABCD + +int main(int argc, char *argv[]) +{ + int page_size; + long hpage_size; + off_t buggy_offset; + int fd; + void *p, *q; + volatile int *pi; + int err; + + test_init(argc, argv); + + page_size = getpagesize(); + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + /* First, we make a 2 page sane hugepage mapping. Then we + * memset() it to ensure that the ptes are instantiated for + * it. Then we attempt to replace the second half of the map + * with one at a bogus offset. We leave the first page of + * sane mapping in place to ensure that the corresponding + * pud/pmd/whatever entries aren't cleaned away. It's those + * bad entries which can trigger bad_pud() checks if the + * backout path for the bogus mapping is buggy, which it was + * in some kernels. 
*/ + + verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + verbose_printf("Mapping reference map..."); + /* First get arena of three hpages size, at file offset 4GB */ + p = mmap(NULL, 2*hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() offset 4GB: %s", strerror(errno)); + verbose_printf("%p-%p\n", p, p+2*hpage_size-1); + + verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + /* Instantiate the pages */ + verbose_printf("Instantiating..."); + memset(p, 0, 2*hpage_size); + pi = p; + *pi = RANDOM_CONSTANT; + verbose_printf("done.\n"); + + verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + /* Toggle the permissions on the first page. This forces TLB + * entries (including hash page table on powerpc) to be + * flushed, so that the page tables must be accessed for the + * test further down. In the buggy case, those page tables + * can get thrown away by a pud_clear() */ + err = mprotect(p, hpage_size, PROT_READ); + if (err) + FAIL("mprotect(%p, 0x%lx, PROT_READ): %s", p, hpage_size, + strerror(errno)); + + /* Replace top hpage by hpage mapping at confusing file offset */ + buggy_offset = page_size; + verbose_printf("Replacing map at %p with map from offset 0x%lx...", + p + hpage_size, (unsigned long)buggy_offset); + q = mmap(p + hpage_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE, fd, buggy_offset); + if (q != MAP_FAILED) + FAIL("bogus offset mmap() succeeded at %p: %s", q, strerror(errno)); + if (errno != EINVAL) + FAIL("bogus mmap() failed with \"%s\" instead of \"%s\"", + strerror(errno), strerror(EINVAL)); + verbose_printf("%s\n", strerror(errno)); + + verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + if (*pi != RANDOM_CONSTANT) + FAIL("Pre-existing mapping clobbered: %x instead of %x", + *pi, RANDOM_CONSTANT); + + 
verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + /* The real test is whether we got a bad_pud() or similar + * during the run. The check above, combined with the earlier + * mprotect()s to flush the TLB are supposed to catch it, but + * it's hard to be certain. Once bad_pud() is called + * behaviour can be very strange. */ + PASS_INCONCLUSIVE(); +} diff --git a/tests/mlock.c b/tests/mlock.c new file mode 100644 index 0000000..88859f3 --- /dev/null +++ b/tests/mlock.c @@ -0,0 +1,72 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "hugetests.h" + +static void test_simple_mlock(int flags) +{ + int fd = hugetlbfs_unlinked_fd(); + void *p; + int ret; + long hpage_size = check_hugepagesize(); + + p = mmap(0, hpage_size, PROT_READ|PROT_WRITE, flags, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() failed (flags=%x): %s", flags, strerror(errno)); + + ret = mlock(p, hpage_size); + if (ret) + FAIL("mlock() failed (flags=%x): %s", flags, strerror(errno)); + + ret = munlock(p, hpage_size); + if (ret) + FAIL("munlock() failed (flags=%x): %s", flags, strerror(errno)); + + ret = munmap(p, hpage_size); + if (ret) + FAIL("munmap() failed (flags=%x): %s", flags, strerror(errno)); + + close(fd); +} + +int main(int argc, char *argv[]) +{ + struct rlimit limit_info; + if(getrlimit(RLIMIT_MEMLOCK, &limit_info)) + ERROR("Unable to read locked memory rlimit: %s", strerror(errno)); + if(limit_info.rlim_cur < check_hugepagesize()) + CONFIG("Locked memory ulimit set below huge page size"); + + test_simple_mlock(MAP_PRIVATE); + test_simple_mlock(MAP_SHARED); + test_simple_mlock(MAP_PRIVATE|MAP_LOCKED); + test_simple_mlock(MAP_SHARED|MAP_LOCKED); + PASS(); +} diff --git a/tests/mmap-cow.c b/tests/mmap-cow.c new file mode 100644 index 0000000..a7d3a86 --- /dev/null +++ b/tests/mmap-cow.c @@ -0,0 +1,182 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +extern int errno; + +#define P "mmap-cow" +#define DESC \ + "* Tests copy-on-write semantics of large pages where a number *\n"\ + "* of threads map the same file with the MAP_PRIVATE flag. The *\n"\ + "* threads then write into their copy of the mapping and recheck *\n"\ + "* the contents to ensure they were not corrupted by the other *\n"\ + "* threads. *"\ + +#define HTLB_FILE "mmap-cow" +#define BUF_SZ 256 + +#define CHILD_FAIL(thread, fmt, ...) 
\ + do { \ + verbose_printf("Thread %d (pid=%d) FAIL: " fmt, \ + thread, getpid(), __VA_ARGS__); \ + exit(1); \ + } while (0) + +/* Setup Configuration */ +static int nr_hugepages; /* Number of huge pages to allocate */ +static unsigned int threads; /* Number of threads to run */ + +static int mmap_file(int fd, char **addr, size_t size, int type) +{ + int flags = 0; + + *addr = mmap(NULL, size, PROT_READ|PROT_WRITE, flags | type, fd, 0); + if (*addr == MAP_FAILED) + return -1; + + return 0; +} + +static void do_work(int thread, size_t size, int fd) +{ + char *addr; + size_t i; + char pattern = thread+65; + + if (mmap_file(fd, &addr, size, MAP_PRIVATE)) + CHILD_FAIL(thread, "mmap() failed: %s", strerror(errno)); + + verbose_printf("Thread %d (pid=%d): Mapped at address %p\n", + thread, getpid(), addr); + + /* Write to the mapping with a distinct pattern */ + verbose_printf("Thread %d (pid=%d): Writing %c to the mapping\n", + thread, getpid(), pattern); + for (i = 0; i < size; i++) + memcpy((char *)addr+i, &pattern, 1); + + if (msync(addr, size, MS_SYNC)) + CHILD_FAIL(thread, "msync() failed: %s", strerror(errno)); + + /* Verify the pattern */ + for (i = 0; i < size; i++) + if (addr[i] != pattern) + CHILD_FAIL(thread, "Corruption at %p; " + "Got %c, Expected %c", + &addr[i], addr[i], pattern); + + verbose_printf("Thread %d (pid=%d): Pattern verified\n", + thread, getpid()); + + /* Munmap the area */ + munmap(addr, size); + close(fd); + exit(0); +} + +int main(int argc, char ** argv) +{ + char *addr; + long hpage_size; + size_t size; + int i, pid, status, fd, ret; + pid_t *wait_list; + + test_init(argc, argv); + + if (argc < 3) + CONFIG("Usage: mmap-cow <# threads> <# pages>\n"); + + nr_hugepages = atoi(argv[2]); + threads = atoi(argv[1]); + + if ((threads+1) > nr_hugepages) + CONFIG("Need more hugepages than threads\n"); + + wait_list = malloc(threads * sizeof(pid_t)); + if (wait_list == NULL) + CONFIG("Couldn't allocate memory for wait_list\n"); + + hpage_size = 
check_hugepagesize(); + /* Have to have enough available hugepages for each thread to + * get its own copy, plus one for the parent/page-cache */ + size = (nr_hugepages / (threads+1)) * hpage_size; + verbose_printf("hpage_size is %lx, Size is %zu, threads: %u\n", + hpage_size, size, threads); + + /* First, open the file */ + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + CONFIG("hugetlbfs_unlinked_fd() failed: %s\n", + strerror(errno)); + + /* First, mmap the file with MAP_SHARED and fill with data + * If this is not done, then the fault handler will not be + * called in the kernel since private mappings will be + * created for the children at prefault time. + */ + if (mmap_file(fd, &addr, size, MAP_SHARED)) + FAIL("Failed to create shared mapping: %s", strerror(errno)); + + for (i = 0; i < size; i += 8) { + memcpy(addr+i, "deadbeef", 8); + } + + for (i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +#define P "mmap-gettest" +#define DESC \ + "* This baseline test validates that a mapping of a certain size *\n"\ + "* can be created, correctly. Once created, all the pages are *\n"\ + "* filled with a pattern and rechecked to test for corruption. *\n"\ + "* The mapping is then released. This process is repeated for a *\n"\ + "* specified number of iterations. 
*" + +extern int errno; + +#define BUF_SZ 256 + +/* Global test configuration */ +#define HTLB_FILE "mmap-gettest" +static char hugetlb_mount[BUF_SZ]; +static unsigned int iter; +static int nr_hugepages; +static long hpage_size; + +static int do_one(char *mountpoint, size_t size) { + char *ma; + int fha; + size_t i,j; + char pattern = 'A'; + + fha = hugetlbfs_unlinked_fd(); + if (fha < 0) + CONFIG("Unable to open temp file in hugetlbfs (%s)", + strerror(errno)); + + /* Map the files with MAP_PRIVATE */ + ma = mmap(NULL, size, (PROT_READ|PROT_WRITE), MAP_SHARED, fha, 0); + if (ma == MAP_FAILED) + FAIL("Failed to mmap the hugetlb file: %s", strerror(errno)); + + /* Make sure the page is zeroed */ + for (i = 0; i < nr_hugepages; i++) { + verbose_printf("Verifying %p\n", (ma+(i*hpage_size))); + for (j = 0; j < hpage_size; j++) { + if (*(ma+(i*hpage_size)+j) != 0) + FAIL("Verifying the mmap area failed. " + "Got %c, expected 0", + *(ma+(i*hpage_size)+j)); + } + } + /* Fill each file with a pattern */ + for (i = 0; i < nr_hugepages; i++) { + pattern = 65+(i%26); + verbose_printf("Touching %p with %c\n", ma+(i*hpage_size),pattern); + memset(ma+(i*hpage_size), pattern, hpage_size); + } + + /* Verify the pattern */ + for (i = 0; i < nr_hugepages; i++) { + pattern = 65+(i%26); + verbose_printf("Verifying %p\n", (ma+(i*hpage_size))); + for (j = 0; j < hpage_size; j++) { + if (*(ma+(i*hpage_size)+j) != pattern) + FAIL("Verifying the mmap area failed. 
" + "Got %c, expected %c", + *(ma+(i*hpage_size)+j),pattern); + } + } + + /* Munmap the area */ + munmap(ma, size); + + /* Close and delete the file */ + close(fha); + return 0; +} + +int main(int argc, char ** argv) +{ + size_t size; + int i; + + test_init(argc, argv); + + if (argc < 3) + CONFIG("Usage: %s <# iterations> <# pages>\n", argv[0]); + + iter = atoi(argv[1]); + nr_hugepages = atoi(argv[2]); + + hpage_size = check_hugepagesize(); + size = nr_hugepages * hpage_size; + + for (i=0; i < iter; i++) { + verbose_printf("Iteration %d\n", i); + do_one(hugetlb_mount, size); + } + + PASS(); +} diff --git a/tests/mprotect.c b/tests/mprotect.c new file mode 100644 index 0000000..9ffb1bc --- /dev/null +++ b/tests/mprotect.c @@ -0,0 +1,229 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +static sigjmp_buf sig_escape; +static void *sig_expected = MAP_FAILED; +static long hpage_size; + +static void sig_handler(int signum, siginfo_t *si, void *uc) +{ + if (signum == SIGSEGV) { + verbose_printf("SIGSEGV at %p (sig_expected=%p)\n", si->si_addr, + sig_expected); + if (si->si_addr == sig_expected) { + siglongjmp(sig_escape, 1); + } + FAIL("SIGSEGV somewhere unexpected"); + } + FAIL("Unexpected signal %s", strsignal(signum)); +} + +static int test_read(void *p) +{ + volatile unsigned long *pl = p; + unsigned long x; + + if (sigsetjmp(sig_escape, 1)) { + /* We got a SEGV */ + sig_expected = MAP_FAILED; + return -1; + } + + sig_expected = p; + barrier(); + x = *pl; + verbose_printf("Read back %lu\n", x); + barrier(); + sig_expected = MAP_FAILED; + /* + * gcc 5 complains about x not ever being used, the following + * statement is solely here to shut it up + */ + pl = (unsigned long *)x; + + return 0; +} + +static int test_write(void *p, unsigned long val) +{ + volatile unsigned long *pl = p; + unsigned long x; + + if (sigsetjmp(sig_escape, 1)) { + /* We got a SEGV */ + sig_expected = MAP_FAILED; + return -1; + } + + sig_expected = p; + barrier(); + *pl = val; + x = *pl; + barrier(); + sig_expected = MAP_FAILED; + + return (x != val); +} + +#define RANDOM_CONSTANT 0x1234ABCD + +static void test_prot(void *p, int prot) +{ + int r, w; + + verbose_printf("Reading.."); + r = test_read(p); + verbose_printf("%d\n", r); + verbose_printf("Writing.."); + w = test_write(p, RANDOM_CONSTANT); + verbose_printf("%d\n", w); + + if (prot & PROT_READ) { + if (r != 0) + FAIL("read failed on mmap(prot=%x)", 
prot); + } else { + if (r != -1) + FAIL("read succeeded on mmap(prot=%x)", prot); + } + + if (prot & PROT_WRITE) { + switch (w) { + case -1: + FAIL("write failed on mmap(prot=%x)", prot); + break; + case 0: + break; + case 1: + FAIL("write mismatch on mmap(prot=%x)", prot); + break; + default: + TEST_BUG(); + } + } else { + switch (w) { + case -1: + break; + case 0: + FAIL("write succeeded on mmap(prot=%x)", prot); + break; + case 1: + FAIL("write mismatch on mmap(prot=%x)", prot); + break; + default: + TEST_BUG(); + } + } +} + +static void test_mprotect(int fd, char *testname, + unsigned long len1, int prot1, + unsigned long len2, int prot2) +{ + void *p; + int err; + + verbose_printf("Testing %s\n", testname); + verbose_printf("Mapping with prot=%x\n", prot1); + p = mmap(NULL, len1, prot1, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("%s: mmap(prot=%x): %s", testname, prot1, + strerror(errno)); + + test_prot(p, prot1); + + verbose_printf("mprotect()ing to prot=%x\n", prot2); + err = mprotect(p, len2, prot2); + if (err != 0) + FAIL("%s: mprotect(prot=%x): %s", testname, prot2, + strerror(errno)); + + test_prot(p, prot2); + + if (len2 < len1) + test_prot(p + len2, prot1); + + munmap(p, len1); +} + +int main(int argc, char *argv[]) +{ + int err; + int fd; + void *p; + + test_init(argc, argv); + + struct sigaction sa = { + .sa_sigaction = sig_handler, + .sa_flags = SA_SIGINFO, + }; + + err = sigaction(SIGSEGV, &sa, NULL); + if (err) + FAIL("Can't install SIGSEGV handler: %s", strerror(errno)); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + verbose_printf("instantiating page\n"); + + p = mmap(NULL, 2*hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + memset(p, 0, hpage_size); + munmap(p, hpage_size); + + /* Basic protection change tests */ + test_mprotect(fd, "R->RW", hpage_size, PROT_READ, + hpage_size, 
PROT_READ|PROT_WRITE); + test_mprotect(fd, "RW->R", hpage_size, PROT_READ|PROT_WRITE, + hpage_size, PROT_READ); + + /* Tests which require VMA splitting */ + test_mprotect(fd, "R->RW 1/2", 2*hpage_size, PROT_READ, + hpage_size, PROT_READ|PROT_WRITE); + test_mprotect(fd, "RW->R 1/2", 2*hpage_size, PROT_READ|PROT_WRITE, + hpage_size, PROT_READ); + + /* PROT_NONE tests */ + test_mprotect(fd, "NONE->R", hpage_size, PROT_NONE, + hpage_size, PROT_READ); + test_mprotect(fd, "NONE->RW", hpage_size, PROT_NONE, + hpage_size, PROT_READ|PROT_WRITE); + + PASS(); +} diff --git a/tests/mremap-expand-slice-collision.c b/tests/mremap-expand-slice-collision.c new file mode 100644 index 0000000..78c11f2 --- /dev/null +++ b/tests/mremap-expand-slice-collision.c @@ -0,0 +1,226 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2009 David Gibson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +unsigned long slice_boundary; +long hpage_size, page_size; + +void init_slice_boundary(int fd) +{ + unsigned long slice_size; + void *p, *heap; + int i, rc; +#if defined(__LP64__) && !defined(__aarch64__) + /* powerpc: 1TB slices starting at 1 TB */ + slice_boundary = 0x10000000000; + slice_size = 0x10000000000; +#else + /* powerpc: 256MB slices up to 4GB */ + slice_boundary = 0x00000000; + slice_size = 0x10000000; +#endif + + /* dummy malloc so we know where is heap */ + heap = malloc(1); + free(heap); + + /* Find 2 neighbour slices with couple huge pages free + * around slice boundary. + * 16 is the maximum number of slices (low/high) */ + for (i = 0; i < 16-1; i++) { + slice_boundary += slice_size; + p = mmap((void *)(slice_boundary-2*hpage_size), 4*hpage_size, + PROT_READ, MAP_SHARED | MAP_FIXED, fd, 0); + if (p == MAP_FAILED) { + verbose_printf("can't use slice_boundary: 0x%lx\n", + slice_boundary); + } else { + rc = munmap(p, 4*hpage_size); + if (rc != 0) + FAIL("munmap(p1): %s", strerror(errno)); + break; + } + } + + if (p == MAP_FAILED) + FAIL("couldn't find 2 free neighbour slices"); + verbose_printf("using slice_boundary: 0x%lx\n", slice_boundary); +} + +void do_readback(void *p, size_t size, const char *stage) +{ + unsigned int *q = p; + int i; + + verbose_printf("do_readback(%p, 0x%lx, \"%s\")\n", p, + (unsigned long)size, stage); + + for (i = 0; i < (size / sizeof(*q)); i++) { + q[i] = RANDOM_CONSTANT ^ i; + } + + for (i = 0; i < (size / sizeof(*q)); i++) { + if (q[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Stage \"%s\": Mismatch at offset 0x%x: 0x%x instead of 0x%x", + stage, i, q[i], 
RANDOM_CONSTANT ^ i); + } +} + +void do_remap(int fd, void *target) +{ + void *a, *b; + int rc; + + a = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (a == MAP_FAILED) + FAIL("mmap(huge page): %s", strerror(errno)); + + verbose_printf("Huge base mapping at %p\n", a); + + do_readback(a, hpage_size, "base huge"); + + verbose_printf("Attempting mremap(MAYMOVE|FIXED) to %p...", target); + + b = mremap(a, hpage_size, hpage_size, MREMAP_MAYMOVE | MREMAP_FIXED, + target); + + if (b != MAP_FAILED) { + verbose_printf("testing..."); + do_readback(b, hpage_size, "remapped"); + verbose_printf("ok\n"); + } else { + verbose_printf("disallowed (%s)\n", strerror(errno)); + } + + rc = munmap(b, hpage_size); + if (rc != 0) + FAIL("munmap(after remap): %s", strerror(errno)); +} + +int main(int argc, char *argv[]) +{ + int fd, rc; + void *p, *q, *r; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + page_size = getpagesize(); + + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + init_slice_boundary(fd); + + /* First, hugepages above, normal below */ + p = mmap((void *)(slice_boundary + hpage_size), hpage_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(huge above): %s", strerror(errno)); + + do_readback(p, hpage_size, "huge above"); + + q = mmap((void *)(slice_boundary - page_size), page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (q == MAP_FAILED) + FAIL("mmap(normal below): %s", strerror(errno)); + + do_readback(q, page_size, "normal below"); + + verbose_printf("Attempting to remap..."); + + r = mremap(q, page_size, 2*page_size, 0); + if (r == MAP_FAILED) { + verbose_printf("disallowed\n"); + rc = munmap(q, page_size); + if (rc != 0) + FAIL("munmap(normal below): %s", strerror(errno)); + } else { + if (r != q) + FAIL("mremap() moved without MREMAP_MAYMOVE!?"); + + verbose_printf("testing..."); + do_readback(q, 
2*page_size, "normal below expanded"); + rc = munmap(q, 2*page_size); + if (rc != 0) + FAIL("munmap(normal below expanded): %s", strerror(errno)); + } + + rc = munmap(p, hpage_size); + if (rc != 0) + FAIL("munmap(huge above)"); + + /* Next, normal pages above, huge below */ + p = mmap((void *)(slice_boundary + hpage_size), page_size, + PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_FIXED | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) + FAIL("mmap(normal above): %s", strerror(errno)); + + do_readback(p, page_size, "normal above"); + + q = mmap((void *)(slice_boundary - hpage_size), + hpage_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (q == MAP_FAILED) + FAIL("mmap(huge below): %s", strerror(errno)); + + do_readback(q, hpage_size, "huge below"); + + verbose_printf("Attempting to remap..."); + + r = mremap(q, hpage_size, 2*hpage_size, 0); + if (r == MAP_FAILED) { + verbose_printf("disallowed\n"); + rc = munmap(q, hpage_size); + if (rc != 0) + FAIL("munmap(huge below): %s", strerror(errno)); + } else { + if (r != q) + FAIL("mremap() moved without MREMAP_MAYMOVE!?"); + + verbose_printf("testing..."); + do_readback(q, 2*hpage_size, "huge below expanded"); + rc = munmap(q, 2*hpage_size); + if (rc != 0) + FAIL("munmap(huge below expanded): %s", strerror(errno)); + } + + rc = munmap(p, page_size); + if (rc != 0) + FAIL("munmap(normal above)"); + + + PASS(); +} diff --git a/tests/mremap-expand-slice-collision.sh b/tests/mremap-expand-slice-collision.sh new file mode 100755 index 0000000..8c9d98a --- /dev/null +++ b/tests/mremap-expand-slice-collision.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# mremap-expand-slice-collision is known broken before 2.6.33 +compare_kvers `uname -r` "2.6.33" +if [ $? 
-eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC mremap-expand-slice-collision "$@" +fi + diff --git a/tests/mremap-fixed-huge-near-normal.c b/tests/mremap-fixed-huge-near-normal.c new file mode 100644 index 0000000..e0f2ae4 --- /dev/null +++ b/tests/mremap-fixed-huge-near-normal.c @@ -0,0 +1,151 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2009 David Gibson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +long hpage_size; + +void do_readback(void *p, size_t size, const char *stage) +{ + unsigned int *q = p; + int i; + + verbose_printf("do_readback(%p, 0x%lx, \"%s\")\n", p, + (unsigned long)size, stage); + + for (i = 0; i < (size / sizeof(*q)); i++) { + q[i] = RANDOM_CONSTANT ^ i; + } + + for (i = 0; i < (size / sizeof(*q)); i++) { + if (q[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Stage \"%s\": Mismatch at offset 0x%x: 0x%x instead of 0x%x", + stage, i, q[i], RANDOM_CONSTANT ^ i); + } +} + +void do_remap(int fd, void *target) +{ + void *a, *b; + int rc; + + a = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + 
if (a == MAP_FAILED) + FAIL("mmap(huge page): %s", strerror(errno)); + + verbose_printf("Huge base mapping at %p\n", a); + + do_readback(a, hpage_size, "base huge"); + + verbose_printf("Attempting mremap(MAYMOVE|FIXED) to %p...", target); + + b = mremap(a, hpage_size, hpage_size, MREMAP_MAYMOVE | MREMAP_FIXED, + target); + + if (b != MAP_FAILED) { + verbose_printf("testing..."); + do_readback(b, hpage_size, "remapped"); + verbose_printf("ok\n"); + + } else { + verbose_printf("disallowed (%s)\n", strerror(errno)); + b = a; + } + + rc = munmap(b, hpage_size); + if (rc != 0) + FAIL("munmap(after remap): %s", strerror(errno)); +} + +void *map_align(size_t size, size_t align) +{ + unsigned long xsize = size + align - getpagesize(); + size_t t; + void *p, *q; + int rc; + + p = mmap(NULL, xsize, PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + q = PALIGN(p, align); + + t = q - p; + if (t) { + rc = munmap(p, t); + if (rc != 0) + FAIL("munmap(lower aligning): %s", strerror(errno)); + } + + t = p + xsize - (q + size); + if (t) { + rc = munmap(q + size, t); + if (rc != 0) + FAIL("munmap(upper aligning): %s", strerror(errno)); + } + + return q; +} + +int main(int argc, char *argv[]) +{ + int fd, rc; + void *p; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = map_align(3*hpage_size, hpage_size); + + rc = munmap(p, hpage_size); + if (rc != 0) + FAIL("munmap() low portion: %s", strerror(errno)); + + rc = munmap(p + 2*hpage_size, hpage_size); + if (rc != 0) + FAIL("munmap() high portion: %s", strerror(errno)); + + p = p + hpage_size; + + verbose_printf("Normal mapping at %p\n", p); + + do_readback(p, hpage_size, "base normal page"); + + do_remap(fd, p - hpage_size); + do_remap(fd, p + hpage_size); + + PASS(); +} diff --git a/tests/mremap-fixed-huge-near-normal.sh 
b/tests/mremap-fixed-huge-near-normal.sh new file mode 100755 index 0000000..4b89c35 --- /dev/null +++ b/tests/mremap-fixed-huge-near-normal.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# mremap-fixed-huge-near-normal is known broken before 2.6.33 +compare_kvers `uname -r` "2.6.33" +if [ $? -eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC mremap-fixed-huge-near-normal "$@" +fi + diff --git a/tests/mremap-fixed-normal-near-huge.c b/tests/mremap-fixed-normal-near-huge.c new file mode 100644 index 0000000..1be5f8f --- /dev/null +++ b/tests/mremap-fixed-normal-near-huge.c @@ -0,0 +1,124 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2009 David Gibson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +long page_size, hpage_size; + +void do_readback(void *p, size_t size, const char *stage) +{ + unsigned int *q = p; + int i; + + verbose_printf("do_readback(%p, 0x%lx, \"%s\")\n", p, + (unsigned long)size, stage); + + for (i = 0; i < (size / sizeof(*q)); i++) { + q[i] = RANDOM_CONSTANT ^ i; + } + + for (i = 0; i < (size / sizeof(*q)); i++) { + if (q[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Stage \"%s\": Mismatch at offset 0x%x: 0x%x instead of 0x%x", + stage, i, q[i], RANDOM_CONSTANT ^ i); + } +} + +void do_remap(void *target) +{ + void *a, *b; + int rc; + + a = mmap(NULL, page_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS, -1, 0); + if (a == MAP_FAILED) + FAIL("mmap(normal page): %s", strerror(errno)); + + verbose_printf("Normal base mapping at %p\n", a); + + do_readback(a, page_size, "base normal"); + + verbose_printf("Attempting mremap(MAYMOVE|FIXED) to %p...", target); + + b = mremap(a, page_size, page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + target); + + if (b != MAP_FAILED) { + verbose_printf("testing..."); + do_readback(b, page_size, "remapped"); + verbose_printf("ok\n"); + } else { + verbose_printf("disallowed (%s)\n", strerror(errno)); + b = a; + } + + rc = munmap(b, page_size); + if (rc != 0) + FAIL("munmap(after remap): %s", strerror(errno)); +} + +int main(int argc, char *argv[]) +{ + int fd, rc; + void *p; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + page_size = getpagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, 3*hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + 
FAIL("mmap(): %s", strerror(errno)); + + rc = munmap(p, hpage_size); + if (rc != 0) + FAIL("munmap() low hpage: %s", strerror(errno)); + + rc = munmap(p + 2*hpage_size, hpage_size); + if (rc != 0) + FAIL("munmap() high hpage: %s", strerror(errno)); + + p = p + hpage_size; + + verbose_printf("Hugepage mapping at %p\n", p); + + do_readback(p, hpage_size, "base hugepage"); + + do_remap(p - page_size); + do_remap(p + hpage_size); + + PASS(); +} diff --git a/tests/mremap-fixed-normal-near-huge.sh b/tests/mremap-fixed-normal-near-huge.sh new file mode 100755 index 0000000..9ed058f --- /dev/null +++ b/tests/mremap-fixed-normal-near-huge.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# mremap-fixed-normal-near-huge is known broken before 2.6.33 +compare_kvers `uname -r` "2.6.33" +if [ $? -eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC mremap-fixed-normal-near-huge "$@" +fi + diff --git a/tests/noresv-preserve-resv-page.c b/tests/noresv-preserve-resv-page.c new file mode 100644 index 0000000..b7b8043 --- /dev/null +++ b/tests/noresv-preserve-resv-page.c @@ -0,0 +1,109 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2013 Joonsoo Kim, LG Electronics. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +#define P "noresv-preserve-resv-page" +#define DESC \ + "* Test to preserve a reserved page against no-reserved maping. *\n"\ + "* If all hugepages are reserved, access to no-reserved shared *\n"\ + "* mapping cause a process die, instead of stealing a hugepage *\n"\ + "* which is reserved for other process *\n" + +static sigjmp_buf sig_escape; +static void *sig_expected = MAP_FAILED; + +static void sig_handler(int signum, siginfo_t *si, void *uc) +{ + if (signum == SIGBUS) { + verbose_printf("SIGBUS at %p (sig_expected=%p)\n", si->si_addr, + sig_expected); + if (si->si_addr == sig_expected) { + siglongjmp(sig_escape, 1); + } + FAIL("SIGBUS somewhere unexpected"); + } + FAIL("Unexpected signal %s", strsignal(signum)); +} + +static void test_write(void *p) +{ + volatile char *pl = p; + + if (sigsetjmp(sig_escape, 1)) { + /* We got a SIGBUS */ + PASS(); + } + + sig_expected = p; + barrier(); + *pl = 's'; +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int nr_hugepages; + int fd1, fd2, err; + char *p, *q; + struct sigaction sa = { + .sa_sigaction = sig_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + nr_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + + fd1 = hugetlbfs_unlinked_fd(); + if (fd1 < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + fd2 = hugetlbfs_unlinked_fd(); + if (fd2 < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("Can't install SIGBUS handler: %s", strerror(errno)); + + p = mmap(NULL, hpage_size * nr_hugepages, + PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", 
strerror(errno)); + + verbose_printf("Reserve all hugepages %d\n", nr_hugepages); + + q = mmap(NULL, hpage_size, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd2, 0); + if (q == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + verbose_printf("Write to %p to steal reserved page\n", q); + + test_write(q); + FAIL("Steal reserved page"); +} diff --git a/tests/noresv-regarded-as-resv.c b/tests/noresv-regarded-as-resv.c new file mode 100644 index 0000000..4d5e5b7 --- /dev/null +++ b/tests/noresv-regarded-as-resv.c @@ -0,0 +1,77 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2013 Joonsoo Kim, LG Electronics. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +#define P "noresv-regarded-as-resv" +#define DESC \ + "* Test to correct handling for reserve count. If no reserved *\n"\ + "* mapping is created to reserved file region, it should be *\n"\ + "* considered as reserve mapping. Otherwise, reserve count will be *\n"\ + "* overflowed. 
*\n" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int nr_resvpages1, nr_resvpages2; + int fd; + char *p, *q; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + nr_resvpages1 = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Number of reserve page is %d\n", nr_resvpages1); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + verbose_printf("Reserve a page to file offset 0\n"); + + q = mmap(NULL, hpage_size, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, 0); + if (q == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + verbose_printf("Map a page of file offset 0 with no resv mapping\n"); + *q = 's'; + + munmap(p, hpage_size); + munmap(q, hpage_size); + close(fd); + verbose_printf("Unmap all mappings and close file\n"); + + nr_resvpages2 = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Number of reserve page is now %d\n", nr_resvpages2); + + if (nr_resvpages1 != nr_resvpages2) + FAIL("Reserve count overflowed"); + + PASS(); +} diff --git a/tests/private.c b/tests/private.c new file mode 100644 index 0000000..8f5cb45 --- /dev/null +++ b/tests/private.c @@ -0,0 +1,92 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD +#define OTHER_CONSTANT 0xFEDC9876 + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p, *q; + unsigned int *pl, *ql; + int i; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() SHARED: %s", strerror(errno)); + + pl = p; + for (i = 0; i < (hpage_size / sizeof(*pl)); i++) { + pl[i] = RANDOM_CONSTANT ^ i; + } + + q = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, + fd, 0); + if (q == MAP_FAILED) + FAIL("mmap() PRIVATE: %s", strerror(errno)); + + ql = q; + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + if (ql[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Mismatch"); + } + + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + ql[i] = OTHER_CONSTANT ^ i; + } + + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + if (ql[i] != (OTHER_CONSTANT ^ i)) + FAIL("PRIVATE mismatch"); + } + + for (i = 0; i < (hpage_size / sizeof(*pl)); i++) { + if (pl[i] != (RANDOM_CONSTANT ^ i)) + FAIL("SHARED map contaminated"); + } + + memset(p, 0, hpage_size); + + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + if (ql[i] != (OTHER_CONSTANT ^ i)) + FAIL("PRIVATE map contaminated"); + } + + PASS(); +} diff --git a/tests/ptrace-write-hugepage.c b/tests/ptrace-write-hugepage.c new file mode 100644 index 0000000..d99d74f --- /dev/null +++ b/tests/ptrace-write-hugepage.c @@ -0,0 +1,161 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM 
Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define CONST 0xdeadbeefL + +static long hpage_size; +static volatile int ready_to_trace = 0; + +static void sigchld_handler(int signum, siginfo_t *si, void *uc) +{ + int status; + + wait(&status); + if (WIFEXITED(status)) + exit(WEXITSTATUS(status)); + else if (WIFSIGNALED(status)) + exit(status); + + ready_to_trace = 1; +} + +static void child(int hugefd, int pipefd) +{ + void *p; + int err; + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + hugefd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + memset(p, 0, hpage_size); + + verbose_printf("Child mapped data at %p\n", p); + + err = write(pipefd, &p, sizeof(p)); + if (err == -1) + FAIL("Writing to pipe: %s", strerror(errno)); + if (err != sizeof(p)) + FAIL("Short write to pipe"); + + pause(); +} + +static void do_poke(pid_t pid, void *p) +{ + long err; + + verbose_printf("Poking..."); + err = ptrace(PTRACE_POKEDATA, pid, p, (void *)CONST); + if (err) + FAIL("ptrace(POKEDATA): %s", strerror(errno)); + verbose_printf("done\n"); + + verbose_printf("Peeking..."); + err = ptrace(PTRACE_PEEKDATA, 
pid, p, NULL); + if (err == -1) + FAIL("ptrace(PEEKDATA): %s", strerror(errno)); + + if (err != CONST) + FAIL("mismatch (%lx instead of %lx)", err, CONST); + verbose_printf("done\n"); +} + +int main(int argc, char *argv[]) +{ + int fd; + int pipefd[2]; + long err; + pid_t cpid; + void *p; + struct sigaction sa = { + .sa_sigaction = sigchld_handler, + .sa_flags = SA_SIGINFO, + }; + struct sigaction old_sa; + + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + err = sigaction(SIGCHLD, &sa, &old_sa); + if (err) + FAIL("Can't install SIGCHLD handler: %s", strerror(errno)); + + err = pipe(pipefd); + if (err) + FAIL("pipe(): %s", strerror(errno)); + + cpid = fork(); + if (cpid < 0) + FAIL("fork(): %s", strerror(errno)); + + + if (cpid == 0) { + child(fd, pipefd[1]); + exit(0); + } + + /* Parent */ + err = read(pipefd[0], &p, sizeof(p)); + if (err == -1) + FAIL("Reading pipe: %s\n", strerror(errno)); + if (err != sizeof(p)) + FAIL("Short read over pipe"); + + verbose_printf("Parent received address %p\n", p); + + err = ptrace(PTRACE_ATTACH, cpid, NULL, NULL); + if (err) + FAIL("ptrace(ATTACH): %s", strerror(errno)); + + while (! ready_to_trace) + ; + + do_poke(cpid, p); + do_poke(cpid, p + getpagesize()); + + err = sigaction(SIGCHLD, &old_sa, NULL); + if (err) + FAIL("Clearing SIGCHLD handler: %s", strerror(errno)); + + ptrace(PTRACE_KILL, cpid, NULL, NULL); + + PASS(); +} diff --git a/tests/quota.c b/tests/quota.c new file mode 100644 index 0000000..4961371 --- /dev/null +++ b/tests/quota.c @@ -0,0 +1,271 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2007 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +/* + * Test Rationale: + * + * The number of global huge pages available to a mounted hugetlbfs filesystem + * can be limited using a fs quota mechanism by setting the size attribute at + * mount time. Older kernels did not properly handle quota accounting in a + * number of cases (e.g. for MAP_PRIVATE pages, and wrt MAP_SHARED reservations). + * + * This test replays some scenarios on a privately mounted filesystem to check + * for regressions in hugetlbfs quota accounting. 
+ */ + +extern int errno; + +#define BUF_SZ 1024 + +/* Global test configuration */ +static long hpage_size; +char *mountpoint = NULL; + +/* map action flags */ +#define ACTION_COW 0x0001 +#define ACTION_TOUCH 0x0002 + +/* Testlet results */ +#define GOOD 0 +#define BAD_SIG 1 +#define BAD_EXIT 2 + +char result_str[3][10] = { "pass", "killed", "fail" }; + +void cleanup(void) +{ + if (mountpoint && (umount(mountpoint) == 0)) + rmdir(mountpoint); +} + +/* + * Debugging function: Verify the counters in the hugetlbfs superblock that + * are used to implement the filesystem quotas. + */ +void _verify_stat(int line, long tot, long free, long avail) +{ + struct statfs s; + statfs(mountpoint, &s); + + if (s.f_blocks != tot || s.f_bfree != free || s.f_bavail != avail) + FAIL("Bad quota counters at line %i: total: %li free: %li " + "avail: %li\n", line, s.f_blocks, s.f_bfree, s.f_bavail); +} +#define verify_stat(t, f, a) _verify_stat(__LINE__, t, f, a) + +void get_quota_fs(unsigned long size, char *prog) +{ + char mount_str[17]; + char mount_opts[50]; + int nr_written; + + nr_written = snprintf(mount_opts, 20, "size=%luK", size/1024); + + /* + * If the mount point now in use does not use the system default + * huge page size, specify the desired size when mounting. When + * the sizes do match, we avoid specifying the pagesize= option to + * preserve backwards compatibility with kernels that do not + * recognize that option. + */ + if (!using_system_hpage_size(hugetlbfs_find_path())) + snprintf(mount_opts + nr_written, 29, ",pagesize=%lu", + hpage_size); + + sprintf(mount_str, "/tmp/huge-XXXXXX"); + if (!mkdtemp(mount_str)) + FAIL("Cannot create directory for mountpoint: %s", + strerror(errno)); + + if (mount("none", mount_str, "hugetlbfs", 0, mount_opts)) { + perror("mount"); + FAIL(); + } + mountpoint = mount_str; + + /* + * Set HUGETLB_PATH and then exec the test again. This will cause + * libhugetlbfs to use this newly created mountpoint. 
+ */ + if (setenv("HUGETLB_PATH", mount_str, 1)) + FAIL("Cannot set HUGETLB_PATH environment variable: %s", + strerror(errno)); + verbose_printf("Using %s as temporary mount point.\n", mount_str); + + execlp(prog, prog, "-p", mount_str, NULL); + FAIL("execle failed: %s", strerror(errno)); +} + +void map(unsigned long size, int mmap_flags, int action_flags) +{ + int fd; + char *a, *b, *c; + + fd = hugetlbfs_unlinked_fd(); + if (!fd) { + verbose_printf("hugetlbfs_unlinked_fd () failed\n"); + exit(1); + } + + a = mmap(0, size, PROT_READ|PROT_WRITE, mmap_flags, fd, 0); + if (a == MAP_FAILED) { + verbose_printf("mmap failed: %s\n", strerror(errno)); + exit(1); + } + + + if (action_flags & ACTION_TOUCH) + for (b = a; b < a + size; b += hpage_size) + *(b) = 1; + + if (action_flags & ACTION_COW) { + c = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (c == MAP_FAILED) { + verbose_printf("Creating COW mapping failed: %s\n", strerror(errno)); + exit(1); + } + if ((*c) != 1) { + verbose_printf("Data mismatch when setting up COW"); + exit(1); + } + (*c) = 0; + munmap(c, size); + } + + munmap(a, size); + close(fd); +} + +void do_unexpected_result(int line, int expected, int actual) +{ + FAIL("Unexpected result on line %i: expected %s, actual %s", + line, result_str[expected], result_str[actual]); +} + +void _spawn(int l, int expected_result, unsigned long size, int mmap_flags, + int action_flags) +{ + pid_t pid; + int status; + int actual_result; + + pid = fork(); + if (pid == 0) { + map(size, mmap_flags, action_flags); + exit(0); + } else if (pid < 0) { + FAIL("fork(): %s", strerror(errno)); + } else { + waitpid(pid, &status, 0); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) == 0) + actual_result = GOOD; + else + actual_result = BAD_EXIT; + } else { + actual_result = BAD_SIG; + } + + if (actual_result != expected_result) + do_unexpected_result(l, expected_result, actual_result); + } +} +#define spawn(e,s,mf,af) _spawn(__LINE__, e, s, mf, af) + +int 
main(int argc, char ** argv) +{ + int private_resv; + int bad_priv_resv; + + test_init(argc, argv); + hpage_size = check_hugepagesize(); + + if ((argc == 3) && !strcmp(argv[1], "-p")) + mountpoint = argv[2]; + else + get_quota_fs(hpage_size, argv[0]); + + check_must_be_root(); + check_free_huge_pages(1); + + private_resv = kernel_has_private_reservations(); + if (private_resv == -1) + FAIL("kernel_has_private_reservations() failed\n"); + bad_priv_resv = private_resv ? BAD_EXIT : BAD_SIG; + + /* + * Check that unused quota is cleared when untouched mmaps are + * cleaned up. + */ + spawn(GOOD, hpage_size, MAP_PRIVATE, 0); + verify_stat(1, 1, 1); + spawn(GOOD, hpage_size, MAP_SHARED, 0); + verify_stat(1, 1, 1); + + /* + * Check that simple page instantiation works within quota limits + * for private and shared mappings. + */ + spawn(GOOD, hpage_size, MAP_PRIVATE, ACTION_TOUCH); + spawn(GOOD, hpage_size, MAP_SHARED, ACTION_TOUCH); + + /* + * Page instantiation should be refused if doing so puts the fs + * over quota. + */ + spawn(BAD_EXIT, 2 * hpage_size, MAP_SHARED, ACTION_TOUCH); + + /* + * If private mappings are reserved, the quota is checked up front + * (as is the case for shared mappings). + */ + spawn(bad_priv_resv, 2 * hpage_size, MAP_PRIVATE, ACTION_TOUCH); + + /* + * COW should not be allowed if doing so puts the fs over quota. + */ + spawn(bad_priv_resv, hpage_size, MAP_SHARED, ACTION_TOUCH|ACTION_COW); + spawn(bad_priv_resv, hpage_size, MAP_PRIVATE, ACTION_TOUCH|ACTION_COW); + + /* + * Make sure that operations within the quota will succeed after + * some failures. + */ + spawn(GOOD, hpage_size, MAP_SHARED, ACTION_TOUCH); + spawn(GOOD, hpage_size, MAP_PRIVATE, ACTION_TOUCH); + + PASS(); +} diff --git a/tests/quota.sh b/tests/quota.sh new file mode 100755 index 0000000..398d442 --- /dev/null +++ b/tests/quota.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +. 
wrapper-utils.sh + +# There are known bugs in quota accounting prior to 2.6.24 +compare_kvers `uname -r` "2.6.24" +if [ $? -eq 1 ]; then + EXP_RC=$RC_FAIL +else + EXP_RC=$RC_PASS +fi + +exec_and_check $EXP_RC quota "$@" diff --git a/tests/readahead_reserve.c b/tests/readahead_reserve.c new file mode 100644 index 0000000..068b6f4 --- /dev/null +++ b/tests/readahead_reserve.c @@ -0,0 +1,85 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include +#include "hugetests.h" + +/* + * Test rationale: + * + * readahead() on some kernels can cause the reservation counter to get + * corrupted. The problem is that the pages are allocated for the + * reservation but not faulted in at the time of allocation. The + * counters do not get updated and effectively "leak". This test + * identifies whether the kernel is vulnerable to the problem or not. + * It's fixed in kernel by commit f2deae9d4e70793568ef9e85d227abb7bef5b622. 
+ */ +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long initial_rsvd, map_rsvd, readahead_rsvd, end_rsvd; + + test_init(argc, argv); + + /* Setup */ + hpage_size = check_hugepagesize(); + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + initial_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count before map: %lu\n", initial_rsvd); + + /* mmap a region and record reservations */ + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + map_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after map: %lu\n", map_rsvd); + + /* readahead the region and record reservations */ + readahead(fd, 0, hpage_size); + readahead_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after readahead: %lu\n", readahead_rsvd); + + /* Write the region */ + memset(p, 1, hpage_size); + + /* Free region */ + munmap(p, hpage_size); + close(fd); + end_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after close(): %lu\n", end_rsvd); + + /* Reserve count should match initial reserve count */ + if (end_rsvd != initial_rsvd) + FAIL("Reserve leaked: %lu != %lu\n", end_rsvd, initial_rsvd); + + PASS(); +} diff --git a/tests/readahead_reserve.sh b/tests/readahead_reserve.sh new file mode 100755 index 0000000..5ab7400 --- /dev/null +++ b/tests/readahead_reserve.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# readahead is known broken before 2.6.30 +compare_kvers `uname -r` "2.6.30" +if [ $? 
-eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC readahead_reserve "$@" +fi + diff --git a/tests/readback.c b/tests/readback.c new file mode 100644 index 0000000..984369c --- /dev/null +++ b/tests/readback.c @@ -0,0 +1,64 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned int *q; + int i; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + q = p; + for (i = 0; i < (hpage_size / sizeof(*q)); i++) { + q[i] = RANDOM_CONSTANT ^ i; + } + + for (i = 0; i < (hpage_size / sizeof(*q)); i++) { + if (q[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Mismatch at offset 0x%x: 0x%x instead of 0x%x", + i, q[i], RANDOM_CONSTANT ^ i); + } + + PASS(); +} diff --git 
a/tests/run_tests.py b/tests/run_tests.py new file mode 100755 index 0000000..3c95a03 --- /dev/null +++ b/tests/run_tests.py @@ -0,0 +1,766 @@ +#! /usr/bin/env python + +import subprocess +import types +import os +import sys +import getopt +import resource +import errno + +# The superset of wordsizes that should be tested (default 32, 64) +wordsizes = set() + +# The super set of page sizes that should be tested. Defaults to all supported +# huge page sizes with an active mount and at least one huge page allocated +pagesizes = set() + +# Each page size may have a subset of valid wordsizes +# This is a dictionary (indexed by page size) of sets +wordsizes_by_pagesize = {} + +# The linkhuge tests may only be valid on a subset of word sizes +# This set contains the wordsizes valid for linkhuge tests +linkhuge_wordsizes = set() + +# A list of all discovered mountpoints that may be used by libhugetlbfs for +# this run of tests. This is used for cleaning up left-over share files. +mounts = [] + +# Results matrix: This 3-D dictionary is indexed as follows: +# [type] - Test results fall into one of the 'result_types' categories +# [pagesize] - a page size from the set 'pagesizes' +# [bits] - a word size from the set 'wordsizes' +# The indexed value is the number of tests matching the above traits +R = {} +result_types = ("total", "pass", "config", "fail", "xfail", "xpass", + "signal", "strange", "skip", "nofile") + +def bash(cmd): + """ + Run 'cmd' in the shell and return the exit code and output. 
+ """ + p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + try: + rc = p.wait() + except KeyboardInterrupt: + # Abort and mark this a strange test result + return (127, "") + out = p.stdout.read().strip() + return (rc, out) + +def snapshot_pool_state(): + l = [] + for d in os.listdir("/sys/kernel/mm/hugepages"): + substate = [(f, int(open("/sys/kernel/mm/hugepages/%s/%s" % (d, f)).read())) + for f in os.listdir("/sys/kernel/mm/hugepages/%s" % d)] + l.append((d, tuple(substate))) + return tuple(l) + +def run_test_prog(bits, pagesize, cmd, **env): + if paranoid_pool_check: + beforepool = snapshot_pool_state() + print "Pool state: %s" % str(beforepool) + + local_env = os.environ.copy() + local_env.update(env) + local_env["PATH"] = "./obj%d:../obj%d:%s" \ + % (bits, bits, local_env.get("PATH", "")) + local_env["LD_LIBRARY_PATH"] = "../obj%d:obj%d:%s" \ + % (bits, bits, local_env.get("LD_LIBRARY_PATH", "")) + local_env["HUGETLB_DEFAULT_PAGE_SIZE"] = repr(pagesize) + + try: + p = subprocess.Popen(cmd, env=local_env, stdout=subprocess.PIPE) + rc = p.wait() + except KeyboardInterrupt: + # Abort and mark this a strange test result + return (None, "") + except OSError as e: + return (-e.errno, "") + out = p.stdout.read().strip() + + if paranoid_pool_check: + afterpool = snapshot_pool_state() + if afterpool != beforepool: + print >>sys.stderr, "Hugepage pool state not preserved!" + print >>sys.stderr, "BEFORE: %s" % str(beforepool) + print >>sys.stderr, "AFTER: %s" % str(afterpool) + sys.exit(98) + + return (rc, out) + +def setup_env(override, defaults): + """ + Set up the environment for running commands in the shell. 
+ """ + # All items in override are unconditionally set or unset + for (var, val) in override.items(): + if val == None: + if var in os.environ: + del os.environ[var] + else: + os.environ[var] = val + # If not already set, these variables are given default values + for (var, val) in defaults.items(): + if var not in os.environ or os.environ[var] == "": + os.environ[var] = val + +def init_results(): + """ + Define the structure of the results matrix and initialize all results to 0. + """ + global R + + for t in result_types: + R[t] = {} + for p in pagesizes: + R[t][p] = {} + for bits in (32, 64): + R[t][p][bits] = 0 + +def pretty_page_size(size): + """ + Convert a page size to a formatted string + + Given a page size in bytes, return a string that expresses the size in + a sensible unit (K, M, or G). + """ + factor = 0 + while size > 1024: + factor += 1 + size /= 1024 + + if factor == 0: return "%iB" % size + elif factor == 1: return "%iK" % size + elif factor == 2: return "%iM" % size + elif factor == 3: return "%iG" % size + +def print_per_size(title, values): + """ + Print one line of test results + + Print the results of a given result type on one line. The results for all + page sizes and word sizes are written in a table format. 
+ """ + print "*%20s: " % title, + for sz in pagesizes: + print "%4s %4s " % (values[sz][32], values[sz][64]), + print + +def results_summary(): + """ + Display a summary of the test results + """ + print "********** TEST SUMMARY" + print "*%21s" % "", + for p in pagesizes: print "%-13s " % pretty_page_size(p), + print + print "*%21s" % "", + for p in pagesizes: print "32-bit 64-bit ", + print + + print_per_size("Total testcases", R["total"]) + print_per_size("Skipped", R["skip"]) + print_per_size("PASS", R["pass"]) + print_per_size("FAIL", R["fail"]) + print_per_size("Killed by signal", R["signal"]) + print_per_size("Bad configuration", R["config"]) + print_per_size("Expected FAIL", R["xfail"]) + print_per_size("Unexpected PASS", R["xpass"]) + print_per_size("Test not present", R["nofile"]) + print_per_size("Strange test result", R["strange"]) + print "**********" + +def free_hpages(): + """ + Return the number of free huge pages. + + Parse /proc/meminfo to obtain the number of free huge pages for + the default page size. + XXX: This function is not multi-size aware yet. + """ + (rc, out) = bash("grep 'HugePages_Free:' /proc/meminfo | cut -f2 -d:") + return (rc, int(out)) + +def total_hpages(): + """ + Return the total number of huge pages in the pool. + + Parse /proc/meminfo to obtain the number of huge pages for the default + page size. + XXX: This function is not multi-size aware yet. + """ + (rc, out) = bash("grep 'HugePages_Total:' /proc/meminfo | cut -f2 -d:") + return (rc, int(out)) + +def hpage_size(): + """ + Return the size of the default huge page size in bytes. + + Parse /proc/meminfo to obtain the default huge page size. This number is + reported in Kb so multiply it by 1024 to convert it to bytes. + XXX: This function is not multi-size aware yet. 
+ """ + (rc, out) = bash("grep 'Hugepagesize:' /proc/meminfo | awk '{print $2}'") + if out == "": out = 0 + out = int(out) * 1024 + return (rc, out) + +def clear_hpages(): + """ + Remove stale hugetlbfs files after sharing tests. + + Traverse the mount points that are in use during testing to find left-over + files that were created by the elflink sharing tests. These are not + cleaned up automatically and must be removed to free up the huge pages. + """ + for mount in mounts: + dir = mount + "/elflink-uid-" + `os.getuid()` + for root, dirs, files in os.walk(dir, topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + try: + os.rmdir(dir) + except OSError: + pass + +def get_pagesizes(): + """ + Get a list of configured huge page sizes. + + Use libhugetlbfs' hugeadm utility to get a list of page sizes that have + active mount points and at least one huge page allocated to the pool. + """ + sizes = set() + out = "" + (rc, out) = bash("../obj/hugeadm --page-sizes") + if rc != 0 or out == "": return sizes + + for size in out.split("\n"): sizes.add(int(size)) + return sizes + +def get_wordsizes(): + """ + Checks for obj32 and obj64 subdirs to determine valid word sizes. + """ + sizes = set() + if os.path.isdir("./obj32"): sizes.add(32) + if os.path.isdir("./obj64"): sizes.add(64) + + return sizes + +def check_hugetlbfs_path(): + """ + Check each combination of page size and word size for validity. + + Some word sizes may not be valid for all page sizes. For example, a 16G + page is too large to be used in a 32 bit process. Use a helper program to + weed out invalid combinations and print informational messages as required. 
+ """ + global wordsizes, pagesizes, mounts, wordsizes_by_pagesize + + for p in pagesizes: + okbits = [] + for b in wordsizes: + (rc, out) = run_test_prog(b, p, "get_hugetlbfs_path") + if rc == 0: + okbits.append(b) + mounts.append(out) + if len(okbits) == 0: + print "run_tests.py: No mountpoints available for page size %s" % \ + pretty_page_size(p) + wordsizes_by_pagesize[p] = set() + continue + for b in wordsizes - set(okbits): + print "run_tests.py: The %i bit word size is not compatible with " \ + "%s pages" % (b, pretty_page_size(p)) + wordsizes_by_pagesize[p] = set(okbits) + +def check_linkhuge_tests(): + """ + Check if the linkhuge tests are safe to run on this system. + + Newer versions of binutils (>= 2.18) are known to be incompatible with the + linkhuge tests and running them may cause unreliable behavior. Determine + which word sizes can be tested with linkhuge. The others will be skipped. + NOTE: The linhuge_rw tests are always safe to run and will not be skipped. + """ + okbits = set() + + for bits in wordsizes: + script = open('obj%d/dummy.ldscript' % bits, 'r').read() + if script.count('SPECIAL') == 0: + okbits.add(bits) + return okbits + +def print_cmd(pagesize, bits, cmd, env): + if env: + print ' '.join(['%s=%s' % (k, v) for k, v in env.items()]), + if type(cmd) != types.StringType: + cmd = ' '.join(cmd) + print "%s (%s: %i):\t" % (cmd, pretty_page_size(pagesize), bits), + sys.stdout.flush() + +def run_test(pagesize, bits, cmd, **env): + """ + Execute a test, print the output and log the result + + Run a test using the specified page size and word size. The parameter + 'pre' may contain additional environment settings and will be prepended to + cmd. A line showing info about the test is printed and after completion + the test output is printed. The result is recorded in the result matrix. 
+ """ + global R + + objdir = "obj%i" % bits + if not os.path.isdir(objdir): + return + + print_cmd(pagesize, bits, cmd, env) + (rc, out) = run_test_prog(bits, pagesize, cmd, **env) + print out + + R["total"][pagesize][bits] += 1 + if rc == 0: R["pass"][pagesize][bits] += 1 + elif rc == 1: R["config"][pagesize][bits] += 1 + elif rc == 2: R["fail"][pagesize][bits] += 1 + elif rc == 3: R["xfail"][pagesize][bits] += 1 + elif rc == 4: R["xpass"][pagesize][bits] += 1 + elif rc == -errno.ENOENT: + R["nofile"][pagesize][bits] += 1 + elif rc < 0: R["signal"][pagesize][bits] += 1 + else: R["strange"][pagesize][bits] += 1 + +def skip_test(pagesize, bits, cmd, **env): + """ + Skip a test, print test information, and log that it was skipped. + """ + global tot_tests, tot_skip + R["total"][pagesize][bits] += 1 + R["skip"][pagesize][bits] += 1 + print_cmd(pagesize, bits, cmd, env) + print "SKIPPED" + +def do_test(cmd, bits=None, **env): + """ + Run a test case, testing each page size and each indicated word size. + """ + if bits == None: bits = wordsizes + for p in pagesizes: + for b in (set(bits) & wordsizes_by_pagesize[p]): + run_test(p, b, cmd, **env) + +def do_test_with_rlimit(rtype, limit, cmd, bits=None, **env): + """ + Run a test case with a temporarily altered resource limit. + """ + oldlimit = resource.getrlimit(rtype) + resource.setrlimit(rtype, (limit, limit)) + do_test(cmd, bits, **env) + resource.setrlimit(rtype, oldlimit) + +def do_elflink_test(cmd, **env): + """ + Run an elflink test case, skipping known-bad configurations. + """ + for p in pagesizes: + for b in wordsizes_by_pagesize[p]: + if b in linkhuge_wordsizes: run_test(p, b, cmd, **env) + else: skip_test(p, b, cmd, **env) + +def elflink_test(cmd, **env): + """ + Run an elflink test case with different configuration combinations. + + Test various combinations of: preloading libhugetlbfs, B vs. BDT link + modes, minimal copying on or off, and disabling segment remapping. 
+ """ + do_test(cmd, **env) + # Test we don't blow up if not linked for hugepage + do_test(cmd, LD_PRELOAD="libhugetlbfs.so", **env) + + # Only run custom ldscript tests when -l option is set + if not custom_ldscripts: + return + + do_elflink_test("xB." + cmd, **env) + do_elflink_test("xBDT." + cmd, **env) + # Test we don't blow up if HUGETLB_MINIMAL_COPY is disabled + do_elflink_test("xB." + cmd, HUGETLB_MINIMAL_COPY="no", **env) + do_elflink_test("xBDT." + cmd, HUGETLB_MINIMAL_COPY="no", **env) + # Test that HUGETLB_ELFMAP=no inhibits remapping as intended + do_elflink_test("xB." + cmd, HUGETLB_ELFMAP="no", **env) + do_elflink_test("xBDT." + cmd, HUGETLB_ELFMAP="no", **env) + +def elflink_rw_test(cmd, **env): + """ + Run the elflink_rw test with different configuration combinations. + + Test various combinations of: remapping modes and minimal copy on or off. + """ + # Basic tests: None, Read-only, Write-only, Read-Write, explicit disable + do_test(cmd, **env) + do_test(cmd, HUGETLB_ELFMAP="R", **env) + do_test(cmd, HUGETLB_ELFMAP="W", **env) + do_test(cmd, HUGETLB_ELFMAP="RW", **env) + do_test(cmd, HUGETLB_ELFMAP="no", **env) + + # Test we don't blow up if HUGETLB_MINIMAL_COPY is disabled + do_test(cmd, HUGETLB_MINIMAL_COPY="no", HUGETLB_ELFMAP="R", **env) + do_test(cmd, HUGETLB_MINIMAL_COPY="no", HUGETLB_ELFMAP="W", **env) + do_test(cmd, HUGETLB_MINIMAL_COPY="no", HUGETLB_ELFMAP="RW", **env) + +def elfshare_test(cmd, **env): + """ + Test segment sharing with multiple configuration variations. + """ + # Run each elfshare test invocation independently - clean up the + # sharefiles before and after in the first set of runs, but leave + # them there in the second: + clear_hpages() + do_elflink_test("xB." + cmd, HUGETLB_SHARE="1", **env) + clear_hpages() + do_elflink_test("xBDT." + cmd, HUGETLB_SHARE="1", **env) + clear_hpages() + do_elflink_test("xB." + cmd, HUGETLB_SHARE="1", **env) + do_elflink_test("xBDT." 
+ cmd, HUGETLB_SHARE="1", **env) + clear_hpages() + +def elflink_and_share_test(cmd, **env): + """ + Run the ordinary linkhuge tests with sharing enabled + """ + # Run each elflink test pair independently - clean up the sharefiles + # before and after each pair + clear_hpages() + for link_str in ("xB.", "xBDT."): + for i in range(2): + do_elflink_test(link_str + cmd, HUGETLB_SHARE=repr(i), **env) + clear_hpages() + +def elflink_rw_and_share_test(cmd, **env): + """ + Run the ordinary linkhuge_rw tests with sharing enabled + """ + clear_hpages() + for mode in ("R", "W", "RW"): + for i in range(2): + do_test(cmd, HUGETLB_ELFMAP=mode, HUGETLB_SHARE=repr(i), **env) + clear_hpages() + +def setup_shm_sysctl(limit): + """ + Adjust the kernel shared memory limits to accomodate a desired size. + + The original values are returned in a dictionary that can be passed to + restore_shm_sysctl() to restore the system state. + """ + if os.getuid() != 0: return {} + sysctls = {} + files = [ "/proc/sys/kernel/shmmax", "/proc/sys/kernel/shmall"] + for f in files: + fh = open(f, "r") + sysctls[f] = fh.read() + fh.close() + fh = open(f, "w") + fh.write(`limit`) + fh.close() + print "set shmmax limit to %s" % limit + return sysctls + +def restore_shm_sysctl(sysctls): + """ + Restore the sysctls named in 'sysctls' to the given values. + """ + if os.getuid() != 0: return + for (file, val) in sysctls.items(): + fh = open(file, "w") + fh.write(val) + fh.close() + +def do_shm_test(cmd, limit=None, bits=None, **env): + """ + Run a test case with temporarily expanded SysV shm limits, testing + each indicated word size. + """ + if bits == None: + bits = wordsizes + if limit != None: + tmp = setup_shm_sysctl(limit) + for b in bits: + run_test(system_default_hpage_size, b, cmd, **env) + if limit != None: + restore_shm_sysctl(tmp) + +def functional_tests(): + """ + Run the set of functional tests. 
+ """ + global linkhuge_wordsizes + + # Kernel background tests not requiring hugepage support + do_test("zero_filesize_segment") + + # Library background tests not requiring hugepage support + do_test("test_root") + do_test("meminfo_nohuge") + + # Library tests requiring kernel hugepage support + do_test("gethugepagesize") + do_test("gethugepagesizes") + do_test("empty_mounts", HUGETLB_VERBOSE="1") + do_test("large_mounts", HUGETLB_VERBOSE="1") + + # Tests requiring an active and usable hugepage mount + do_test("find_path") + do_test("unlinked_fd") + do_test("readback") + do_test("truncate") + do_test("shared") + do_test("mprotect") + do_test_with_rlimit(resource.RLIMIT_MEMLOCK, -1, "mlock") + do_test("misalign") + do_test("fallocate_basic.sh") + do_test("fallocate_align.sh") + + # Specific kernel bug tests + do_test("ptrace-write-hugepage") + do_test("icache-hygiene") + do_test("slbpacaflush") + do_test("straddle_4GB_static", bits=(64,)) + do_test("huge_at_4GB_normal_below_static", bits=(64,)) + do_test("huge_below_4GB_normal_above_static", bits=(64,)) + do_test("map_high_truncate_2") + do_test("misaligned_offset") + do_test("truncate_above_4GB") + do_test("brk_near_huge") + do_test("task-size-overrun") + do_test_with_rlimit(resource.RLIMIT_STACK, -1, "stack_grow_into_huge") + do_test("corrupt-by-cow-opt") + do_test("noresv-preserve-resv-page") + do_test("noresv-regarded-as-resv") + + if dangerous == 1: + do_test("readahead_reserve") + do_test("madvise_reserve") + do_test("fadvise_reserve") + do_test("mremap-expand-slice-collision") + do_test("mremap-fixed-normal-near-huge") + do_test("mremap-fixed-huge-near-normal") + else: + do_test("readahead_reserve.sh") + do_test("madvise_reserve.sh") + do_test("fadvise_reserve.sh") + do_test("mremap-expand-slice-collision.sh") + do_test("mremap-fixed-normal-near-huge.sh") + do_test("mremap-fixed-huge-near-normal.sh") + do_shm_test("shm-perms", 64*1024*1024) + + # Tests requiring an active mount and hugepage COW + 
do_test("private") + do_test("fork-cow") + do_test("direct") + do_test("malloc") + do_test("malloc", LD_PRELOAD="libhugetlbfs.so", HUGETLB_MORECORE="yes") + do_test("malloc", LD_PRELOAD="libhugetlbfs.so", HUGETLB_MORECORE="yes", + HUGETLB_RESTRICT_EXE="unknown:none") + do_test("malloc", LD_PRELOAD="libhugetlbfs.so", HUGETLB_MORECORE="yes", + HUGETLB_RESTRICT_EXE="unknown:malloc") + do_test("malloc_manysmall") + do_test("malloc_manysmall", LD_PRELOAD="libhugetlbfs.so", + HUGETLB_MORECORE="yes") + + # After upstream commit: (glibc-2.25.90-688-gd5c3fafc43) glibc has a + # new per-thread caching mechanism that will NOT allow heapshrink test to + # successfully measure if heap has shrunk or not due to the fact that + # heap won't have its sized reduced right away. + # + # In order to disable it, you need to have the tunable GLIBC in place. + # Unfortunately, it requires to be set before program is loaded, as an + # environment variable, since we can't re-initialize malloc() from the + # program context (not even with a constructor function), and the tunable + # is only evaluated during malloc() initialization. 
+ + do_test("heapshrink", + GLIBC_TUNABLES="glibc.malloc.tcache_count=0") + do_test("heapshrink", + GLIBC_TUNABLES="glibc.malloc.tcache_count=0", + LD_PRELOAD="libheapshrink.so") + do_test("heapshrink", + GLIBC_TUNABLES="glibc.malloc.tcache_count=0", + LD_PRELOAD="libhugetlbfs.so", + HUGETLB_MORECORE="yes") + do_test("heapshrink", + GLIBC_TUNABLES="glibc.malloc.tcache_count=0", + LD_PRELOAD="libhugetlbfs.so libheapshrink.so", + HUGETLB_MORECORE="yes") + do_test("heapshrink", + GLIBC_TUNABLES="glibc.malloc.tcache_count=0", + LD_PRELOAD="libheapshrink.so", + HUGETLB_MORECORE="yes", + HUGETLB_MORECORE_SHRINK="yes") + do_test("heapshrink", + GLIBC_TUNABLES="glibc.malloc.tcache_count=0", + LD_PRELOAD="libhugetlbfs.so libheapshrink.so", + HUGETLB_MORECORE="yes", + HUGETLB_MORECORE_SHRINK="yes") + + do_test("heap-overflow", HUGETLB_VERBOSE="1", HUGETLB_MORECORE="yes") + + # Run the remapping tests' up-front checks + linkhuge_wordsizes = check_linkhuge_tests() + # Original elflink tests + elflink_test("linkhuge_nofd", HUGETLB_VERBOSE="0") + elflink_test("linkhuge") + + # Only run custom ldscript tests when -l option is set + if custom_ldscripts: + # Original elflink sharing tests + elfshare_test("linkshare") + elflink_and_share_test("linkhuge") + + # elflink_rw tests + elflink_rw_test("linkhuge_rw") + # elflink_rw sharing tests + elflink_rw_and_share_test("linkhuge_rw") + + # Accounting bug tests + # reset free hpages because sharing will have held some + # alternatively, use + do_test("chunk-overcommit") + do_test(("alloc-instantiate-race", "shared")) + do_test(("alloc-instantiate-race", "private")) + do_test("truncate_reserve_wraparound") + do_test("truncate_sigbus_versus_oom") + + # Test direct allocation API + do_test("get_huge_pages") + + # Test overriding of shmget() + do_shm_test("shmoverride_linked") + do_shm_test("shmoverride_linked", HUGETLB_SHM="yes") + do_shm_test("shmoverride_linked_static") + do_shm_test("shmoverride_linked_static", HUGETLB_SHM="yes") + 
do_shm_test("shmoverride_unlinked", LD_PRELOAD="libhugetlbfs.so") + do_shm_test("shmoverride_unlinked", LD_PRELOAD="libhugetlbfs.so", HUGETLB_SHM="yes") + + # Test hugetlbfs filesystem quota accounting + do_test("quota.sh") + + # Test accounting of HugePages_{Total|Free|Resv|Surp} + # Alters the size of the hugepage pool so should probably be run last + do_test("counters.sh") + +def stress_tests(): + """ + Run the set of stress tests. + """ + iterations = 10 # Number of iterations for looping tests + + # Don't update NRPAGES every time like above because we want to catch the + # failures that happen when the kernel doesn't release all of the huge pages + # after a stress test terminates + (rc, nr_pages) = free_hpages() + + do_test(("mmap-gettest", repr(iterations), repr(nr_pages))) + + # mmap-cow needs a hugepages for each thread plus one extra + do_test(("mmap-cow", repr(nr_pages-1), repr(nr_pages))) + + (rc, tot_pages) = total_hpages() + limit = system_default_hpage_size * tot_pages + threads = 10 # Number of threads for shm-fork + + # Run shm-fork once using half available hugepages, then once using all + # This is to catch off-by-ones or races in the kernel allocated that + # can make allocating all hugepages a problem + if nr_pages > 1: + do_shm_test(("shm-fork", repr(threads), repr(nr_pages / 2)), limit) + do_shm_test(("shm-fork", repr(threads), repr(nr_pages)), limit) + + do_shm_test(("shm-getraw", repr(nr_pages), "/dev/full"), limit) + + do_test("fallocate_stress.sh") + +def print_help(): + print "Usage: %s [options]" % sys.argv[0] + print "Options:" + print " -v \t Verbose output." + print " -V \t Highly verbose output." + print " -f \t Force all tests." + print " -t Run test set, allowed are func and stress." + print " -b Define wordsizes to be used. " + print " -p Define the page sizes to be used." + print " -c \t Do a paranoid pool check." + print " -l \t Use custom ld scripts." + print " -h \t This help." 
+ sys.exit(0) + +def main(): + global wordsizes, pagesizes, dangerous, paranoid_pool_check, system_default_hpage_size + global custom_ldscripts + testsets = set() + env_override = {"QUIET_TEST": "1", "HUGETLBFS_MOUNTS": "", + "HUGETLB_ELFMAP": None, "HUGETLB_MORECORE": None} + env_defaults = {"HUGETLB_VERBOSE": "0"} + dangerous = 0 + paranoid_pool_check = False + custom_ldscripts = False + + try: + opts, args = getopt.getopt(sys.argv[1:], "vVft:b:p:c:lh") + except getopt.GetoptError, err: + print str(err) + sys.exit(1) + for opt, arg in opts: + if opt == "-v": + env_override["QUIET_TEST"] = None + env_defaults["HUGETLB_VERBOSE"] = "2" + elif opt == "-V": + env_defaults["HUGETLB_VERBOSE"] = "99" + elif opt == "-f": + dangerous = 1 + elif opt == "-t": + for t in arg.split(): testsets.add(t) + elif opt == "-b": + for b in arg.split(): wordsizes.add(int(b)) + elif opt == "-p": + for p in arg.split(): pagesizes.add(int(p)) + elif opt == '-c': + paranoid_pool_check = True + elif opt == '-l': + custom_ldscripts = True + elif opt == '-h': + print_help() + else: + assert False, "unhandled option" + if len(testsets) == 0: testsets = set(["func", "stress"]) + if len(wordsizes) == 0: wordsizes = get_wordsizes() + if len(pagesizes) == 0: pagesizes = get_pagesizes() + + if len(pagesizes) == 0: + print "Unable to find available page sizes, are you sure hugetlbfs" + print "is mounted and there are available huge pages?" + return 1 + + setup_env(env_override, env_defaults) + init_results() + + (rc, system_default_hpage_size) = hpage_size() + if rc != 0: + print "Unable to find system default hugepage size." + print "Is hugepage supported included in this kernel?" 
+ return 1 + + check_hugetlbfs_path() + + if "func" in testsets: functional_tests() + if "stress" in testsets: stress_tests() + + results_summary() + +if __name__ == "__main__": + main() diff --git a/tests/shared.c b/tests/shared.c new file mode 100644 index 0000000..e04fb04 --- /dev/null +++ b/tests/shared.c @@ -0,0 +1,71 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p, *q; + unsigned int *pl, *ql; + int i; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + q = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (q == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + pl = p; + for (i = 0; i < (hpage_size / sizeof(*pl)); i++) { + pl[i] = RANDOM_CONSTANT ^ i; + } + + ql = 
q; + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + if (ql[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Mismatch"); + } + + PASS(); +} diff --git a/tests/shm-fork.c b/tests/shm-fork.c new file mode 100644 index 0000000..4f500af --- /dev/null +++ b/tests/shm-fork.c @@ -0,0 +1,137 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +#define P "shm-fork" +#define DESC \ + "* Test shared memory behavior when multiple threads are attached *\n"\ + "* to a segment. A segment is created and then children are *\n"\ + "* spawned which attach, write, read (verify), and detach from the *\n"\ + "* shared memory segment. *" + +extern int errno; + +/* Global Configuration */ +static int nr_hugepages; +static int numprocs; +static int shmid = -1; + +#define MAX_PROCS 200 +#define BUF_SZ 256 + +#define CHILD_FAIL(thread, fmt, ...) 
\ + do { \ + verbose_printf("Thread %d (pid=%d) FAIL: " fmt, \ + thread, getpid(), __VA_ARGS__); \ + exit(1); \ + } while (0) + +void cleanup(void) +{ + remove_shmid(shmid); +} + +static void do_child(int thread, unsigned long size) +{ + volatile char *shmaddr; + int j; + unsigned long k; + + verbose_printf("."); + for (j=0; j<5; j++) { + shmaddr = shmat(shmid, 0, SHM_RND); + if (shmaddr == MAP_FAILED) + CHILD_FAIL(thread, "shmat() failed: %s", + strerror(errno)); + + for (k=0;k <# pages>", argv[0]); + + numprocs = atoi(argv[1]); + nr_hugepages = atoi(argv[2]); + + if (numprocs > MAX_PROCS) + CONFIG("Cannot spawn more than %d processes", MAX_PROCS); + + check_hugetlb_shm_group(); + + hpage_size = check_hugepagesize(); + size = hpage_size * nr_hugepages; + verbose_printf("Requesting %lu bytes\n", size); + if ((shmid = shmget(2, size, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W )) < 0) + FAIL("shmget(): %s", strerror(errno)); + + verbose_printf("shmid: %d\n", shmid); + + verbose_printf("Spawning children:\n"); + for (i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +extern int errno; + +/* Global Configuration */ +#define P "shm-getraw" +#define DESC \ + "* This test exercizes the code path which performs raw device IO *\n"\ + "* into a large page backed shared memory segment. The specified *\n"\ + "* device will be read into a shared memory segment. 
*" + +static int nr_hugepages; +static int shmid = -1; + +void cleanup(void) +{ + remove_shmid(shmid); +} + +int main(int argc, char ** argv) +{ + size_t size; + size_t i; + long hpage_size = check_hugepagesize(); + volatile char *shmaddr; + char *buffer; + int raw_fd; + + test_init(argc, argv); + + check_hugetlb_shm_group(); + + if (argc < 3) + CONFIG("Usage: %s <# pages> ", argv[0]); + + nr_hugepages = atoi(argv[1]); + + verbose_printf("hpage_size is: %ld\n", hpage_size); + + buffer = malloc(hpage_size*sizeof(char)); + if (!buffer) + FAIL("malloc(%li)", hpage_size*sizeof(char)); + + raw_fd = open(argv[2], O_RDONLY); + if (!raw_fd) + CONFIG("Cannot open raw device: %s", strerror(errno)); + + size = hpage_size * nr_hugepages; + + verbose_printf("Requesting %zu bytes\n", size); + + if ((shmid = shmget(2, size, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W )) < 0) + FAIL("shmget(): %s", strerror(errno)); + + verbose_printf("shmid: 0x%x\n", shmid); + shmaddr = shmat(shmid, 0, SHM_RND) ; + if (shmaddr == MAP_FAILED) + FAIL("shmat() failed: %s", strerror(errno)); + + verbose_printf("shmaddr: %p\n", shmaddr); + + /* Read a page from device and write to shm segment */ + for (i = 0; i < size; i+=hpage_size) { + if (!read(raw_fd, buffer, hpage_size)) + FAIL("Can't read from raw device: %s", + strerror(errno)); + memcpy((char*)(shmaddr + i), buffer, hpage_size); + } + + verbose_printf("Done.\n"); + if (shmdt((const void *)shmaddr) != 0) + FAIL("shmdt() failed: %s", strerror(errno)); + + free(buffer); + PASS(); +} diff --git a/tests/shm-gettest.c b/tests/shm-gettest.c new file mode 100644 index 0000000..a0f17eb --- /dev/null +++ b/tests/shm-gettest.c @@ -0,0 +1,110 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +extern int errno; + +/* Global Configuration */ +#define P "shm-gettest" +#define DESC \ + "* A looping test to verify the functionality of large page backed *\n"\ + "* shared memory segments. A segment is created, written, *\n"\ + "* verified, and detached a specified number of times. 
*" + +static unsigned int iter; +static int nr_hugepages; +static int shmid = -1; +static long hpage_size; + +void cleanup(void) +{ + remove_shmid(shmid); +} + +static void do_one(size_t size) { + size_t i,j; + char pattern; + char *shmaddr; + + verbose_printf("Requesting %zu bytes\n", size); + + if ((shmid = shmget(2, size, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W )) < 0) + FAIL("shmget(): %s", strerror(errno)); + + verbose_printf("shmid: 0x%x\n", shmid); + + shmaddr = shmat(shmid, 0, SHM_RND) ; + if (shmaddr == MAP_FAILED) + FAIL("shmat(): %s", strerror(errno)); + + verbose_printf("shmaddr: %p\n", shmaddr); + + for (i = 0; i < nr_hugepages; i++) { + pattern = 65+(i%26); + verbose_printf("Touching %p with %c\n", shmaddr+(i*hpage_size),pattern); + memset(shmaddr+(i*hpage_size), pattern, hpage_size); + } + + for (i = 0; i < nr_hugepages; i++) { + pattern = 65+(i%26); + verbose_printf("Verifying %p\n", (shmaddr+(i*hpage_size))); + for (j = 0; j < hpage_size; j++) + if (*(shmaddr+(i*hpage_size)+j) != pattern) + FAIL("Verifying the segment failed. 
" + "Got %c, expected %c", + *(shmaddr+(i*hpage_size)+j), pattern); + } + + if (shmdt((const void *)shmaddr) != 0) + FAIL("shmdt(): %s", strerror(errno)); +} + +int main(int argc, char ** argv) +{ + size_t size; + int i; + + test_init(argc, argv); + + if (argc < 3) + CONFIG("Usage: shmgettest <# iterations> <# pages>\n"); + + check_hugetlb_shm_group(); + + iter = atoi(argv[1]); + nr_hugepages = atoi(argv[2]); + + hpage_size = gethugepagesize(); + size = nr_hugepages * hpage_size; + + for (i=0; i < iter; i++) + do_one(size); + + PASS(); +} diff --git a/tests/shm-perms.c b/tests/shm-perms.c new file mode 100644 index 0000000..590a101 --- /dev/null +++ b/tests/shm-perms.c @@ -0,0 +1,131 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +#define P "shm-perms" +#define DESC \ + "* Test shared memory behavior when multiple threads are attached *\n"\ + "* to a segment with different permissions. A segment is created *\n"\ + "* and children attach read-only to check reservation accounting. 
*" + +#define SEGMENT_SIZE ((size_t)0x4000000) +#define SEGMENT_KEY 0x82ba15ff +#define STRIDE 0x200000 + +static int global_shmid = -1; +void *shm_addr = NULL; + +void cleanup(void) +{ + remove_shmid(global_shmid); +} + +int attach_segment(size_t segsize, int shmflags, int shmperms) +{ + int shmid; + + /* Create/get large segment */ + shmid = shmget(SEGMENT_KEY, segsize, shmflags); + if (shmid == -1) { + perror("shmget(SEGMENT)"); + cleanup(); + exit(EXIT_FAILURE); + } + + /* Attach large segment */ + if ( (shm_addr = shmat(shmid, shm_addr, shmperms)) == (void *)-1) { + perror("shmat(SEGMENT)"); + cleanup(); + exit(EXIT_FAILURE); + } + + global_shmid = shmid; + return shmid; +} + +int main(int argc, char **argv) +{ + char *p; + pid_t *wait_list; + int i, iterations; + long hpage_size = check_hugepagesize(); + long total_hpages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + + /* Setup */ + test_init(argc, argv); + check_hugetlb_shm_group(); + if (hpage_size > SEGMENT_SIZE) + CONFIG("Page size is too large for configured SEGMENT_SIZE\n"); + check_free_huge_pages(SEGMENT_SIZE / hpage_size); + + iterations = (total_hpages * hpage_size) / SEGMENT_SIZE + 1; + verbose_printf("iterations = %d\n", iterations); + + wait_list = malloc(sizeof(pid_t) * iterations); + if (wait_list == NULL) + FAIL("Failed to allocate wait_list"); + + /* Create, attach and part init segment */ + attach_segment(SEGMENT_SIZE, IPC_CREAT|SHM_HUGETLB|0640, 0); + p = (char *)shm_addr; + for (i = 0; i < 4; i++, p += STRIDE) + memset(p, 0x55, STRIDE); + + /* Detach segment */ + if (shmdt(shm_addr) != 0) + FAIL("shmdt(SEGMENT)"); + + /* Create children to reattach read-only */ + for (i = 0; i < iterations; i++) { + pid_t pid; + pid = fork(); + if (pid == -1) + FAIL("fork"); + + if (pid) { + wait_list[i] = pid; + } else { + attach_segment(0, 0, SHM_RDONLY); + if (shmdt(shm_addr) != 0) { + perror("shmdt(SEGMENT)"); + exit(EXIT_FAILURE); + } + exit(EXIT_SUCCESS); + } + } + + /* Wait for all 
children to exit */ + for (i = 0; i < iterations; i++) { + int status; + if (waitpid(wait_list[i], &status, 0) == -1) + FAIL("waitpid"); + if (status != EXIT_SUCCESS) + FAIL("Child exited with failure"); + } + + PASS(); +} diff --git a/tests/shmoverride_unlinked.c b/tests/shmoverride_unlinked.c new file mode 100644 index 0000000..25c2b31 --- /dev/null +++ b/tests/shmoverride_unlinked.c @@ -0,0 +1,248 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hugetests.h" + +/* + * Test Scenario: + * + * libhugetlbfs_shmoverride can be used to force shmget() to use the + * SHM_HUGETLB flag. This test ensures that the flag is correctly used + * based on the value of the environment variable. The assumption is + * made that the library is being preloaded. 
+ */ + +extern int errno; + +/* Global test configuration */ +#define DYNAMIC_SYSCTL "/proc/sys/vm/nr_overcommit_hugepages" +static long saved_nr_hugepages = -1; +static long hpage_size, bpage_size; +static long oc_pool = -1; + +/* Required pool size for test */ +#define POOL_SIZE 4 + +/* State arrays for our mmaps */ +#define NR_SLOTS 1 +#define SL_TEST 0 +static int map_id[NR_SLOTS]; +static char *map_addr[NR_SLOTS]; +static size_t map_size[NR_SLOTS]; + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define SHMAT_FLAGS (SHM_RND) +#else +#define ADDR (void *)(0x0UL) +#define SHMAT_FLAGS (0) +#endif + +void _shmmap(int s, int hpages, int bpages, int line) +{ + map_size[s] = hpages * hpage_size + bpages * bpage_size; + map_id[s] = shmget(IPC_PRIVATE, map_size[s], IPC_CREAT | SHM_R | SHM_W); + if (map_id[s] < 0) + FAIL("shmget failed size %zd from line %d: %s", + map_size[s], line, strerror(errno)); + + map_addr[s] = shmat(map_id[s], ADDR, SHMAT_FLAGS); + if (map_addr[s] == (char *)-1) + FAIL("shmmat failed from line %d: %s", line, strerror(errno)); +} +#define shmmap(s, h, b) _shmmap(s, h, b, __LINE__) + +void _shmunmap(int s, int line) +{ + if (shmdt((const void *)map_addr[s]) != 0) { + FAIL("shmdt failed from line %d: %s", line, strerror(errno)); + return; + } + + if (shmctl(map_id[s], IPC_RMID, NULL) == -1) + FAIL("shmctl failed from line %d: %s", line, strerror(errno)); + + map_id[s] = -1; + map_addr[s] = NULL; + map_size[s] = 0; +} +#define shmunmap(s) _shmunmap(s, __LINE__) + +/* + * This test wants to manipulate the hugetlb pool without necessarily linking + * to libhugetlbfs so the helpers for doing this may not be available -- hence + * the duplicated versions below. + * + * NOTE: We use /proc/sys/vm/nr_hugepages and /proc/meminfo for writing and + * reading pool counters because shared memory will always use the system + * default huge page size regardless of any libhugetlbfs settings. 
+ */ +#define MEMINFO_SIZE 2048 +long local_read_meminfo(const char *tag) +{ + int fd; + char buf[MEMINFO_SIZE]; + int len, readerr; + char *p, *q; + long val; + + fd = open("/proc/meminfo", O_RDONLY); + if (fd < 0) + FAIL("Couldn't open /proc/meminfo: %s\n", strerror(errno)); + + len = read(fd, buf, sizeof(buf)); + readerr = errno; + close(fd); + if (len < 0) + FAIL("Error reading /proc/meminfo: %s\n", strerror(readerr)); + + if (len == sizeof(buf)) + FAIL("/proc/meminfo is too large\n"); + buf[len] = '\0'; + + p = strstr(buf, tag); + if (!p) + FAIL("Tag %s not found in /proc/meminfo\n", tag); + p += strlen(tag); + + val = strtol(p, &q, 0); + if (!isspace(*q)) + FAIL("Couldn't parse /proc/meminfo\n"); + + return val; +} + +void setup_hugetlb_pool(unsigned long count) +{ + FILE *fd; + unsigned long poolsize; + count += local_read_meminfo("HugePages_Rsvd:"); + fd = fopen("/proc/sys/vm/nr_hugepages", "w"); + if (!fd) + CONFIG("Cannot open nr_hugepages for writing\n"); + fprintf(fd, "%lu", count); + fclose(fd); + + /* Confirm the resize worked */ + poolsize = local_read_meminfo("HugePages_Total:"); + if (poolsize != count) + FAIL("Failed to resize pool to %lu pages. Got %lu instead\n", + count, poolsize); +} + +void local_check_free_huge_pages(int needed_pages) +{ + int free = local_read_meminfo("HugePages_Free:"); + if (free < needed_pages) + CONFIG("Must have at least %i free hugepages", needed_pages); +} + +void run_test(char *desc, int hpages, int bpages, int pool_nr, int expect_diff) +{ + long resv_before, resv_after; + verbose_printf("%s...\n", desc); + setup_hugetlb_pool(pool_nr); + + /* untouched, shared mmap */ + resv_before = local_read_meminfo("HugePages_Rsvd:"); + shmmap(SL_TEST, hpages, bpages); + resv_after = local_read_meminfo("HugePages_Rsvd:"); + memset(map_addr[SL_TEST], 0, map_size[SL_TEST]); + shmunmap(SL_TEST); + + if (resv_after - resv_before != expect_diff) + FAIL("%s: Reserve page count did not adjust by %d page. 
" + "Expected %li reserved pages but got %li pages", + desc, expect_diff, + resv_before + expect_diff, resv_after); +} + +void cleanup(void) +{ + int i; + + /* Clean up any allocated shmids */ + for (i = 0; i < NR_SLOTS; i++) + if (map_id[i] > 0) + shmctl(map_id[i], IPC_RMID, NULL); + + /* Restore the pool size. */ + if (saved_nr_hugepages >= 0) + setup_hugetlb_pool(saved_nr_hugepages); + + if (oc_pool > 0) + restore_overcommit_pages(hpage_size, oc_pool); +} + +int main(int argc, char **argv) +{ + char *env; + + test_init(argc, argv); + check_must_be_root(); + local_check_free_huge_pages(POOL_SIZE); + saved_nr_hugepages = local_read_meminfo("HugePages_Total:"); + + /* + * We cannot call check_hugepagesize because we are not linked to + * libhugetlbfs. This is a bit hacky but we are depending on earlier + * tests failing to catch when this wouldn't work + */ + hpage_size = local_read_meminfo("Hugepagesize:") * 1024; + bpage_size = getpagesize(); + oc_pool = read_nr_overcommit(hpage_size); + if (oc_pool > 0) + set_nr_overcommit_hugepages(hpage_size, 0); + + env = getenv("HUGETLB_SHM"); + + /* Now that all env parsing is in one location and is only done once + * during library init, we cannot modify the value of HGUETLB_SHM + * in the middle of the test, instead run the tests that fit with + * the current value of HUGETLB_SHM + */ + if (env && strcasecmp(env, "yes") == 0) { + /* Run the test with large pages */ + run_test("override-requested-aligned", 1, 0, POOL_SIZE, 1); + + /* Run the test with large pages but with an unaligned size */ + run_test("override-requested-unaligned", 1, 1, POOL_SIZE, 2); + + /* Run the test with no pool but requested large pages */ + setup_hugetlb_pool(0); + run_test("override-requested-aligned-nopool", 1, 0, 0, 0); + } else { + /* Run the test with small pages */ + run_test("override-not-requested-aligned", 1, 0, POOL_SIZE, 0); + } + + PASS(); +} diff --git a/tests/slbpacaflush.c b/tests/slbpacaflush.c new file mode 100644 index 
0000000..8893c4d --- /dev/null +++ b/tests/slbpacaflush.c @@ -0,0 +1,96 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +/* Test rationale: + * + * ppc64 kernels (prior to 2.6.15-rc5) have a bug in the hugepage SLB + * flushing path. After opening new hugetlb areas, we update the + * masks in the thread_struct, copy to the PACA, then do slbies on + * each CPU. The trouble is we only copy to the PACA on the CPU where + * we're opening the segments, which can leave a stale copy in the + * PACAs on other CPUs. + * + * This can be triggered either with multiple threads sharing the mm, + * or with a single thread which is migrated from one CPU, to another + * (where the mapping occurs), then back again (where we touch the + * stale SLB). We use the second method in this test, since it's + * easier to force (using sched_setaffinity). However it relies on a + * close-to-idle system, if any process other than a kernel thread + * runs on the first CPU between runs of the test process, the SLB + * will be flushed and we won't trigger the bug, hence the + * PASS_INCONCLUSIVE(). 
Obviously, this test won't work on a 1-cpu + * system (should get CONFIG() on the sched_setaffinity()). + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + volatile unsigned long *q; + int err; + cpu_set_t cpu0, cpu1; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + CPU_ZERO(&cpu0); + CPU_SET(0, &cpu0); + CPU_ZERO(&cpu1); + CPU_SET(1, &cpu1); + + err = sched_setaffinity(getpid(), CPU_SETSIZE/8, &cpu0); + if (err != 0) + CONFIG("sched_setaffinity(cpu0): %s", strerror(errno)); + + err = sched_setaffinity(getpid(), CPU_SETSIZE/8, &cpu1); + if (err != 0) + CONFIG("sched_setaffinity(): %s", strerror(errno)); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + err = sched_setaffinity(getpid(), CPU_SETSIZE/8, &cpu0); + if (err != 0) + CONFIG("sched_setaffinity(cpu0): %s", strerror(errno)); + + q = (volatile unsigned long *)(p + getpagesize()); + *q = 0xdeadbeef; + + PASS_INCONCLUSIVE(); +} diff --git a/tests/stack_grow_into_huge.c b/tests/stack_grow_into_huge.c new file mode 100644 index 0000000..a380da0 --- /dev/null +++ b/tests/stack_grow_into_huge.c @@ -0,0 +1,150 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include +#include "hugetests.h" + +/* + * Test rationale: + * + * On PowerPC, the address space is divided into segments. These segments can + * contain either huge pages or normal pages, but not both. All segments are + * initially set up to map normal pages. When a huge page mapping is created + * within a set of empty segments, they are "enabled" for huge pages at that + * time. Once enabled for huge pages, they can not be used again for normal + * pages for the remaining lifetime of the process. + * + * If the segment immediately preceeding the segment containing the stack is + * converted to huge pages and the stack is made to grow into the this + * preceeding segment, some kernels may attempt to map normal pages into the + * huge page-only segment -- resulting in bugs. + * + * The kernel bug in question was fixed by commit + * 0d59a01bc461bbab4017ff449b8401151ef44cf6. + */ + +#ifdef __LP64__ +#define STACK_ALLOCATION_SIZE (256*1024*1024) +#else +#define STACK_ALLOCATION_SIZE (16*1024*1024) +#endif + +void do_child(void *stop_address) +{ + struct rlimit r; + volatile int *x; + + /* corefile from this process is not interesting and limiting + * its size can save a lot of time. '1' is a special value, + * that will also abort dumping via pipe, which by default + * sets limit to RLIM_INFINITY. 
*/ + r.rlim_cur = 1; + r.rlim_max = 1; + setrlimit(RLIMIT_CORE, &r); + + do { + x = alloca(STACK_ALLOCATION_SIZE); + *x = 1; + } while ((void *)x >= stop_address); +} + +int main(int argc, char *argv[]) +{ + int fd, pid, s, ret; + struct rlimit r; + char *b; + long hpage_size = gethugepagesize(); + void *stack_address, *mmap_address, *heap_address; + + test_init(argc, argv); + + ret = getrlimit(RLIMIT_STACK, &r); + if (ret) + CONFIG("getrlimit failed: %s", strerror(errno)); + + if (r.rlim_cur != RLIM_INFINITY) + CONFIG("Stack rlimit must be 'unlimited'"); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + CONFIG("Couldn't get hugepage fd"); + + stack_address = alloca(0); + heap_address = sbrk(0); + + /* + * paranoia: start mapping two hugepages below the start of the stack, + * in case the alignment would cause us to map over something if we + * only used a gap of one hugepage. + */ + mmap_address = PALIGN(stack_address - 2 * hpage_size, hpage_size); + + do { + b = mmap(mmap_address, hpage_size, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_SHARED, fd, 0); + mmap_address -= hpage_size; + /* + * if we get all the way down to the heap, stop trying + */ + if (mmap_address <= heap_address) + break; + } while (b == MAP_FAILED); + + if (b == MAP_FAILED) + FAIL("mmap: %s", strerror(errno)); + + if ((pid = fork()) < 0) + FAIL("fork: %s", strerror(errno)); + + if (pid == 0) { + do_child(mmap_address); + exit(0); + } + + ret = waitpid(pid, &s, 0); + if (ret == -1) + FAIL("waitpid: %s", strerror(errno)); + + /* + * The child grows its stack until a failure occurs. We expect + * this to result in a SIGSEGV. If any other signal is + * delivered (ie. SIGTRAP) or no signal is sent at all, we + * determine the kernel has not behaved correctly and trigger a + * test failure. 
+ */ + if (WIFSIGNALED(s)) { + int sig = WTERMSIG(s); + + if (sig == SIGSEGV) { + PASS(); + } else { + FAIL("Got unexpected signal: %s", strsignal(sig)); + } + } + FAIL("Child not signalled"); +} diff --git a/tests/straddle_4GB.c b/tests/straddle_4GB.c new file mode 100644 index 0000000..e068a72 --- /dev/null +++ b/tests/straddle_4GB.c @@ -0,0 +1,117 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long straddle_addr; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + if (sizeof(void *) <= 4) + TEST_BUG("64-bit only"); + + if (hpage_size > FOURGB) + CONFIG("Huge page size too large"); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + straddle_addr = FOURGB - hpage_size; + + /* We first try to get the mapping without MAP_FIXED */ + verbose_printf("Mapping without MAP_FIXED at %lx...", straddle_addr); + p = mmap((void *)straddle_addr, 2*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED, fd, 0); + if (p == (void *)straddle_addr) { + /* These tests irrelevant if we didn't get the + * straddle address */ + verbose_printf("done\n"); + + if (test_addr_huge(p) != 1) + FAIL("Mapped address is not hugepage"); + + if (test_addr_huge(p + hpage_size) != 1) + FAIL("Mapped address is not hugepage"); + + verbose_printf("Clearing below 4GB..."); + memset(p, 0, hpage_size); + verbose_printf("done\n"); + + verbose_printf("Clearing above 4GB..."); + memset(p + hpage_size, 0, hpage_size); + verbose_printf("done\n"); + } else { + verbose_printf("got %p instead, never mind\n", p); + munmap(p, 2*hpage_size); + } + + verbose_printf("Mapping with MAP_FIXED at %lx...", straddle_addr); + p = mmap((void *)straddle_addr, 2*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) { + /* this area crosses last low slice and first high slice */ + unsigned long below_start = FOURGB - 256L*1024*1024; + unsigned long above_end = 1024L*1024*1024*1024; + if (range_is_mapped(below_start, above_end) == 1) { + 
verbose_printf("region (4G-256M)-1T is not free\n"); + verbose_printf("mmap() failed: %s\n", strerror(errno)); + PASS_INCONCLUSIVE(); + } else + FAIL("mmap() FIXED failed: %s\n", strerror(errno)); + } + if (p != (void *)straddle_addr) { + verbose_printf("got %p instead\n", p); + FAIL("Wrong address with MAP_FIXED"); + } + verbose_printf("done\n"); + + if (test_addr_huge(p) != 1) + FAIL("Mapped address is not hugepage"); + + if (test_addr_huge(p + hpage_size) != 1) + FAIL("Mapped address is not hugepage"); + + verbose_printf("Clearing below 4GB..."); + memset(p, 0, hpage_size); + verbose_printf("done\n"); + + verbose_printf("Clearing above 4GB..."); + memset(p + hpage_size, 0, hpage_size); + verbose_printf("done\n"); + + verbose_printf("Tested above 4GB\n"); + + PASS(); +} diff --git a/tests/task-size-overrun.c b/tests/task-size-overrun.c new file mode 100644 index 0000000..dc9ce0e --- /dev/null +++ b/tests/task-size-overrun.c @@ -0,0 +1,139 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LARGEFILE64_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define MAPS_BUF_SZ 4096 + +static unsigned long find_last_mapped(void) +{ + FILE *f; + char line[MAPS_BUF_SZ]; + char *tmp; + unsigned long start, end, off, ino; + int ret; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + ERROR("Failed to open /proc/self/maps: %s\n", strerror(errno)); + return -1; + } + + do { + tmp = fgets(line, MAPS_BUF_SZ, f); + } while (tmp); + fclose(f); + + verbose_printf("Last map: %s", line); + ret = sscanf(line, "%lx-%lx %*s %lx %*s %ld %*s", &start, &end, &off, &ino); + if (ret == EOF) + FAIL("Couldn't parse /proc/self/maps line: %s: %s\n", line, + strerror(errno)); + if (ret != 4) + FAIL("Couldn't parse /proc/self/maps line: %s\n", line); + + verbose_printf("Last map at 0x%lx-0x%lx\n", start, end); + return end; +} + +static unsigned long find_task_size(void) +{ + unsigned long addr; + void *p; + + addr = find_last_mapped(); + if (!addr || ((addr % getpagesize()) != 0)) + FAIL("Bogus stack end address, 0x%lx!?", addr); + + while (addr) { + p = mmap64((void *)addr, getpagesize(), PROT_READ, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0); + if (p == MAP_FAILED) { + verbose_printf("Searching map failed: %s\n", strerror(errno)); + return addr; + } + munmap(p, getpagesize()); + addr += getpagesize(); +#if defined(__powerpc64__) + if (addr > (1UL << 46) && addr < (1UL << 49)) + addr = 1UL << 49; +#endif +#if defined(__s390x__) + if (addr > (1UL << 42) && addr < (1UL << 53)) + addr = 1UL << 53; +#endif + } + /* addr wrapped around */ + return 0; +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long task_size; + unsigned long 
straddle_addr; + + test_init(argc, argv); + + task_size = find_task_size(); + + verbose_printf("TASK_SIZE = 0x%lx\n", task_size); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + straddle_addr = task_size - hpage_size; + straddle_addr = ALIGN(straddle_addr, hpage_size); + + /* We first try to get the mapping without MAP_FIXED */ + verbose_printf("Mapping without MAP_FIXED at %lx...", straddle_addr); + errno = 0; + p = mmap((void *)straddle_addr, 2*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED, fd, 0); + verbose_printf("%s\n", strerror(errno)); + if (p == (void *)straddle_addr) + FAIL("Apparently suceeded in mapping across TASK_SIZE boundary"); + + verbose_printf("Mapping with MAP_FIXED at %lx...", straddle_addr); + errno = 0; + p = mmap((void *)straddle_addr, 2*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, fd, 0); + verbose_printf("%s\n", strerror(errno)); + if (p != MAP_FAILED) + FAIL("Apparently suceeded in mapping across TASK_SIZE boundary"); + + PASS(); +} diff --git a/tests/test_root.c b/tests/test_root.c new file mode 100644 index 0000000..a6c842c --- /dev/null +++ b/tests/test_root.c @@ -0,0 +1,39 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + int val; + + test_init(argc, argv); + + val = hugetlbfs_test_path("/"); + + if (val) + FAIL("/ reports as hugetlbfs"); + + PASS(); +} diff --git a/tests/testutils.c b/tests/testutils.c new file mode 100644 index 0000000..6298370 --- /dev/null +++ b/tests/testutils.c @@ -0,0 +1,341 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LARGEFILE64_SOURCE +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hugetlbfs.h" +#include "hugetests.h" + +#define HUGETLBFS_MAGIC 0x958458f6 +#define BUF_SZ 1024 +#define MEMINFO_SZ 2048 + +int verbose_test = 1; +char *test_name; + +void check_must_be_root(void) +{ + uid_t uid = getuid(); + if (uid != 0) + CONFIG("Must be root"); +} + +void check_hugetlb_shm_group(void) +{ + int fd; + ssize_t ret; + char gid_buffer[64] = {0}; + gid_t hugetlb_shm_group; + gid_t gid = getgid(); + uid_t uid = getuid(); + + /* root is an exception */ + if (uid == 0) + return; + + fd = open("/proc/sys/vm/hugetlb_shm_group", O_RDONLY); + if (fd < 0) + ERROR("Unable to open /proc/sys/vm/hugetlb_shm_group: %s", + strerror(errno)); + ret = read(fd, &gid_buffer, sizeof(gid_buffer)); + if (ret < 0) + ERROR("Unable to read /proc/sys/vm/hugetlb_shm_group: %s", + strerror(errno)); + hugetlb_shm_group = atoi(gid_buffer); + close(fd); + if (hugetlb_shm_group != gid) + CONFIG("Do not have permission to use SHM_HUGETLB"); +} + +void __attribute__((weak)) cleanup(void) +{ +} + +#if 0 +static void segv_handler(int signum, siginfo_t *si, void *uc) +{ + FAIL("Segmentation fault"); +} +#endif + +static void sigint_handler(int signum, siginfo_t *si, void *uc) +{ + cleanup(); + fprintf(stderr, "%s: %s (pid=%d)\n", test_name, + strsignal(signum), getpid()); + exit(RC_BUG); +} + +void test_init(int argc, char *argv[]) +{ + int err; + struct sigaction sa_int = { + .sa_sigaction = sigint_handler, + }; + + test_name = argv[0]; + + err = sigaction(SIGINT, &sa_int, NULL); + if (err) + FAIL("Can't install SIGINT handler: %s", strerror(errno)); 
+ + if (getenv("QUIET_TEST")) + verbose_test = 0; + + verbose_printf("Starting testcase \"%s\", pid %d\n", + test_name, getpid()); +} + +#define MAPS_BUF_SZ 4096 + +static int read_maps(unsigned long addr, char *buf) +{ + FILE *f; + char line[MAPS_BUF_SZ]; + char *tmp; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + ERROR("Failed to open /proc/self/maps: %s\n", strerror(errno)); + return -1; + } + + while (1) { + unsigned long start, end, off, ino; + int ret; + + tmp = fgets(line, MAPS_BUF_SZ, f); + if (!tmp) + break; + + buf[0] = '\0'; + ret = sscanf(line, "%lx-%lx %*s %lx %*s %ld %255s", + &start, &end, &off, &ino, + buf); + if ((ret < 4) || (ret > 5)) { + ERROR("Couldn't parse /proc/self/maps line: %s\n", + line); + fclose(f); + return -1; + } + + if ((start <= addr) && (addr < end)) { + fclose(f); + return 1; + } + } + + fclose(f); + return 0; +} + +int range_is_mapped(unsigned long low, unsigned long high) +{ + FILE *f; + char line[MAPS_BUF_SZ]; + char *tmp; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + ERROR("Failed to open /proc/self/maps: %s\n", strerror(errno)); + return -1; + } + + while (1) { + unsigned long start, end; + int ret; + + tmp = fgets(line, MAPS_BUF_SZ, f); + if (!tmp) + break; + + ret = sscanf(line, "%lx-%lx", &start, &end); + if (ret != 2) { + ERROR("Couldn't parse /proc/self/maps line: %s\n", + line); + fclose(f); + return -1; + } + + if ((start >= low) && (start < high)) { + fclose(f); + return 1; + } + if ((end >= low) && (end < high)) { + fclose(f); + return 1; + } + + } + + fclose(f); + return 0; +} + +/* + * With the inclusion of MAP_HUGETLB it is now possible to have huge pages + * without using hugetlbfs, so not all huge page regions will show with the + * test that reads /proc/self/maps. Instead we ask /proc/self/smaps for + * the KernelPageSize. 
On success we return the page size (in bytes) for the + * mapping that contains addr, on failure we return 0 + */ +unsigned long long get_mapping_page_size(void *p) +{ + FILE *f; + char line[MAPS_BUF_SZ]; + char *tmp; + unsigned long addr = (unsigned long)p; + + f = fopen("/proc/self/smaps", "r"); + if (!f) { + ERROR("Unable to open /proc/self/smaps\n"); + return 0; + } + + while ((tmp = fgets(line, MAPS_BUF_SZ, f))) { + unsigned long start, end, dummy; + char map_name[256]; + char buf[64]; + int ret; + + ret = sscanf(line, "%lx-%lx %s %lx %s %ld %s", &start, &end, + buf, &dummy, buf, &dummy, map_name); + if (ret < 7 || start > addr || end <= addr) + continue; + + while ((tmp = fgets(line, MAPS_BUF_SZ, f))) { + unsigned long long page_size; + + ret = sscanf(line, "KernelPageSize: %lld kB", + &page_size); + if (ret == 0 ) + continue; + if (ret < 1 || page_size <= 0) { + ERROR("Cannot parse /proc/self/smaps\n"); + page_size = 0; + } + + fclose(f); + /* page_size is reported in kB, we return B */ + return page_size * 1024; + } + } + + /* We couldn't find an entry for this addr in smaps */ + fclose(f); + return 0; +} + +/* We define this function standalone, rather than in terms of + * hugetlbfs_test_path() so that we can use it without -lhugetlbfs for + * testing PRELOAD */ +int test_addr_huge(void *p) +{ + char name[256]; + char *dirend; + int ret; + struct statfs64 sb; + + ret = read_maps((unsigned long)p, name); + if (ret < 0) + return ret; + if (ret == 0) { + verbose_printf("Couldn't find address %p in /proc/self/maps\n", + p); + return -1; + } + + /* looks like a filename? 
*/ + if (name[0] != '/') + return 0; + + /* Truncate the filename portion */ + + dirend = strrchr(name, '/'); + if (dirend && dirend > name) { + *dirend = '\0'; + } + + ret = statfs64(name, &sb); + if (ret) + return -1; + + return (sb.f_type == HUGETLBFS_MAGIC); +} + +ino_t get_addr_inode(void *p) +{ + char name[256]; + int ret; + struct stat sb; + + ret = read_maps((unsigned long)p, name); + if (ret < 0) + return ret; + if (ret == 0) { + ERROR("Couldn't find address %p in /proc/self/maps\n", p); + return -1; + } + + /* Don't care about non-filenames */ + if (name[0] != '/') + return 0; + + /* Truncate the filename portion */ + + ret = stat(name, &sb); + if (ret < 0) { + /* Don't care about unlinked files */ + if (errno == ENOENT) + return 0; + ERROR("stat failed: %s\n", strerror(errno)); + return -1; + } + + return sb.st_ino; +} + +int remove_shmid(int shmid) +{ + if (shmid >= 0) { + if (shmctl(shmid, IPC_RMID, NULL) != 0) { + ERROR("shmctl(%x, IPC_RMID) failed (%s)\n", + shmid, strerror(errno)); + return -1; + } + } + return 0; +} diff --git a/tests/truncate.c b/tests/truncate.c new file mode 100644 index 0000000..a45c8c4 --- /dev/null +++ b/tests/truncate.c @@ -0,0 +1,79 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +static void sigbus_handler(int signum, siginfo_t *si, void *uc) +{ + PASS(); +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + volatile unsigned int *q; + int err; + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + q = p; + + /* Touch the memory */ + *q = 0; + + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("sigaction(): %s", strerror(errno)); + + + err = ftruncate(fd, 0); + if (err) + FAIL("ftruncate(): %s", strerror(errno)); + + *q; + + /* Should have SIGBUSed above */ + FAIL("Didn't SIGBUS"); +} diff --git a/tests/truncate_above_4GB.c b/tests/truncate_above_4GB.c new file mode 100644 index 0000000..4c427fc --- /dev/null +++ b/tests/truncate_above_4GB.c @@ -0,0 +1,159 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * Copyright (C) 2006 Hugh Dickins + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _LARGEFILE64_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* + * Test rationale: + * + * At one stage, a misconversion of hugetlb_vmtruncate_list to a + * prio_tree meant that on 32-bit machines, truncates at or above 4GB + * could truncate lower pages, resulting in BUG_ON()s. + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. + * + * The kernel bug in question was fixed with commit + * 856fc29505556cf263f3dcda2533cf3766c14ab6. 
+ */ +#define FOURGIG ((off64_t)0x100000000ULL) + +static void sigbus_handler_fail(int signum, siginfo_t *si, void *uc) +{ + FAIL("Unexpected SIGBUS"); +} + +static void sigbus_handler_pass(int signum, siginfo_t *si, void *uc) +{ + PASS(); +} + +int main(int argc, char *argv[]) +{ + int page_size; + long hpage_size; + long long buggy_offset, truncate_point; + int fd; + void *p, *q; + volatile unsigned int *pi, *qi; + int err; + struct sigaction sa_fail = { + .sa_sigaction = sigbus_handler_fail, + .sa_flags = SA_SIGINFO, + }; + struct sigaction sa_pass = { + .sa_sigaction = sigbus_handler_pass, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + page_size = getpagesize(); + hpage_size = check_hugepagesize(); + + check_free_huge_pages(3); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + truncate_point = FOURGIG; + buggy_offset = truncate_point / (hpage_size / page_size); + buggy_offset = ALIGN(buggy_offset, hpage_size); + + verbose_printf("Mapping 3 hpages at offset 0x%llx...", truncate_point); + /* First get arena of three hpages size, at file offset 4GB */ + q = mmap64(NULL, 3*hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, truncate_point); + if (q == MAP_FAILED) + FAIL("mmap() offset 4GB: %s", strerror(errno)); + verbose_printf("mapped at %p\n", q); + qi = q; + /* Touch the high page */ + *qi = 0; + + /* This part of the test makes the problem more obvious, but + * is not essential. It can't be done on segmented powerpc, where + * segment restrictions prohibit us from performing such a + * mapping, so skip it there. Similarly, ia64's address space + * restrictions prevent this. 
*/ +#if (defined(__powerpc__) && defined(PPC_NO_SEGMENTS)) \ + || !defined(__powerpc__) && !defined(__powerpc64__) \ + && !defined(__ia64__) + /* Replace middle hpage by tinypage mapping to trigger + * nr_ptes BUG */ + verbose_printf("Replacing map at %p-%p...", q + hpage_size, + q + hpage_size + hpage_size-1); + p = mmap64(q + hpage_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE|MAP_ANON, -1, 0); + if (p != q + hpage_size) + FAIL("mmap() before low hpage"); + verbose_printf("done\n"); + pi = p; + /* Touch one page to allocate its page table */ + *pi = 0; +#endif + + /* Replace top hpage by hpage mapping at confusing file offset */ + verbose_printf("Replacing map at %p with map from offset 0x%llx...", + q + 2*hpage_size, buggy_offset); + p = mmap64(q + 2*hpage_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE, fd, buggy_offset); + if (p != q + 2*hpage_size) + FAIL("mmap() buggy offset 0x%llx", buggy_offset); + verbose_printf("done\n"); + pi = p; + /* Touch the low page with something non-zero */ + *pi = 1; + + verbose_printf("Truncating at 0x%llx...", truncate_point); + err = ftruncate64(fd, truncate_point); + if (err) + FAIL("ftruncate(): %s", strerror(errno)); + verbose_printf("done\n"); + + err = sigaction(SIGBUS, &sa_fail, NULL); + if (err) + FAIL("sigaction() fail: %s", strerror(errno)); + + if (*pi != 1) + FAIL("Data 1 has changed to %u", *pi); + + err = sigaction(SIGBUS, &sa_pass, NULL); + if (err) + FAIL("sigaction() pass: %s", strerror(errno)); + + *qi; + + /* Should have SIGBUSed above */ + FAIL("Didn't SIGBUS on truncated page."); +} diff --git a/tests/truncate_reserve_wraparound.c b/tests/truncate_reserve_wraparound.c new file mode 100644 index 0000000..0e27787 --- /dev/null +++ b/tests/truncate_reserve_wraparound.c @@ -0,0 +1,130 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* + * Test rationale: + * + * At one stage, improper handling of tests against i_size could mess + * up accounting of reserved hugepages on certain truncate + * operations. + * + * This bug was fixed with a band-aid (enough to pass this test) in + * commit ebed4bfc8da8df5b6b0bc4a5064a949f04683509. A more complete + * fix still pending as of 3d4248885b9fca818e7fe6b66328e714876d36ad. 
+ */ + +#define RANDOM_CONSTANT 0x1234ABCD + +static sigjmp_buf sig_escape; + +static void sigbus_handler(int signum, siginfo_t *si, void *uc) +{ + siglongjmp(sig_escape, 17); +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + volatile unsigned int *q; + int err; + int sigbus_count = 0; + unsigned long initial_rsvd, rsvd; + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + initial_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count before map: %lu\n", initial_rsvd); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + q = p; + + verbose_printf("Reserve count after map: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_RSVD)); + + *q = 0; + verbose_printf("Reserve count after touch: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_RSVD)); + + err = ftruncate(fd, 0); + if (err) + FAIL("ftruncate(): %s", strerror(errno)); + + rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after truncate: %lu\n", rsvd); + if (rsvd != initial_rsvd) + FAIL("Reserved count is not restored after truncate: %lu instead of %lu", + rsvd, initial_rsvd); + + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("sigaction(): %s", strerror(errno)); + + if (sigsetjmp(sig_escape, 1) == 0) + *q; /* Fault, triggering a SIGBUS */ + else + sigbus_count++; + + if (sigbus_count != 1) + FAIL("Didn't SIGBUS after truncate"); + + rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after SIGBUS fault: %lu\n", rsvd); + if (rsvd != initial_rsvd) + FAIL("Reserved count is altered by SIGBUS fault: %lu instead of %lu", + rsvd, initial_rsvd); + + munmap(p, hpage_size); + + 
verbose_printf("Reserve count after munmap(): %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_RSVD)); + + close(fd); + + verbose_printf("Reserve count after close(): %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_RSVD)); + + PASS(); +} diff --git a/tests/truncate_sigbus_versus_oom.c b/tests/truncate_sigbus_versus_oom.c new file mode 100644 index 0000000..7aa2fe5 --- /dev/null +++ b/tests/truncate_sigbus_versus_oom.c @@ -0,0 +1,100 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +/* + * Test rationale: + * + * Some kernel have a bug in the positioning of the test against + * i_size. This bug means that attempting to instantiate a page + * beyond the end of a hugepage file can result in an OOM and SIGKILL + * instead of the correct SIGBUS. + * + * This bug was fixed by commit ebed4bfc8da8df5b6b0bc4a5064a949f04683509. 
+ */ +static void sigbus_handler(int signum, siginfo_t *si, void *uc) +{ + PASS(); +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd, fdx; + unsigned long totpages; + void *p, *q; + int i; + int err; + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + totpages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + err = ftruncate(fd, 0); + if (err) + FAIL("ftruncate(): %s", strerror(errno)); + + /* Now slurp up all the available pages */ + fdx = hugetlbfs_unlinked_fd(); + if (fdx < 0) + FAIL("hugetlbfs_unlinked_fd() 2"); + + q = mmap(NULL, totpages * hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fdx, 0); + if (q == MAP_FAILED) + FAIL("mmap() reserving all pages: %s", strerror(errno)); + + /* Touch the pages to ensure they're removed from the pool */ + for (i = 0; i < totpages; i++) { + volatile char *x = (volatile char *)q + i*hpage_size; + *x = 0; + } + + /* SIGBUS is what *should* happen */ + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("sigaction(): %s", strerror(errno)); + + *((volatile unsigned int *)p); + + /* Should have SIGBUSed above, or (failed the test) with SIGKILL */ + FAIL("Didn't SIGBUS or OOM"); +} diff --git a/tests/unlinked_fd.c b/tests/unlinked_fd.c new file mode 100644 index 0000000..98bd4ee --- /dev/null +++ b/tests/unlinked_fd.c @@ -0,0 +1,60 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include + +#include + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + int err; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + err = test_addr_huge(p); + if (err != 1) + FAIL("Mapped address is not hugepage"); + + err = munmap(p, hpage_size); + if (err != 0) + FAIL("munmap(): %s", strerror(errno)); + + if (close(fd)) + FAIL("close(): %s", strerror(errno)); + + PASS(); +} diff --git a/tests/wrapper-utils.sh b/tests/wrapper-utils.sh new file mode 100644 index 0000000..2f6451d --- /dev/null +++ b/tests/wrapper-utils.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Standard return codes +RC_PASS=0 +RC_CONFIG=1 +RC_FAIL=2 +RC_XFAIL=3 +RC_XPASS=4 +RC_BUG=99 + +function unexpected_pass() +{ + echo -n "UNEXPECTED " +} + +function expected_fail() +{ + echo -n "EXPECTED " +} + +# check_rc (, ) +# Returns: Adjusted return code +# +# Check the actual and expected return codes to identify +# expected failures and 
unexpected passes. +function check_rc() +{ + EXP_RC=$1 + ACT_RC=$2 + + if [ $ACT_RC -eq $RC_PASS -a $EXP_RC -ne $RC_PASS ]; then + unexpected_pass + return $RC_XPASS + elif [ $EXP_RC -ne $RC_PASS -a $EXP_RC -eq $ACT_RC ]; then + expected_fail + return $RC_XFAIL + else + return $ACT_RC + fi +} + +# exec_and_check (, ) +# Does not return +# Execute a test command and check for expected failures and unexpected passes. +function exec_and_check() +{ + EXP_RC=$1 + shift + + OUTPUT=`$@` + check_rc $EXP_RC $? + RC=$? + echo $OUTPUT + + exit $RC +} diff --git a/tests/zero_filesize_segment.c b/tests/zero_filesize_segment.c new file mode 100644 index 0000000..22f52f1 --- /dev/null +++ b/tests/zero_filesize_segment.c @@ -0,0 +1,60 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include +#include +#include + +#include "hugetests.h" + +static int parse_phdrs(struct dl_phdr_info *info, size_t size, void *data) +{ + int i; + /* This should only be iterated once - we assume that the + * first iteration is the phdrs for the main executable */ + + for (i = 0; i < info->dlpi_phnum; i++) { + const ElfW(Phdr) *phdr = &info->dlpi_phdr[i]; + + if (phdr->p_type != PT_LOAD) + continue; + + verbose_printf("PHDR %d: filesz = 0x%lx, memsz = 0x%lx\n", + i, (unsigned long)phdr->p_filesz, + (unsigned long)phdr->p_memsz); + if (phdr->p_filesz == 0) + PASS(); + } + + return 1; +} + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + + /* If we're even able to load, that's a good start, but lets + * verify that we really do have a segment with + * zero-filesize. */ + dl_iterate_phdr(parse_phdrs, NULL); + + FAIL("Couldn't find zero filesize segment (test misbuilt)"); +} diff --git a/tests/zero_filesize_segment.ld b/tests/zero_filesize_segment.ld new file mode 100644 index 0000000..7f2fe12 --- /dev/null +++ b/tests/zero_filesize_segment.ld @@ -0,0 +1,7 @@ +SECTIONS +{ + .empty (0x20000000) : { + __empty_segment = .; + . = . 
+ 4; + } +} diff --git a/version b/version new file mode 100644 index 0000000..2ef40bd --- /dev/null +++ b/version @@ -0,0 +1 @@ +2.21 diff --git a/version.c b/version.c new file mode 100644 index 0000000..0ab886a --- /dev/null +++ b/version.c @@ -0,0 +1,3 @@ +#include "version.h" + +static const char libhugetlbfs_version[] = "VERSION: "VERSION; diff --git a/version.lds b/version.lds new file mode 100644 index 0000000..e76b8f7 --- /dev/null +++ b/version.lds @@ -0,0 +1,28 @@ +VERS_1.0 { + global: + gethugepagesize; + hugetlbfs_test_path; + hugetlbfs_find_path; + hugetlbfs_unlinked_fd; + local: + direct_syscall; + __lh_*; + __pu_*; +}; + +HTLBFS_2.0 { + global: + get_huge_pages; + free_huge_pages; +}; + +HTLBFS_2.1 { + global: + get_hugepage_region; + free_hugepage_region; + gethugepagesizes; + getpagesizes; + hugetlbfs_find_path_for_size; + hugetlbfs_unlinked_fd_for_size; + __tp_*; +};