/*
* Copyright © 2009 CNRS
* Copyright © 2009-2018 Inria. All rights reserved.
* Copyright © 2009-2013, 2015 Université Bordeaux
* Copyright © 2009-2018 Cisco Systems, Inc. All rights reserved.
* Copyright © 2015 Intel, Inc. All rights reserved.
* Copyright © 2010 IBM
* See COPYING in top-level directory.
*/
#include <private/autogen/config.h>
#include <hwloc.h>
#include <hwloc/linux.h>
#include <private/misc.h>
#include <private/private.h>
#include <private/misc.h>
#include <private/debug.h>
#include <limits.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <assert.h>
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HWLOC_HAVE_LIBUDEV
#include <libudev.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <sched.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <mntent.h>
/* Private data of the Linux backend (as the structure name indicates).
 * It groups the filesystem-root handle used to browse sysfs/procfs together
 * with a few facts about the platform that are cached for later use.
 */
struct hwloc_linux_backend_data_s {
char *root_path; /* NULL if unused */
int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
#ifdef HWLOC_HAVE_LIBUDEV
struct udev *udev; /* Global udev context */
#endif
char *dumped_hwdata_dirname; /* NOTE(review): presumably the directory containing dumped hardware data -- confirm where it is set */
/* Architecture family; UNKNOWN is the catch-all value */
enum {
HWLOC_LINUX_ARCH_X86, /* x86 32 or 64bits, including k1om (KNC) */
HWLOC_LINUX_ARCH_IA64,
HWLOC_LINUX_ARCH_ARM,
HWLOC_LINUX_ARCH_POWER,
HWLOC_LINUX_ARCH_UNKNOWN
} arch;
int is_knl; /* NOTE(review): presumably a boolean for Intel Knights Landing -- confirm where it is set */
int is_amd_with_CU; /* NOTE(review): presumably a boolean for AMD processors with Compute Units -- confirm where it is set */
struct utsname utsname; /* fields contain \0 when unknown */
int fallback_nbprocessors; /* only used in hwloc_linux_fallback_pu_level(), may be <= 0 (error) earlier */
unsigned pagesize; /* NOTE(review): presumably the system page size in bytes -- confirm where it is set */
};
/***************************
* Misc Abstraction layers *
***************************/
#include <linux/unistd.h>
/* Fallback definitions of sched_setaffinity() and sched_getaffinity() as
 * macros around syscall(), only compiled when HWLOC_HAVE_SCHED_SETAFFINITY is
 * not defined and HWLOC_HAVE_SYSCALL is.
 * When the headers do not provide the __NR_* number, a per-architecture value
 * is defined below; on unlisted architectures the macro sets errno to ENOSYS
 * and evaluates to -1.
 */
#if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE_SYSCALL)
/* libc doesn't have support for sched_setaffinity, make system call
* ourselves: */
# ifndef __NR_sched_setaffinity
# ifdef __i386__
# define __NR_sched_setaffinity 241
# elif defined(__x86_64__)
# define __NR_sched_setaffinity 203
# elif defined(__ia64__)
# define __NR_sched_setaffinity 1231
# elif defined(__hppa__)
# define __NR_sched_setaffinity 211
# elif defined(__alpha__)
# define __NR_sched_setaffinity 395
# elif defined(__s390__)
# define __NR_sched_setaffinity 239
# elif defined(__sparc__)
# define __NR_sched_setaffinity 261
# elif defined(__m68k__)
# define __NR_sched_setaffinity 311
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
# define __NR_sched_setaffinity 222
# elif defined(__arm__)
# define __NR_sched_setaffinity 241
# elif defined(__cris__)
# define __NR_sched_setaffinity 241
/*# elif defined(__mips__)
# define __NR_sched_setaffinity TODO (32/64/nabi) */
# else
# warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
/* Unknown architecture: always fail with ENOSYS */
# define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
# endif
# endif
/* Only define the syscall-based macro if the ENOSYS stub above was not chosen */
# ifndef sched_setaffinity
# define sched_setaffinity(pid, lg, mask) syscall(__NR_sched_setaffinity, pid, lg, mask)
# endif
/* Same logic for sched_getaffinity */
# ifndef __NR_sched_getaffinity
# ifdef __i386__
# define __NR_sched_getaffinity 242
# elif defined(__x86_64__)
# define __NR_sched_getaffinity 204
# elif defined(__ia64__)
# define __NR_sched_getaffinity 1232
# elif defined(__hppa__)
# define __NR_sched_getaffinity 212
# elif defined(__alpha__)
# define __NR_sched_getaffinity 396
# elif defined(__s390__)
# define __NR_sched_getaffinity 240
# elif defined(__sparc__)
# define __NR_sched_getaffinity 260
# elif defined(__m68k__)
# define __NR_sched_getaffinity 312
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
# define __NR_sched_getaffinity 223
# elif defined(__arm__)
# define __NR_sched_getaffinity 242
# elif defined(__cris__)
# define __NR_sched_getaffinity 242
/*# elif defined(__mips__)
# define __NR_sched_getaffinity TODO (32/64/nabi) */
# else
# warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
/* Unknown architecture: always fail with ENOSYS */
# define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
# endif
# endif
/* Any non-negative syscall result is mapped to 0, any negative one to -1,
 * so that callers may simply test for a negative value.
 */
# ifndef sched_getaffinity
# define sched_getaffinity(pid, lg, mask) (syscall(__NR_sched_getaffinity, pid, lg, mask) < 0 ? -1 : 0)
# endif
#endif
/* NUMA syscalls are only wrapped by libnuma, but libnuma development headers
 * aren't widely installed. Redefine what we need here so that these headers
 * are not required just because of a few missing syscall wrappers.
 * __NR_foo should be defined by the system headers on all modern platforms;
 * we only redefine the basic ones on important platforms when they are not
 * too hard to detect/define.
 *
 * The constants below are only defined when the included headers did not
 * already provide them.
 */
/* Memory policy modes */
#ifndef MPOL_DEFAULT
# define MPOL_DEFAULT 0
#endif
#ifndef MPOL_PREFERRED
# define MPOL_PREFERRED 1
#endif
#ifndef MPOL_BIND
# define MPOL_BIND 2
#endif
#ifndef MPOL_INTERLEAVE
# define MPOL_INTERLEAVE 3
#endif
#ifndef MPOL_LOCAL
# define MPOL_LOCAL 4
#endif
/* Flag bits (MPOL_F_* and MPOL_MF_* are distinct flag families, hence the overlapping values) */
#ifndef MPOL_F_ADDR
# define MPOL_F_ADDR (1<<1)
#endif
#ifndef MPOL_MF_STRICT
# define MPOL_MF_STRICT (1<<0)
#endif
#ifndef MPOL_MF_MOVE
# define MPOL_MF_MOVE (1<<1)
#endif
/* Fallback syscall numbers for mbind, used only when the system headers
 * do not define __NR_mbind. Unlisted architectures get no number at all.
 */
#ifndef __NR_mbind
# ifdef __i386__
#  define __NR_mbind 274
# elif defined(__x86_64__)
#  define __NR_mbind 237
# elif defined(__ia64__)
#  define __NR_mbind 1259
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#  define __NR_mbind 259
# elif defined(__sparc__)
#  define __NR_mbind 353
# endif
#endif

/* Direct invocation of the mbind system call.
 * Returns whatever syscall() returns, or -1 with errno set to ENOSYS
 * when the syscall number or syscall() is not available at build time.
 */
static __hwloc_inline long hwloc_mbind(void *addr __hwloc_attribute_unused,
				       unsigned long len __hwloc_attribute_unused,
				       int mode __hwloc_attribute_unused,
				       const unsigned long *nodemask __hwloc_attribute_unused,
				       unsigned long maxnode __hwloc_attribute_unused,
				       unsigned flags __hwloc_attribute_unused)
{
#if !(defined __NR_mbind) || !(defined HWLOC_HAVE_SYSCALL)
  /* nothing we can call on this platform */
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_mbind, (long) addr, len, mode, (long)nodemask, maxnode, flags);
#endif
}
/* Fallback syscall numbers for set_mempolicy, used only when the system
 * headers do not define __NR_set_mempolicy.
 */
#ifndef __NR_set_mempolicy
# ifdef __i386__
#  define __NR_set_mempolicy 276
# elif defined(__x86_64__)
#  define __NR_set_mempolicy 239
# elif defined(__ia64__)
#  define __NR_set_mempolicy 1261
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#  define __NR_set_mempolicy 261
# elif defined(__sparc__)
#  define __NR_set_mempolicy 305
# endif
#endif

/* Direct invocation of the set_mempolicy system call.
 * Returns whatever syscall() returns, or -1 with errno set to ENOSYS
 * when the syscall number or syscall() is not available at build time.
 */
static __hwloc_inline long hwloc_set_mempolicy(int mode __hwloc_attribute_unused,
					       const unsigned long *nodemask __hwloc_attribute_unused,
					       unsigned long maxnode __hwloc_attribute_unused)
{
#if !(defined __NR_set_mempolicy) || !(defined HWLOC_HAVE_SYSCALL)
  /* nothing we can call on this platform */
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_set_mempolicy, mode, nodemask, maxnode);
#endif
}
/* Fallback syscall numbers for get_mempolicy, used only when the system
 * headers do not define __NR_get_mempolicy.
 */
#ifndef __NR_get_mempolicy
# ifdef __i386__
#  define __NR_get_mempolicy 275
# elif defined(__x86_64__)
#  define __NR_get_mempolicy 238
# elif defined(__ia64__)
#  define __NR_get_mempolicy 1260
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#  define __NR_get_mempolicy 260
# elif defined(__sparc__)
#  define __NR_get_mempolicy 304
# endif
#endif

/* Direct invocation of the get_mempolicy system call.
 * Returns whatever syscall() returns, or -1 with errno set to ENOSYS
 * when the syscall number or syscall() is not available at build time.
 */
static __hwloc_inline long hwloc_get_mempolicy(int *mode __hwloc_attribute_unused,
					       const unsigned long *nodemask __hwloc_attribute_unused,
					       unsigned long maxnode __hwloc_attribute_unused,
					       void *addr __hwloc_attribute_unused,
					       int flags __hwloc_attribute_unused)
{
#if !(defined __NR_get_mempolicy) || !(defined HWLOC_HAVE_SYSCALL)
  /* nothing we can call on this platform */
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags);
#endif
}
/* Fallback syscall numbers for migrate_pages, used only when the system
 * headers do not define __NR_migrate_pages.
 */
#ifndef __NR_migrate_pages
# ifdef __i386__
/* 294 in the i386 syscall table (204 is setregid32 there, not migrate_pages) */
#  define __NR_migrate_pages 294
# elif defined(__x86_64__)
#  define __NR_migrate_pages 256
# elif defined(__ia64__)
#  define __NR_migrate_pages 1280
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#  define __NR_migrate_pages 258
# elif defined(__sparc__)
#  define __NR_migrate_pages 302
# endif
#endif

/* Direct invocation of the migrate_pages system call.
 * Returns whatever syscall() returns, or -1 with errno set to ENOSYS
 * when the syscall number or syscall() is not available at build time.
 */
static __hwloc_inline long hwloc_migrate_pages(int pid __hwloc_attribute_unused,
					       unsigned long maxnode __hwloc_attribute_unused,
					       const unsigned long *oldnodes __hwloc_attribute_unused,
					       const unsigned long *newnodes __hwloc_attribute_unused)
{
#if (defined __NR_migrate_pages) && (defined HWLOC_HAVE_SYSCALL)
  return syscall(__NR_migrate_pages, pid, maxnode, oldnodes, newnodes);
#else
  errno = ENOSYS;
  return -1;
#endif
}
/* Fallback syscall numbers for move_pages, used only when the system
 * headers do not define __NR_move_pages.
 */
#ifndef __NR_move_pages
# ifdef __i386__
#  define __NR_move_pages 317
# elif defined(__x86_64__)
#  define __NR_move_pages 279
# elif defined(__ia64__)
#  define __NR_move_pages 1276
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#  define __NR_move_pages 301
# elif defined(__sparc__)
#  define __NR_move_pages 307
# endif
#endif

/* Direct invocation of the move_pages system call.
 * Returns whatever syscall() returns, or -1 with errno set to ENOSYS
 * when the syscall number or syscall() is not available at build time.
 */
static __hwloc_inline long hwloc_move_pages(int pid __hwloc_attribute_unused,
					    unsigned long count __hwloc_attribute_unused,
					    void **pages __hwloc_attribute_unused,
					    const int *nodes __hwloc_attribute_unused,
					    int *status __hwloc_attribute_unused,
					    int flags __hwloc_attribute_unused)
{
#if !(defined __NR_move_pages) || !(defined HWLOC_HAVE_SYSCALL)
  /* nothing we can call on this platform */
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_move_pages, pid, count, pages, nodes, status, flags);
#endif
}
/* Added for ntohl() */
#include <arpa/inet.h>
#ifdef HAVE_OPENAT
/* Use our own filesystem functions if we have openat */
/* Validate fsroot_fd and turn path into a path relative to it.
 * Returns a pointer inside path, past all its leading '/' characters,
 * or NULL with errno set to EBADF if fsroot_fd is negative.
 */
static const char *
hwloc_checkat(const char *path, int fsroot_fd)
{
  const char *cursor = path;

  /* a negative descriptor cannot designate a root directory */
  if (fsroot_fd < 0) {
    errno = EBADF;
    return NULL;
  }

  /* drop every leading slash */
  while (*cursor == '/')
    cursor++;

  return cursor;
}
/* Open path read-only, relative to the directory designated by fsroot_fd.
 * Returns the new file descriptor, or -1 on error.
 */
static int
hwloc_openat(const char *path, int fsroot_fd)
{
  const char *relpath = hwloc_checkat(path, fsroot_fd);

  /* hwloc_checkat() already set errno if it rejected fsroot_fd */
  return relpath ? openat (fsroot_fd, relpath, O_RDONLY) : -1;
}
/* fopen()-like helper opening path relative to the directory designated
 * by fsroot_fd. Only mode "r" is supported, anything else fails with ENOTSUP.
 * Returns the new stream, or NULL on error with errno set.
 */
static FILE *
hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
{
  FILE *file;
  int fd;

  if (strcmp(mode, "r")) {
    errno = ENOTSUP;
    return NULL;
  }

  fd = hwloc_openat (path, fsroot_fd);
  if (fd == -1)
    return NULL;

  file = fdopen(fd, mode);
  if (!file) {
    /* fdopen() doesn't close the descriptor on failure, do it ourselves
     * without losing the errno that explains the failure.
     */
    int saved_errno = errno;
    close(fd);
    errno = saved_errno;
  }
  return file;
}
/* access()-like helper checking path relative to the directory designated
 * by fsroot_fd. Returns the faccessat() result, or -1 if fsroot_fd is invalid.
 */
static int
hwloc_accessat(const char *path, int mode, int fsroot_fd)
{
  const char *relpath = hwloc_checkat(path, fsroot_fd);

  return relpath ? faccessat(fsroot_fd, relpath, mode, 0) : -1;
}
/* stat()-like helper querying path relative to the directory designated
 * by fsroot_fd, with flags passed as is to fstatat().
 * Returns the fstatat() result, or -1 if fsroot_fd is invalid.
 */
static int
hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
{
  const char *relpath = hwloc_checkat(path, fsroot_fd);

  return relpath ? fstatat(fsroot_fd, relpath, st, flags) : -1;
}
/* opendir()-like helper opening the directory path relative to the
 * directory designated by fsroot_fd.
 * Returns the new directory stream, or NULL on error with errno set.
 */
static DIR*
hwloc_opendirat(const char *path, int fsroot_fd)
{
  DIR *dir;
  int dir_fd;
  const char *relative_path;

  relative_path = hwloc_checkat(path, fsroot_fd);
  if (!relative_path)
    return NULL;

  dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
  if (dir_fd < 0)
    return NULL;

  dir = fdopendir(dir_fd);
  if (!dir) {
    /* the stream didn't take ownership of the descriptor, close it
     * ourselves without losing the errno that explains the failure.
     */
    int saved_errno = errno;
    close(dir_fd);
    errno = saved_errno;
  }
  return dir;
}
/* readlink()-like helper reading the link path relative to the directory
 * designated by fsroot_fd.
 * Returns the readlinkat() result, or -1 if fsroot_fd is invalid.
 */
static int
hwloc_readlinkat(const char *path, char *buf, size_t buflen, int fsroot_fd)
{
  const char *relpath = hwloc_checkat(path, fsroot_fd);

  return relpath ? readlinkat(fsroot_fd, relpath, buf, buflen) : -1;
}
#endif /* HAVE_OPENAT */
/* Read-only open() of p. When openat() is available, p is looked up below
 * the root directory descriptor d; otherwise d is ignored.
 * A static inline function (rather than a macro) keeps compiler parameter checking.
 */
static __hwloc_inline int
hwloc_open(const char *p, int d __hwloc_attribute_unused)
{
#ifndef HAVE_OPENAT
  return open(p, O_RDONLY);
#else
  return hwloc_openat(p, d);
#endif
}
/* fopen() of p with mode m. When openat() is available, p is looked up
 * below the root directory descriptor d; otherwise d is ignored.
 */
static __hwloc_inline FILE *
hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
{
#ifndef HAVE_OPENAT
  return fopen(p, m);
#else
  return hwloc_fopenat(p, m, d);
#endif
}
/* access() check of p with mode m. When openat() is available, p is looked
 * up below the root directory descriptor d; otherwise d is ignored.
 * A static inline function (rather than a macro) keeps compiler parameter checking.
 */
static __hwloc_inline int
hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
{
#ifndef HAVE_OPENAT
  return access(p, m);
#else
  return hwloc_accessat(p, m, d);
#endif
}
/* stat() of p into st. When openat() is available, p is looked up
 * below the root directory descriptor d; otherwise d is ignored.
 */
static __hwloc_inline int
hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
{
#ifndef HAVE_OPENAT
  return stat(p, st);
#else
  return hwloc_fstatat(p, st, 0, d);
#endif
}
/* lstat() of p into st (a symbolic link is not followed). When openat()
 * is available, p is looked up below the root directory descriptor d;
 * otherwise d is ignored.
 */
static __hwloc_inline int
hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
{
#ifndef HAVE_OPENAT
  return lstat(p, st);
#else
  return hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
#endif
}
/* opendir() of p. When openat() is available, p is looked up below the
 * root directory descriptor d; otherwise d is ignored.
 * A static inline function (rather than a macro) keeps compiler parameter checking.
 */
static __hwloc_inline DIR *
hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
{
#ifndef HAVE_OPENAT
  return opendir(p);
#else
  return hwloc_opendirat(p, d);
#endif
}
/* readlink() of p into the buffer l of size ll. When openat() is available,
 * p is looked up below the root directory descriptor d; otherwise d is ignored.
 */
static __hwloc_inline int
hwloc_readlink(const char *p, char *l, size_t ll, int d __hwloc_attribute_unused)
{
#ifndef HAVE_OPENAT
  return readlink(p, l, ll);
#else
  return hwloc_readlinkat(p, l, ll, d);
#endif
}
/*****************************************
******* Helpers for reading files *******
*****************************************/
/* Read at most length-1 bytes from the beginning of the file at path
 * (looked up through fsroot_fd) into string, and \0-terminate it.
 * Returns 0 on success, -1 if the file cannot be opened or if nothing
 * could be read from it.
 */
static __hwloc_inline int
hwloc_read_path_by_length(const char *path, char *string, size_t length, int fsroot_fd)
{
  int nread;
  int fd = hwloc_open(path, fsroot_fd);

  if (fd < 0)
    return -1;

  /* keep one byte for the ending \0 */
  nread = read(fd, string, length-1);
  close(fd);

  if (nread > 0) {
    string[nread] = 0;
    return 0;
  }

  /* read error or empty file */
  return -1;
}
/* Read the file at path (looked up through fsroot_fd) and store its contents,
 * converted with atoi(), in *value.
 * Returns 0 on success, -1 if the file cannot be read (*value is untouched).
 */
static __hwloc_inline int
hwloc_read_path_as_int(const char *path, int *value, int fsroot_fd)
{
  /* Room for the sign, all digits of any int (at most 3 per byte) and the
   * ending \0. The former 11-byte buffer could only hold 10 characters and
   * truncated 11-character values such as "-2147483648".
   */
  char string[3*sizeof(int)+2];

  if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
    return -1;

  *value = atoi(string);
  return 0;
}
/* Read the file at path (looked up through fsroot_fd) and store its contents,
 * converted as a base-10 unsigned number, in *value.
 * Returns 0 on success, -1 if the file cannot be read (*value is untouched).
 */
static __hwloc_inline int
hwloc_read_path_as_uint(const char *path, unsigned *value, int fsroot_fd)
{
  char digits[11];
  unsigned long parsed;

  if (hwloc_read_path_by_length(path, digits, sizeof(digits), fsroot_fd) < 0)
    return -1;

  parsed = strtoul(digits, NULL, 10);
  *value = (unsigned) parsed;
  return 0;
}
/* Read everything from fd and save it into a newly allocated buffer
 * returned in bufferp. Use sizep as a default buffer size, and return
 * the size finally used for the buffer in sizep.
 *
 * The returned buffer is always \0-terminated and must be freed by the caller.
 * Returns 0 on success, or -1 if an allocation or a read() failed
 * (nothing is returned in bufferp and sizep in that case).
 */
static __hwloc_inline int
hwloc__read_fd(int fd, char **bufferp, size_t *sizep)
{
  char *buffer;
  size_t toread, filesize, totalread;
  ssize_t ret;

  toread = filesize = *sizep;
  if (!filesize)
    /* A zero hint cannot be doubled: the extension loop below would read
     * 0 bytes again and again. Start from the smallest usable size instead.
     */
    toread = filesize = 1;

  /* Alloc and read +1 so that we get EOF on 2^n without reading once more */
  buffer = malloc(filesize+1);
  if (!buffer)
    return -1;

  ret = read(fd, buffer, toread+1);
  if (ret < 0) {
    free(buffer);
    return -1;
  }

  totalread = (size_t) ret;

  if (totalread < toread + 1)
    /* Normal case, a single read got EOF */
    goto done;

  /* Unexpected case, must extend the buffer and read again.
   * Only occurs on first invocation and if the kernel ever uses multiple page for a single mask.
   */
  do {
    char *tmp;

    /* the buffer currently contains filesize+1 bytes, double its capacity
     * and try to fill the new half
     */
    toread = filesize;
    filesize *= 2;

    tmp = realloc(buffer, filesize+1);
    if (!tmp) {
      free(buffer);
      return -1;
    }
    buffer = tmp;

    ret = read(fd, buffer+toread+1, toread);
    if (ret < 0) {
      free(buffer);
      return -1;
    }

    totalread += ret;
  } while ((size_t) ret == toread); /* a full read means there may be more */

 done:
  buffer[totalread] = '\0';
  *bufferp = buffer;
  *sizep = filesize;
  return 0;
}
/* kernel cpumaps are composed of an array of 32bits cpumasks */
#define KERNEL_CPU_MASK_BITS 32
#define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
/* Read the whole contents of fd and parse it as a list of comma-separated
 * hexadecimal masks, the last one in the file describing the lowest bits.
 * set is zeroed and then filled with the result.
 * Returns 0 on success, -1 if reading or allocating failed.
 *
 * The buffer and array sizes that were needed are remembered in static
 * variables so that later invocations directly allocate the right size.
 */
static __hwloc_inline int
hwloc__read_fd_as_cpumask(int fd, hwloc_bitmap_t set)
{
static size_t _filesize = 0; /* will be dynamically initialized to hwloc_get_pagesize(), and increased later if needed */
size_t filesize;
unsigned long *maps;
unsigned long map;
int nr_maps = 0;
static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
* Actually, it may increase multiple times if first read cpumaps start with zeroes.
*/
int nr_maps_allocated = _nr_maps_allocated;
char *buffer, *tmpbuf;
int i;
/* Kernel sysfs files are usually at most one page. 4kB may contain 455 32-bit
* masks (followed by comma), enough for 14k PUs. So allocate a page by default for now.
*
* If we ever need a larger buffer, we'll realloc() the buffer during the first
* invocation of this function so that others directly allocate the right size
* (all cpumask files have the exact same size).
*/
filesize = _filesize;
if (!filesize)
filesize = hwloc_getpagesize();
if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
return -1;
/* Only update the static value with the final one,
* to avoid sharing intermediate values that we modify,
* in case there's ever multiple concurrent calls.
*/
_filesize = filesize;
maps = malloc(nr_maps_allocated * sizeof(*maps));
if (!maps) {
free(buffer);
return -1;
}
/* reset to zero first */
hwloc_bitmap_zero(set);
/* parse the whole mask */
tmpbuf = buffer;
while (sscanf(tmpbuf, "%lx", &map) == 1) {
/* read one kernel cpu mask and the ending comma */
if (nr_maps == nr_maps_allocated) {
/* the array is full, double it before possibly storing this mask */
unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
if (!tmp) {
free(buffer);
free(maps);
return -1;
}
maps = tmp;
nr_maps_allocated *= 2;
}
tmpbuf = strchr(tmpbuf, ',');
if (!tmpbuf) {
/* no more comma, this was the last mask of the file */
maps[nr_maps++] = map;
break;
} else
tmpbuf++;
if (!map && !nr_maps)
/* ignore the first map if it's empty */
continue;
maps[nr_maps++] = map;
}
free(buffer);
/* convert into a set */
#if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
/* one kernel mask per bitmap ulong, stored in reverse order */
for(i=0; i<nr_maps; i++)
hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
#else
/* combine two kernel masks into each bitmap ulong, starting from the end of the array;
 * the upper half stays empty when there is no second mask left
 */
for(i=0; i<(nr_maps+1)/2; i++) {
unsigned long mask;
mask = maps[nr_maps-2*i-1];
if (2*i+1<nr_maps)
mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
hwloc_bitmap_set_ith_ulong(set, i, mask);
}
#endif
free(maps);
/* Only update the static value with the final one,
* to avoid sharing intermediate values that we modify,
* in case there's ever multiple concurrent calls.
*/
if (nr_maps_allocated > _nr_maps_allocated)
_nr_maps_allocated = nr_maps_allocated;
return 0;
}
/* Open maskpath (looked up through fsroot_fd) and parse its contents as a
 * kernel cpumask into set.
 * Returns -1 if the file cannot be opened, or the parsing result otherwise.
 */
static __hwloc_inline int
hwloc__read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set, int fsroot_fd)
{
  int status;
  int maskfd = hwloc_open(maskpath, fsroot_fd);

  if (maskfd >= 0) {
    status = hwloc__read_fd_as_cpumask(maskfd, set);
    close(maskfd);
    return status;
  }

  return -1;
}
/* Allocate a bitmap and fill it with the kernel cpumask found in maskpath
 * (looked up through fsroot_fd).
 * Returns the new bitmap, or NULL if the allocation or the reading failed.
 */
static __hwloc_inline hwloc_bitmap_t
hwloc__alloc_read_path_as_cpumask(const char *maskpath, int fsroot_fd)
{
  hwloc_bitmap_t result = hwloc_bitmap_alloc();

  if (!result)
    return NULL;

  if (hwloc__read_path_as_cpumask(maskpath, result, fsroot_fd) >= 0)
    return result;

  /* reading failed, don't return a half-initialized bitmap */
  hwloc_bitmap_free(result);
  return NULL;
}
/* Open maskpath with a plain open() (no root directory descriptor involved)
 * and parse its contents as a kernel cpumask into set.
 * Returns -1 if the file cannot be opened, or the parsing result otherwise.
 */
int
hwloc_linux_read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set)
{
  int status;
  int maskfd = open(maskpath, O_RDONLY);

  if (maskfd >= 0) {
    status = hwloc__read_fd_as_cpumask(maskfd, set);
    close(maskfd);
    return status;
  }

  return -1;
}
/* Read the whole contents of fd and parse it as a comma-separated list of
 * segments, each being either a single index or a "first-last" range.
 * Everything outside of these segments is cleared from set, nothing is ever
 * added to it, hence set must be full on input.
 * Returns 0 on success, -1 if reading failed (set is untouched then).
 */
static __hwloc_inline int
hwloc__read_fd_as_cpulist(int fd, hwloc_bitmap_t set)
{
/* Kernel sysfs files are usually at most one page.
* But cpulists can be of very different sizes depending on the fragmentation,
* so don't bother remember the actual read size between invocations.
* We don't have many invocations anyway.
*/
size_t filesize = hwloc_getpagesize();
char *buffer, *current, *comma, *tmp;
int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
return -1;
current = buffer;
prevlast = -1;
while (1) {
/* save a pointer to the next comma and erase it to simplify things */
comma = strchr(current, ',');
if (comma)
*comma = '\0';
/* find current enabled-segment bounds */
/* NOTE(review): a segment without any digit (e.g. an empty file) is parsed as 0 by strtoul(),
 * which leaves bit 0 set -- confirm this is intended
 */
nextfirst = strtoul(current, &tmp, 0);
if (*tmp == '-')
nextlast = strtoul(tmp+1, NULL, 0);
else
nextlast = nextfirst;
/* clear the gap between the previous segment and this one, if any */
if (prevlast+1 <= nextfirst-1)
hwloc_bitmap_clr_range(set, prevlast+1, nextfirst-1);
/* switch to next enabled-segment */
prevlast = nextlast;
if (!comma)
break;
current = comma+1;
}
/* clear from the end of the last segment, with -1 as the upper bound */
hwloc_bitmap_clr_range(set, prevlast+1, -1);
free(buffer);
return 0;
}
/*****************************
******* CpuBind Hooks *******
*****************************/
/* Bind the thread tid to the CPUs listed in hwloc_set.
 * Returns the sched_setaffinity() result. Fails with -1 and errno set to
 * EINVAL if hwloc_set has no last bit (empty or infinite), ENOMEM if the
 * temporary CPU set cannot be allocated, or ENOSYS if binding is not
 * supported by this build. These last checks only apply to the dynamic
 * CPU set variant.
 */
int
hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
{
  /* The resulting binding is always strict */

#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
  /* dynamically-sized CPU set, large enough for the last bit of hwloc_set */
  cpu_set_t *plinux_set;
  unsigned cpu;
  int last;
  size_t setsize;
  int err;

  last = hwloc_bitmap_last(hwloc_set);
  if (last == -1) {
    errno = EINVAL;
    return -1;
  }

  setsize = CPU_ALLOC_SIZE(last+1);
  plinux_set = CPU_ALLOC(last+1);
  if (!plinux_set) {
    /* don't let CPU_ZERO_S() below dereference a failed allocation */
    errno = ENOMEM;
    return -1;
  }

  CPU_ZERO_S(setsize, plinux_set);
  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
    CPU_SET_S(cpu, setsize, plinux_set);
  hwloc_bitmap_foreach_end();

  err = sched_setaffinity(tid, setsize, plinux_set);

  CPU_FREE(plinux_set);
  return err;
#elif defined(HWLOC_HAVE_CPU_SET)
  /* fixed-size CPU set */
  cpu_set_t linux_set;
  unsigned cpu;

  CPU_ZERO(&linux_set);
  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
    CPU_SET(cpu, &linux_set);
  hwloc_bitmap_foreach_end();

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  return sched_setaffinity(tid, &linux_set);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
#elif defined(HWLOC_HAVE_SYSCALL)
  /* no CPU set type at all, only the first ulong of the bitmap is used */
  unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  return sched_setaffinity(tid, (void*) &mask);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
#else /* !SYSCALL */
  errno = ENOSYS;
  return -1;
#endif /* !SYSCALL */
}
#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
/*
 * On some kernels, sched_getaffinity requires the output size to be larger
 * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
 * Try sched_affinity on ourself until we find a nr_cpus value that makes
 * the kernel happy.
 *
 * The value that worked is cached in a static variable and directly returned
 * by later invocations. If memory runs out while probing, the current
 * candidate is returned without being cached.
 */
static int
hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
{
  static int _nr_cpus = -1;
  int nr_cpus = _nr_cpus;
  int fd;

  if (nr_cpus != -1)
    /* already computed */
    return nr_cpus;

  if (topology->levels[0][0]->complete_cpuset)
    /* start with a nr_cpus that may contain the whole topology */
    nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
  if (nr_cpus <= 0)
    /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
    nr_cpus = 1;

  fd = open("/sys/devices/system/cpu/possible", O_RDONLY); /* binding only supported in real fsroot, no need for data->root_fd */
  if (fd >= 0) {
    hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc_full();
    /* this file is only a hint for the starting value, just skip it if the allocation failed */
    if (possible_bitmap) {
      if (hwloc__read_fd_as_cpulist(fd, possible_bitmap) == 0) {
	int max_possible = hwloc_bitmap_last(possible_bitmap);
	hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);
	if (nr_cpus < max_possible + 1)
	  nr_cpus = max_possible + 1;
      }
      hwloc_bitmap_free(possible_bitmap);
    }
    close(fd);
  }

  while (1) {
    cpu_set_t *set = CPU_ALLOC(nr_cpus);
    size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
    int err;

    if (!set)
      /* Out of memory. Passing NULL to the kernel would fail again and again
       * and this loop would never end. Return the current candidate without
       * caching it, the caller's own sched_getaffinity() will report an error
       * if it is too small.
       */
      return nr_cpus;

    err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
    CPU_FREE(set);
    nr_cpus = setsize * 8; /* that's the value that was actually tested */
    if (!err)
      /* Found it. Only update the static value with the final one,
       * to avoid sharing intermediate values that we modify,
       * in case there's ever multiple concurrent calls.
       */
      return _nr_cpus = nr_cpus;
    nr_cpus *= 2;
  }
}
#endif
/* Store the current CPU binding of the thread tid in hwloc_set.
 * Returns 0 on success. Returns -1 if sched_getaffinity() failed, if the
 * temporary CPU set cannot be allocated (errno is ENOMEM), or if getting
 * the binding is not supported by this build (errno is ENOSYS).
 */
int
hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
{
  int err __hwloc_attribute_unused;

#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
  /* dynamically-sized CPU set */
  cpu_set_t *plinux_set;
  unsigned cpu;
  int last;
  size_t setsize;
  int kernel_nr_cpus;

  /* find the kernel nr_cpus so as to use a large enough cpu_set size */
  kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
  setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
  plinux_set = CPU_ALLOC(kernel_nr_cpus);
  if (!plinux_set) {
    /* report the allocation failure instead of handing NULL to the kernel */
    errno = ENOMEM;
    return -1;
  }

  err = sched_getaffinity(tid, setsize, plinux_set);

  if (err < 0) {
    CPU_FREE(plinux_set);
    return -1;
  }

  last = -1;
  if (topology->levels[0][0]->complete_cpuset)
    last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
  if (last == -1)
    /* round the maximal support number, the topology isn't ready yet (complete_cpuset is missing or empty)*/
    last = kernel_nr_cpus-1;

  hwloc_bitmap_zero(hwloc_set);
  for(cpu=0; cpu<=(unsigned) last; cpu++)
    if (CPU_ISSET_S(cpu, setsize, plinux_set))
      hwloc_bitmap_set(hwloc_set, cpu);

  CPU_FREE(plinux_set);
#elif defined(HWLOC_HAVE_CPU_SET)
  /* fixed-size CPU set */
  cpu_set_t linux_set;
  unsigned cpu;

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  err = sched_getaffinity(tid, &linux_set);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  if (err < 0)
    return -1;

  hwloc_bitmap_zero(hwloc_set);
  for(cpu=0; cpu<CPU_SETSIZE; cpu++)
    if (CPU_ISSET(cpu, &linux_set))
      hwloc_bitmap_set(hwloc_set, cpu);
#elif defined(HWLOC_HAVE_SYSCALL)
  /* no CPU set type at all, the binding is returned as a single ulong */
  unsigned long mask;

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  err = sched_getaffinity(tid, (void*) &mask);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  if (err < 0)
    return -1;

  hwloc_bitmap_from_ulong(hwloc_set, mask);
#else /* !SYSCALL */
  errno = ENOSYS;
  return -1;
#endif /* !SYSCALL */

  return 0;
}
/* Build the array of thread ids of a process by listing its task directory
 * in /proc. The directory stream is rewound first, so this may be called
 * several times on the same stream.
 * On success, returns 0 and stores the number of tids in *nr_tidsp and a
 * newly allocated array (to be freed by the caller) in *tidsp.
 * Returns -1 with errno set to ENOMEM if an allocation failed.
 */
static int
hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
{
  struct stat st;
  struct dirent *entry;
  pid_t *list;
  unsigned capacity = 32;
  unsigned count = 0;

  /* the number of links of the directory is a good estimate of the number of tids */
  if (!fstat(dirfd(taskdir), &st))
    capacity = st.st_nlink;

  list = malloc(capacity*sizeof(pid_t));
  if (!list) {
    errno = ENOMEM;
    return -1;
  }

  rewinddir(taskdir);

  for (entry = readdir(taskdir); entry != NULL; entry = readdir(taskdir)) {
    /* make room for one more tid before even looking at the entry */
    if (count == capacity) {
      pid_t *grown;
      capacity += 8;
      grown = realloc(list, capacity*sizeof(pid_t));
      if (!grown) {
	free(list);
	errno = ENOMEM;
	return -1;
      }
      list = grown;
    }

    /* every entry except "." and ".." is named after a tid */
    if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, ".."))
      list[count++] = atoi(entry->d_name);
  }

  *nr_tidsp = count;
  *tidsp = list;
  return 0;
}
/* Per-tid callbacks.
 * Invoked with the topology, the thread id, the opaque data given to
 * hwloc_linux_foreach_proc_tid() and the index of the thread in the current list.
 * A negative return value marks the thread as failed, with errno being saved by the caller.
 */
typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);
/* Apply the callback cb to all threads of process pid (or of the current
 * process if pid is 0), as listed in its /proc task directory.
 *
 * The list of threads is read again after each pass. The pass is restarted
 * if the list changed in the meantime or if the callback failed for some
 * threads only, and the whole operation gives up with EAGAIN when retrying
 * too many times.
 *
 * Returns 0 on success. Returns -1 if the task directory cannot be opened
 * (ENOENT being reported as EINVAL), if listing the threads failed, if
 * retries were exhausted, or if the callback failed for all threads
 * (errno is then the one of the last failed callback).
 */
static int
hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
void *data)
{
char taskdir_path[128];
DIR *taskdir;
pid_t *tids, *newtids;
unsigned i, nr, newnr, failed = 0, failed_errno = 0;
unsigned retrynr = 0;
int err;
if (pid)
snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
else
snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");
taskdir = opendir(taskdir_path);
if (!taskdir) {
/* a missing directory is reported as an invalid argument */
if (errno == ENOENT)
errno = EINVAL;
err = -1;
goto out;
}
/* read the current list of threads */
err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
if (err < 0)
goto out_with_dir;
retry:
/* apply the callback to all threads */
failed=0;
for(i=0; i<nr; i++) {
err = cb(topology, tids[i], data, i);
if (err < 0) {
/* keep going, but remember how many threads failed and why the last one did */
failed++;
failed_errno = errno;
}
}
/* re-read the list of thread */
err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
if (err < 0)
goto out_with_tids;
/* retry if the list changed in the meantime, or we failed for *some* threads only.
* if we're really unlucky, all threads changed but we got the same set of tids. no way to support this.
*/
if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
/* the new list replaces the old one, and is the one released at out_with_tids */
free(tids);
tids = newtids;
nr = newnr;
if (++retrynr > 10) {
/* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
errno = EAGAIN;
err = -1;
goto out_with_tids;
}
goto retry;
} else {
free(newtids);
}
/* if all threads failed, return the last errno. */
if (failed) {
err = -1;
errno = failed_errno;
goto out_with_tids;
}
err = 0;
out_with_tids:
free(tids);
out_with_dir:
closedir(taskdir);
out:
return err;
}
/* Per-tid callback binding one thread of a process.
 * The callback data is the hwloc bitmap to bind to, idx is unused.
 * Returns the result of hwloc_linux_set_tid_cpubind().
 */
static int
hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
{
  hwloc_bitmap_t wanted = (hwloc_bitmap_t) data;

  return hwloc_linux_set_tid_cpubind(topology, tid, wanted);
}
/* Bind all threads of process pid to hwloc_set by applying the per-tid
 * binding callback to each of them. flags is unused.
 * Returns the result of hwloc_linux_foreach_proc_tid().
 */
static int
hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  hwloc_linux_foreach_proc_tid_cb_t cb = hwloc_linux_foreach_proc_tid_set_cpubind_cb;
  void *cbdata = (void*) hwloc_set;

  return hwloc_linux_foreach_proc_tid(topology, pid, cb, cbdata);
}
/* Per-tid proc_get_cpubind callback data, callback function and caller */
struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
hwloc_bitmap_t cpuset; /* binding accumulated over all threads, reset by the callback on the first thread */
hwloc_bitmap_t tidset; /* temporary bitmap receiving the binding of the thread being processed */
int flags; /* binding flags, the callback only looks at HWLOC_CPUBIND_STRICT */
};
/* Per-tid callback accumulating the binding of one thread of a process
 * in the cpuset of the callback data.
 * With HWLOC_CPUBIND_STRICT, all threads must have the same binding as the
 * first one, otherwise -1 is returned with errno set to EXDEV.
 * Without it, the bindings of all threads are OR'ed together.
 * Returns 0 on success, -1 if the binding of the thread cannot be retrieved
 * or differs in the strict case.
 */
static int
hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
{
  struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *cbdata = _data;
  hwloc_bitmap_t accumulated = cbdata->cpuset;
  hwloc_bitmap_t current = cbdata->tidset;
  int strict = cbdata->flags & HWLOC_CPUBIND_STRICT;
  int is_first = !idx;

  if (hwloc_linux_get_tid_cpubind(topology, tid, current) != 0)
    return -1;

  /* start from an empty result when a new pass over the threads begins */
  if (is_first)
    hwloc_bitmap_zero(accumulated);

  if (!strict) {
    /* not STRICT, the result is the union of all thread bindings */
    hwloc_bitmap_or(accumulated, accumulated, current);
    return 0;
  }

  /* STRICT, the first thread defines the binding that all others must match */
  if (is_first) {
    hwloc_bitmap_copy(accumulated, current);
    return 0;
  }

  if (!hwloc_bitmap_isequal(accumulated, current)) {
    errno = EXDEV;
    return -1;
  }

  return 0;
}
/* Retrieve the binding of process pid in hwloc_set by combining the bindings
 * of all its threads (see the per-tid callback for how flags changes this).
 * Returns the result of hwloc_linux_foreach_proc_tid(), or -1 with errno
 * set to ENOMEM if the temporary bitmap cannot be allocated.
 */
static int
hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
{
  struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s data;
  hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
  int ret;

  if (!tidset) {
    /* the callback cannot work without its temporary bitmap */
    errno = ENOMEM;
    return -1;
  }

  data.cpuset = hwloc_set;
  data.tidset = tidset;
  data.flags = flags;
  ret = hwloc_linux_foreach_proc_tid(topology, pid,
				     hwloc_linux_foreach_proc_tid_get_cpubind_cb,
				     (void*) &data);
  hwloc_bitmap_free(tidset);
  return ret;
}
/* Bind process pid (or the process of the topology if pid is 0) to hwloc_set.
 * With HWLOC_CPUBIND_THREAD, pid is considered a thread id and only this
 * thread is bound, otherwise all threads of the process are.
 */
static int
hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
{
  pid_t target = pid ? pid : topology->pid;

  if (!(flags & HWLOC_CPUBIND_THREAD))
    return hwloc_linux_set_pid_cpubind(topology, target, hwloc_set, flags);

  return hwloc_linux_set_tid_cpubind(topology, target, hwloc_set);
}
/* Retrieve the binding of process pid (or of the process of the topology
 * if pid is 0) in hwloc_set.
 * With HWLOC_CPUBIND_THREAD, pid is considered a thread id and only this
 * thread is queried, otherwise all threads of the process are.
 */
static int
hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
{
  pid_t target = pid ? pid : topology->pid;

  if (!(flags & HWLOC_CPUBIND_THREAD))
    return hwloc_linux_get_pid_cpubind(topology, target, hwloc_set, flags);

  return hwloc_linux_get_tid_cpubind(topology, target, hwloc_set);
}
/* Bind all threads of the process of the topology (topology->pid) to hwloc_set. */
static int
hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
{
  pid_t self = topology->pid;

  return hwloc_linux_set_pid_cpubind(topology, self, hwloc_set, flags);
}
/* Retrieve the binding of the process of the topology (topology->pid) in hwloc_set. */
static int
hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
{
  pid_t self = topology->pid;

  return hwloc_linux_get_pid_cpubind(topology, self, hwloc_set, flags);
}
/* Bind the calling thread to hwloc_set. flags is unused.
 * Fails with ENOSYS when the topology has a non-zero pid.
 */
static int
hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  if (!topology->pid)
    /* tid 0 designates the calling thread */
    return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);

  errno = ENOSYS;
  return -1;
}
/* Retrieve the binding of the calling thread in hwloc_set. flags is unused.
 * Fails with ENOSYS when the topology has a non-zero pid.
 */
static int
hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  if (!topology->pid)
    /* tid 0 designates the calling thread */
    return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);

  errno = ENOSYS;
  return -1;
}
#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
#pragma weak pthread_setaffinity_np
#pragma weak pthread_self
/* Bind the pthread tid to the CPUs listed in hwloc_set. flags is unused.
 *
 * pthread_self and pthread_setaffinity_np are weak symbols: if the
 * application isn't linked against libpthread, they are NULL and ENOSYS is
 * returned. When tid is the calling thread, the tid-based binding is used
 * instead of pthread_setaffinity_np().
 *
 * Returns 0 on success, -1 on error with errno set: ENOSYS if the topology
 * has a non-zero pid or if pthread support is missing, EINVAL if hwloc_set
 * has no last bit (empty or infinite), ENOMEM if the temporary CPU set
 * cannot be allocated, or the error returned by pthread_setaffinity_np().
 */
static int
hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  int err;

  if (topology->pid) {
    errno = ENOSYS;
    return -1;
  }

  if (!pthread_self) {
    /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
    errno = ENOSYS;
    return -1;
  }
  if (tid == pthread_self())
    return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);

  if (!pthread_setaffinity_np) {
    errno = ENOSYS;
    return -1;
  }

#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
  /* Use a separate block so that we can define specific variable
     types here */
  {
    cpu_set_t *plinux_set;
    unsigned cpu;
    int last;
    size_t setsize;

    last = hwloc_bitmap_last(hwloc_set);
    if (last == -1) {
      errno = EINVAL;
      return -1;
    }

    setsize = CPU_ALLOC_SIZE(last+1);
    plinux_set = CPU_ALLOC(last+1);
    if (!plinux_set) {
      /* don't let CPU_ZERO_S() below dereference a failed allocation */
      errno = ENOMEM;
      return -1;
    }

    CPU_ZERO_S(setsize, plinux_set);
    hwloc_bitmap_foreach_begin(cpu, hwloc_set)
      CPU_SET_S(cpu, setsize, plinux_set);
    hwloc_bitmap_foreach_end();

    err = pthread_setaffinity_np(tid, setsize, plinux_set);

    CPU_FREE(plinux_set);
  }
#elif defined(HWLOC_HAVE_CPU_SET)
  /* Use a separate block so that we can define specific variable
     types here */
  {
    cpu_set_t linux_set;
    unsigned cpu;

    CPU_ZERO(&linux_set);
    hwloc_bitmap_foreach_begin(cpu, hwloc_set)
      CPU_SET(cpu, &linux_set);
    hwloc_bitmap_foreach_end();

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
    err = pthread_setaffinity_np(tid, &linux_set);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
    err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  }
#else /* CPU_SET */
  /* Use a separate block so that we can define specific variable
     types here */
  {
    unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
    err = pthread_setaffinity_np(tid, (void*) &mask);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
    err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  }
#endif /* CPU_SET */

  /* pthread functions return the error code instead of setting errno */
  if (err) {
    errno = err;
    return -1;
  }
  return 0;
}
#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
#pragma weak pthread_getaffinity_np
#pragma weak pthread_self
/* get_thread_cpubind hook: store in hwloc_set the CPU binding of the thread
 * designated by the pthread handle tid.
 *
 * Returns 0 on success, -1 with errno set otherwise:
 * - ENOSYS if topology->pid is non-zero, or if the weak pthread_self or
 *   pthread_getaffinity_np symbols are not available at runtime;
 * - ENOMEM if the dynamic cpu_set_t cannot be allocated;
 * - otherwise the error code returned by pthread_getaffinity_np().
 *
 * When tid is the calling thread, the work is delegated to
 * hwloc_linux_get_tid_cpubind() with tid 0.
 */
static int
hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  int err;

  if (topology->pid) {
    errno = ENOSYS;
    return -1;
  }

  if (!pthread_self) {
    /* ?! Application uses get_thread_cpubind, but doesn't link against libpthread ?! */
    errno = ENOSYS;
    return -1;
  }
  if (tid == pthread_self())
    return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);

  if (!pthread_getaffinity_np) {
    errno = ENOSYS;
    return -1;
  }

#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
  /* Use a separate block so that we can define specific variable
     types here */
  {
    cpu_set_t *plinux_set;
    unsigned cpu;
    int last;
    size_t setsize;

    /* size the kernel set after the last PU of the root object's complete cpuset */
    last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
    assert (last != -1);

    setsize = CPU_ALLOC_SIZE(last+1);
    plinux_set = CPU_ALLOC(last+1);
    if (!plinux_set) {
      /* CPU_ALLOC() may fail, do not hand a NULL set to pthread_getaffinity_np() */
      errno = ENOMEM;
      return -1;
    }

    err = pthread_getaffinity_np(tid, setsize, plinux_set);
    if (err) {
      CPU_FREE(plinux_set);
      errno = err;
      return -1;
    }

    hwloc_bitmap_zero(hwloc_set);
    for(cpu=0; cpu<=(unsigned) last; cpu++)
      if (CPU_ISSET_S(cpu, setsize, plinux_set))
	hwloc_bitmap_set(hwloc_set, cpu);

    CPU_FREE(plinux_set);
  }
#elif defined(HWLOC_HAVE_CPU_SET)
  /* Use a separate block so that we can define specific variable
     types here */
  {
    cpu_set_t linux_set;
    unsigned cpu;

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
    err = pthread_getaffinity_np(tid, &linux_set);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
    err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
    if (err) {
      errno = err;
      return -1;
    }

    hwloc_bitmap_zero(hwloc_set);
    for(cpu=0; cpu<CPU_SETSIZE; cpu++)
      if (CPU_ISSET(cpu, &linux_set))
	hwloc_bitmap_set(hwloc_set, cpu);
  }
#else /* CPU_SET */
  /* Use a separate block so that we can define specific variable
     types here */
  {
    unsigned long mask;

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
    err = pthread_getaffinity_np(tid, (void*) &mask);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
    err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
    if (err) {
      errno = err;
      return -1;
    }

    hwloc_bitmap_from_ulong(hwloc_set, mask);
  }
#endif /* CPU_SET */

  return 0;
}
#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
/* Store in set the single CPU found in /proc/<tid>/stat for thread tid.
 * A zero tid is replaced with the result of the gettid syscall when
 * SYS_gettid is defined (ENOSYS otherwise).
 * Returns 0 on success; every failure returns -1 with errno set to ENOSYS.
 */
int
hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
{
  /* The second field of /proc/<tid>/stat is the command name between
   * parentheses, and that name may itself contain parentheses.
   * So read the whole line and look for the last closing parenthesis
   * to locate the third field.
   */
  char statline[1024] = "";
  char statpath[64];
  char *cursor;
  int fd, nread, field, lastcpu;

  /* TODO: find a way to use sched_getcpu().
   * either compare tid with gettid() in all callbacks.
   * or pass gettid() in the callback data.
   */

  if (!tid) {
#ifdef SYS_gettid
    tid = syscall(SYS_gettid);
#else
    errno = ENOSYS;
    return -1;
#endif
  }

  snprintf(statpath, sizeof(statpath), "/proc/%lu/stat", (unsigned long) tid);
  fd = open(statpath, O_RDONLY); /* no fsroot for real /proc */
  if (fd < 0)
    goto failed;

  nread = read(fd, statline, sizeof(statline)-1); /* keep one byte for the ending \0 */
  close(fd);
  if (nread <= 0)
    goto failed;
  /* terminate the string, overwriting the last byte that was read */
  statline[nread-1] = '\0';

  cursor = strrchr(statline, ')');
  if (!cursor)
    goto failed;
  /* step over ") " so that cursor sits on the third field */
  cursor += 2;

  /* step over 36 space-separated fields, i.e. from the 3rd to the 39th one */
  for(field=0; field<36; field++) {
    cursor = strchr(cursor, ' ');
    if (!cursor)
      goto failed;
    /* move past the separator itself */
    cursor++;
  }

  /* the field reached here is parsed as the last CPU
   * (presumably "processor" in proc(5) -- confirm against the man page) */
  if (sscanf(cursor, "%d ", &lastcpu) != 1)
    goto failed;

  hwloc_bitmap_only(set, lastcpu);
  return 0;

 failed:
  errno = ENOSYS;
  return -1;
}
/* Per-tid proc_get_last_cpu_location callback data, callback function and caller.
 * The structure below is passed as the opaque pointer given to each callback invocation.
 */
struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
/* output set where the callback accumulates (ORs) the location of every thread */
hwloc_bitmap_t cpuset;
/* scratch set receiving the location of the thread currently being visited */
hwloc_bitmap_t tidset;
};
/* Per-thread callback: read the last CPU location of tid into the scratch
 * set of the callback data and merge it into the output set.
 * The output set is cleared when idx is 0.
 * Returns 0 on success, -1 if the location of tid cannot be read.
 */
static int
hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
{
  struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *cbdata = _data;
  hwloc_bitmap_t accumulated = cbdata->cpuset;
  hwloc_bitmap_t scratch = cbdata->tidset;

  if (hwloc_linux_get_tid_last_cpu_location(topology, tid, scratch) != 0)
    return -1;

  /* start from an empty set on the first thread */
  if (idx == 0)
    hwloc_bitmap_zero(accumulated);

  hwloc_bitmap_or(accumulated, accumulated, scratch);
  return 0;
}
/* Store in hwloc_set the union of the last CPU locations of the threads
 * visited by hwloc_linux_foreach_proc_tid() for process pid.
 * Returns the value of hwloc_linux_foreach_proc_tid(), or -1 with errno
 * set to ENOMEM if the temporary per-thread bitmap cannot be allocated.
 */
static int
hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s data;
  hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
  int ret;

  if (!tidset) {
    /* the callback writes into tidset, it cannot run without it */
    errno = ENOMEM;
    return -1;
  }

  data.cpuset = hwloc_set;
  data.tidset = tidset;
  ret = hwloc_linux_foreach_proc_tid(topology, pid,
				     hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
				     &data);
  hwloc_bitmap_free(tidset);
  return ret;
}
/* get_proc_last_cpu_location hook.
 * A zero pid is replaced with topology->pid. With HWLOC_CPUBIND_THREAD
 * the id is handed to the per-tid helper, otherwise to the per-pid helper.
 * Returns whatever the selected helper returns.
 */
static int
hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
{
  const pid_t target = pid ? pid : topology->pid;

  if (!(flags & HWLOC_CPUBIND_THREAD))
    return hwloc_linux_get_pid_last_cpu_location(topology, target, hwloc_set, flags);

  return hwloc_linux_get_tid_last_cpu_location(topology, target, hwloc_set);
}
/* get_thisproc_last_cpu_location hook: forwards to the per-pid helper using topology->pid. */
static int
hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
{
  const pid_t self = topology->pid;

  return hwloc_linux_get_pid_last_cpu_location(topology, self, hwloc_set, flags);
}
/* get_thisthread_last_cpu_location hook.
 * Fails with ENOSYS when topology->pid is non-zero.
 * When sched_getcpu() is declared and returns a non-negative value, that
 * CPU is reported; otherwise the per-tid helper is used with tid 0.
 */
static int
hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  if (topology->pid != 0) {
    errno = ENOSYS;
    return -1;
  }

#if HAVE_DECL_SCHED_GETCPU
  {
    const int current = sched_getcpu();
    if (current >= 0) {
      hwloc_bitmap_only(hwloc_set, current);
      return 0;
    }
    /* sched_getcpu() failed, fall back to the per-tid helper below */
  }
#endif

  return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
}
/***************************
****** Membind hooks ******
***************************/
/* Convert a hwloc membind policy (and flags) into a Linux MPOL_* value
 * stored in *linuxpolicy.
 * HWLOC_MEMBIND_BIND maps to MPOL_BIND with HWLOC_MEMBIND_STRICT and to
 * MPOL_PREFERRED without it.
 * Returns 0 on success; unknown policies return -1 with errno set to ENOSYS
 * and leave *linuxpolicy untouched.
 */
static int
hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
{
  int converted;

  switch (policy) {
  case HWLOC_MEMBIND_DEFAULT:
    converted = MPOL_DEFAULT;
    break;
  case HWLOC_MEMBIND_FIRSTTOUCH:
    converted = MPOL_LOCAL;
    break;
  case HWLOC_MEMBIND_BIND:
    converted = (flags & HWLOC_MEMBIND_STRICT) ? MPOL_BIND : MPOL_PREFERRED;
    break;
  case HWLOC_MEMBIND_INTERLEAVE:
    converted = MPOL_INTERLEAVE;
    break;
  /* TODO: next-touch when (if?) patch applied upstream */
  default:
    errno = ENOSYS;
    return -1;
  }

  *linuxpolicy = converted;
  return 0;
}
/* Build the array of unsigned longs expected by the Linux memory policy
 * syscalls from a hwloc nodeset.
 *
 * A full (infinite) nodeset is replaced with a set containing node 0 only.
 * On success, *max_os_index_p receives the last set index + 1 rounded up to
 * a multiple of HWLOC_BITS_PER_LONG, *linuxmaskp receives a calloc'ed array
 * of that many bits (to be released by the caller with free()), and 0 is
 * returned. On allocation failure, -1 is returned with errno set to ENOMEM
 * and the outputs are not written.
 */
static int
hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
				      hwloc_const_nodeset_t nodeset,
				      unsigned *max_os_index_p, unsigned long **linuxmaskp)
{
  unsigned max_os_index = 0; /* highest os_index + 1 */
  unsigned long *linuxmask;
  unsigned i;
  hwloc_nodeset_t linux_nodeset = NULL;

  if (hwloc_bitmap_isfull(nodeset)) {
    linux_nodeset = hwloc_bitmap_alloc();
    if (!linux_nodeset) {
      /* do not pass a NULL bitmap to hwloc_bitmap_only() */
      errno = ENOMEM;
      return -1;
    }
    hwloc_bitmap_only(linux_nodeset, 0);
    nodeset = linux_nodeset;
  }

  max_os_index = hwloc_bitmap_last(nodeset);
  if (max_os_index == (unsigned) -1)
    max_os_index = 0;
  /* add 1 to convert the last os_index into a max_os_index,
   * and round up to the nearest multiple of BITS_PER_LONG */
  max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);

  linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(*linuxmask));
  if (!linuxmask) {
    hwloc_bitmap_free(linux_nodeset);
    errno = ENOMEM;
    return -1;
  }

  for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
    linuxmask[i] = hwloc_bitmap_to_ith_ulong(nodeset, i);

  /* the temporary set is only needed for the conversion above */
  if (linux_nodeset)
    hwloc_bitmap_free(linux_nodeset);

  *max_os_index_p = max_os_index;
  *linuxmaskp = linuxmask;
  return 0;
}
/* Fill nodeset from an array of unsigned longs holding max_os_index bits.
 * nodeset is cleared first, then each word of linuxmask is copied at the
 * matching position.
 */
static void
hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
				    hwloc_nodeset_t nodeset,
				    unsigned max_os_index, const unsigned long *linuxmask)
{
  unsigned word;

#ifdef HWLOC_DEBUG
  /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
  assert(!(max_os_index%HWLOC_BITS_PER_LONG));
#endif

  hwloc_bitmap_zero(nodeset);
  word = 0;
  while (word < max_os_index/HWLOC_BITS_PER_LONG) {
    hwloc_bitmap_set_ith_ulong(nodeset, word, linuxmask[word]);
    word++;
  }
}
/* set_area_membind hook: apply a memory binding policy to [addr, addr+len).
 * The start address is first moved down to a page boundary (len is grown
 * accordingly).
 * Returns 0 on success and a negative value on error: the policy conversion
 * result, the hwloc_mbind() result for the DEFAULT/LOCAL shortcuts, or -1.
 * errno is EXDEV when a first-touch policy is requested with a nodeset that
 * differs from the topology complete nodeset.
 */
static int
hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
{
  unsigned max_os_index; /* highest os_index + 1 */
  unsigned long *linuxmask;
  unsigned linuxflags = 0;
  int linuxpolicy;
  size_t misalign;
  int rc;

  /* align the beginning of the area on a page boundary */
  misalign = (uintptr_t) addr & (hwloc_getpagesize()-1);
  addr = (char*) addr - misalign;
  len += misalign;

  rc = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
  if (rc < 0)
    return rc;

  if (linuxpolicy == MPOL_DEFAULT)
    /* Some Linux kernels don't like being passed a set */
    return hwloc_mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);

  if (linuxpolicy == MPOL_LOCAL) {
    if (hwloc_bitmap_isequal(nodeset, hwloc_topology_get_complete_nodeset(topology)))
      /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
      return hwloc_mbind((void *) addr, len, MPOL_PREFERRED, NULL, 0, 0);
    errno = EXDEV;
    return -1;
  }

  if (hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask) < 0)
    return -1;

  if (flags & HWLOC_MEMBIND_MIGRATE)
    linuxflags = (flags & HWLOC_MEMBIND_STRICT) ? (MPOL_MF_MOVE | MPOL_MF_STRICT) : MPOL_MF_MOVE;

  rc = hwloc_mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
  /* the mask is released on both the success and the failure paths */
  free(linuxmask);
  return rc < 0 ? -1 : 0;
}
/* alloc_membind hook: allocate len bytes with hwloc_alloc_mmap() and bind
 * them with hwloc_linux_set_area_membind().
 * Returns the buffer, or NULL if the allocation failed or if the binding
 * failed while HWLOC_MEMBIND_STRICT was requested (the buffer is unmapped
 * in that case). Without STRICT, a binding failure is ignored.
 */
static void *
hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
{
  void *buffer = hwloc_alloc_mmap(topology, len);

  if (!buffer)
    return NULL;

  if (hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags) >= 0
      || !(flags & HWLOC_MEMBIND_STRICT))
    return buffer;

  munmap(buffer, len);
  return NULL;
}
/* set_thisthread_membind hook: set the memory policy of the calling thread.
 *
 * With HWLOC_MEMBIND_MIGRATE, pages are first migrated from any node to the
 * target nodes with hwloc_migrate_pages(); a migration failure is only fatal
 * when HWLOC_MEMBIND_STRICT is also given.
 * Returns 0 on success and a negative value on error. errno is EXDEV when a
 * first-touch policy is requested with a nodeset that differs from the
 * topology complete nodeset.
 */
static int
hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
{
  unsigned max_os_index; /* highest os_index + 1 */
  unsigned long *linuxmask;
  int linuxpolicy;
  int err;

  err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
  if (err < 0)
    return err;

  if (linuxpolicy == MPOL_DEFAULT) {
    /* Some Linux kernels don't like being passed a set */
    return hwloc_set_mempolicy(linuxpolicy, NULL, 0);

  } else if (linuxpolicy == MPOL_LOCAL) {
    if (!hwloc_bitmap_isequal(nodeset, hwloc_topology_get_complete_nodeset(topology))) {
      errno = EXDEV;
      return -1;
    }
    /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
    return hwloc_set_mempolicy(MPOL_PREFERRED, NULL, 0);
  }

  err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
  if (err < 0)
    goto out;

  if (flags & HWLOC_MEMBIND_MIGRATE) {
    unsigned long fullmask[max_os_index/HWLOC_BITS_PER_LONG];
    /* the source mask must contain every node: fill all bits of every byte
     * (0xf would only set the 4 low bits of each byte and skip half of the nodes) */
    memset(fullmask, 0xff, max_os_index/HWLOC_BITS_PER_LONG * sizeof(unsigned long));
    err = hwloc_migrate_pages(0, max_os_index+1, fullmask, linuxmask);
    if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
      goto out_with_mask;
  }

  err = hwloc_set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
  if (err < 0)
    goto out_with_mask;

  free(linuxmask);
  return 0;

 out_with_mask:
  free(linuxmask);
 out:
  return -1;
}
/*
 * On some kernels, get_mempolicy requires the output size to be larger
 * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
 * Try get_mempolicy on ourself until we find a max_os_index value that
 * makes the kernel happy.
 *
 * The result is a multiple of HWLOC_BITS_PER_LONG. It is cached in a static
 * variable and returned directly by later calls.
 */
static int
hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
{
  static int _max_numnodes = -1;
  /* The working value is a plain local: only the final result may be
   * published in the static cache, intermediate values must stay private
   * to each call.
   */
  int max_numnodes;
  int linuxpolicy;

  if (_max_numnodes != -1)
    /* already computed */
    return _max_numnodes;

  /* start with a single ulong, it's the minimal and it's enough for most machines */
  max_numnodes = HWLOC_BITS_PER_LONG;
  while (1) {
    unsigned long mask[max_numnodes / HWLOC_BITS_PER_LONG];
    int err = hwloc_get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
    if (!err || errno != EINVAL)
      /* Found it. Only update the static value with the final one,
       * to avoid sharing intermediate values that we modify,
       * in case there's ever multiple concurrent calls.
       */
      return _max_numnodes = max_numnodes;
    /* EINVAL: the mask was presumably too small for this kernel, retry with twice as many bits */
    max_numnodes *= 2;
  }
}
/* Convert a Linux MPOL_* value into a hwloc membind policy stored in *policy.
 * MPOL_DEFAULT and MPOL_LOCAL both map to FIRSTTOUCH, MPOL_PREFERRED and
 * MPOL_BIND both map to BIND.
 * Returns 0 on success; unknown values return -1 with errno set to EINVAL
 * and leave *policy untouched.
 */
static int
hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
{
  hwloc_membind_policy_t converted;

  switch (linuxpolicy) {
  case MPOL_DEFAULT:
  case MPOL_LOCAL: /* converted from MPOL_PREFERRED + empty nodeset by the caller */
    converted = HWLOC_MEMBIND_FIRSTTOUCH;
    break;
  case MPOL_PREFERRED:
  case MPOL_BIND:
    converted = HWLOC_MEMBIND_BIND;
    break;
  case MPOL_INTERLEAVE:
    converted = HWLOC_MEMBIND_INTERLEAVE;
    break;
  default:
    errno = EINVAL;
    return -1;
  }

  *policy = converted;
  return 0;
}
/* Return 1 if the first max_os_index/HWLOC_BITS_PER_LONG words of linuxmask
 * are all zero, 0 as soon as one of them is not.
 */
static int hwloc_linux_mask_is_empty(unsigned max_os_index, unsigned long *linuxmask)
{
  const unsigned nwords = max_os_index/HWLOC_BITS_PER_LONG;
  unsigned word = 0;

  /* stop on the first non-zero word */
  while (word < nwords && !linuxmask[word])
    word++;

  return word == nwords;
}
/* get_thisthread_membind hook: report the memory policy of the calling
 * thread in *policy and the corresponding nodes in nodeset.
 * MPOL_PREFERRED with an empty mask is handled as MPOL_LOCAL. For
 * MPOL_DEFAULT and MPOL_LOCAL, nodeset is a copy of the topology nodeset.
 * Returns 0 on success, -1 if the policy cannot be read or converted
 * (nodeset has already been written in the latter case).
 */
static int
hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
{
  unsigned max_os_index;
  int linuxpolicy;

  max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
  unsigned long linuxmask[max_os_index/HWLOC_BITS_PER_LONG];

  if (hwloc_get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0) < 0)
    return -1;

  /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
  if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
    linuxpolicy = MPOL_LOCAL;

  if (linuxpolicy != MPOL_DEFAULT && linuxpolicy != MPOL_LOCAL)
    hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
  else
    hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));

  if (hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy) < 0)
    return -1;

  return 0;
}
/* get_area_membind hook: report the memory policy of the pages covering
 * [addr, addr+len).
 * The policy of every page is queried individually. *policy is
 * HWLOC_MEMBIND_MIXED when pages do not all have the same policy. nodeset is
 * the union of the per-page masks, or a copy of the topology nodeset as soon
 * as one page uses MPOL_DEFAULT or MPOL_LOCAL.
 * Returns 0 on success, -1 if a query or the policy conversion fails.
 */
static int
hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
{
  unsigned max_os_index;
  unsigned nwords;
  unsigned w;
  int pagepolicy = 0, firstpolicy = 0; /* shut-up the compiler */
  int mixed = 0;
  int full = 0;
  int seen_any = 0;
  int pagesize = hwloc_getpagesize();
  char *page;

  max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
  nwords = max_os_index/HWLOC_BITS_PER_LONG;
  unsigned long pagemask[nwords];
  unsigned long unionmask[nwords];

  memset(unionmask, 0, sizeof(unionmask));

  /* walk the area page by page, starting from the page containing addr */
  page = (char *)((unsigned long)addr & ~(pagesize-1));
  while (page < (char *)addr + len) {
    if (hwloc_get_mempolicy(&pagepolicy, pagemask, max_os_index, page, MPOL_F_ADDR) < 0)
      return -1;

    /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
    if (pagepolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, pagemask))
      pagepolicy = MPOL_LOCAL;

    /* remember the policy of the first page, and flag any later difference */
    if (!seen_any)
      firstpolicy = pagepolicy;
    else if (firstpolicy != pagepolicy)
      mixed = 1;

    /* aggregate masks, unless DEFAULT or LOCAL was ever found */
    if (!full && pagepolicy != MPOL_DEFAULT && pagepolicy != MPOL_LOCAL) {
      for(w=0; w<nwords; w++)
	unionmask[w] |= pagemask[w];
    } else {
      full = 1;
    }

    seen_any = 1;
    page += pagesize;
  }

  if (!mixed) {
    /* all pages agree, so the last queried policy is the policy of the area */
    if (hwloc_linux_membind_policy_to_hwloc(pagepolicy, policy) < 0)
      return -1;
  } else {
    *policy = HWLOC_MEMBIND_MIXED;
  }

  if (!full)
    hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, unionmask);
  else
    hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));

  return 0;
}
/* get_area_memlocation hook: store in nodeset the nodes reported by
 * hwloc_move_pages() for the pages covering [addr, addr+len).
 * The start address is first moved down to a page boundary. Pages whose
 * status is negative are ignored.
 * Returns 0 on success, -1 on allocation failure, or the negative value
 * returned by hwloc_move_pages(). nodeset is only modified on success.
 */
static int
hwloc_linux_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags __hwloc_attribute_unused)
{
  unsigned offset;
  unsigned long count;
  void **pages;
  int *status;
  int pagesize = hwloc_getpagesize();
  int ret;
  /* same width as count, so that neither the index nor the byte offset
   * computed from it wraps on areas of 4GiB or more */
  unsigned long i;

  offset = ((unsigned long) addr) & (pagesize-1);
  addr = ((char*) addr) - offset;
  len += offset;
  count = (len + pagesize-1)/pagesize;
  pages = malloc(count*sizeof(*pages));
  status = malloc(count*sizeof(*status));
  if (!pages || !status) {
    ret = -1;
    goto out_with_pages;
  }

  for(i=0; i<count; i++)
    pages[i] = ((char*)addr) + i*(unsigned long)pagesize;

  /* no target nodes are given (NULL), only status[] is read back afterwards */
  ret = hwloc_move_pages(0, count, pages, NULL, status, 0);
  if (ret  < 0)
    goto out_with_pages;

  hwloc_bitmap_zero(nodeset);
  for(i=0; i<count; i++)
    if (status[i] >= 0)
      hwloc_bitmap_set(nodeset, status[i]);
  ret = 0;

 out_with_pages:
  free(pages);
  free(status);
  return ret;
}
static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep);

/* get_allowed_resources hook.
 * The filesystem root is taken from the HWLOC_FSROOT environment variable
 * ("/" when unset). With HAVE_OPENAT it is opened as a directory; without
 * it, any root other than "/" fails with ENOSYS.
 * When hwloc_linux__get_allowed_resources() reports a cpuset name, it is
 * attached to the root object as a "LinuxCgroup" info.
 *
 * NOTE(review): this function returns -1 on every path, including after the
 * allowed resources were applied. The return value is left unchanged here
 * because callers are not visible; confirm whether they look at it.
 */
static int hwloc_linux_get_allowed_resources_hook(hwloc_topology_t topology)
{
  const char *fsroot_path;
  /* initialized so that the test below is well-defined even if the callee
   * does not store anything through the pointer */
  char *cpuset_name = NULL;
  int root_fd = -1;

  fsroot_path = getenv("HWLOC_FSROOT");
  if (!fsroot_path)
    fsroot_path = "/";

#ifdef HAVE_OPENAT
  root_fd = open(fsroot_path, O_RDONLY | O_DIRECTORY);
  if (root_fd < 0)
    goto out;
#else
  if (strcmp(fsroot_path, "/")) {
    errno = ENOSYS;
    goto out;
  }
#endif

  /* we could also error-out if the current topology doesn't actually match the system,
   * at least for PUs and NUMA nodes. But it would increase the overhead of loading XMLs.
   *
   * Just trust the user when he sets THISSYSTEM=1. It enables hacky
   * tests such as restricting random XML or synthetic to the current
   * machine (uses the default cgroup).
   */

  hwloc_linux__get_allowed_resources(topology, fsroot_path, root_fd, &cpuset_name);
  if (cpuset_name) {
    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
    free(cpuset_name);
  }
  if (root_fd != -1)
    close(root_fd);

 out:
  return -1;
}
/* Install the Linux implementations of the binding hooks in hooks, and
 * advertise the supported memory binding features in support->membind.
 */
void
hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
			struct hwloc_topology_support *support __hwloc_attribute_unused)
{
  /* CPU binding of the current thread and process */
  hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
  hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
  hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
  hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;

  /* CPU binding of another process */
  hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
  hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;

  /* CPU binding of a pthread, only when the pthread affinity calls are declared */
#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
  hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
  hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */

  /* last CPU location */
  hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
  hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
  hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;

  /* memory binding and location */
  hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
  hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
  hooks->set_area_membind = hwloc_linux_set_area_membind;
  hooks->get_area_membind = hwloc_linux_get_area_membind;
  hooks->get_area_memlocation = hwloc_linux_get_area_memlocation;

  /* memory allocation */
  hooks->alloc = hwloc_alloc_mmap;
  hooks->alloc_membind = hwloc_linux_alloc_membind;
  hooks->free_membind = hwloc_free_mmap;

  /* supported memory binding features */
  support->membind->firsttouch_membind = 1;
  support->membind->bind_membind = 1;
  support->membind->interleave_membind = 1;
  support->membind->migrate_membind = 1;

  /* allowed resources */
  hooks->get_allowed_resources = hwloc_linux_get_allowed_resources_hook;
}
/*******************************************
*** Misc Helpers for Topology Discovery ***
*******************************************/
/* cpuinfo array */
/* One entry of the cpuinfo array */
struct hwloc_linux_cpuinfo_proc {
  /* set during hwloc_linux_parse_cpuinfo */
  unsigned long Pproc;

  /* set during hwloc_linux_parse_cpuinfo or -1 if unknown */
  long Pcore;
  long Ppkg;

  /* set later, or -1 if unknown */
  long Lcore;
  long Lpkg;

  /* custom info, set during hwloc_linux_parse_cpuinfo */
  struct hwloc_info_s *infos;
  unsigned infos_count;
};
/* Look in <root_path>/proc/mounts (or /proc/mounts when root_path is NULL)
 * for the first mount point that is either of type "cpuset", or of type
 * "cgroup" with the "cpuset" option.
 *
 * Both outputs are first set to NULL. Then at most one of them receives a
 * strdup'ed mount directory (to be freed by the caller):
 * - *cpuset_mntpnt for a "cpuset" mount, or a cgroup mount with "noprefix";
 * - *cgroup_mntpnt for a cgroup mount with "cpuset" and without "noprefix".
 * Both stay NULL if nothing matches or if an error occurs.
 */
static void
hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, const char *root_path)
{
  char *mount_path;
  struct mntent mntent;
  FILE *fd;
  int err;
  size_t bufsize;
  char *buf;

  *cgroup_mntpnt = NULL;
  *cpuset_mntpnt = NULL;

  if (root_path) {
    /* setmntent() doesn't support openat(), so use the root_path directly */
    err = asprintf(&mount_path, "%s/proc/mounts", root_path);
    if (err < 0)
      return;
    fd = setmntent(mount_path, "r");
    free(mount_path);
  } else {
    fd = setmntent("/proc/mounts", "r");
  }
  if (!fd)
    return;

  /* getmntent_r() doesn't actually report an error when the buffer
   * is too small. It just silently truncates things. So we can't
   * dynamically resize things.
   *
   * Linux limits mount type, string, and options to one page each.
   * getmntent() limits the line size to 4kB.
   * so use 4*pagesize to be far above both.
   *
   * The buffer lives on the heap: 4 pages may be large for the stack.
   */
  bufsize = hwloc_getpagesize()*4;
  buf = malloc(bufsize);
  if (!buf) {
    endmntent(fd);
    return;
  }

  while (getmntent_r(fd, &mntent, buf, bufsize)) {
    if (!strcmp(mntent.mnt_type, "cpuset")) {
      hwloc_debug("Found cpuset mount point on %s\n", mntent.mnt_dir);
      *cpuset_mntpnt = strdup(mntent.mnt_dir);
      break;
    } else if (!strcmp(mntent.mnt_type, "cgroup")) {
      /* found a cgroup mntpnt */
      char *opt, *opts = mntent.mnt_opts;
      int cpuset_opt = 0;
      int noprefix_opt = 0;
      /* look at options */
      while ((opt = strsep(&opts, ",")) != NULL) {
	if (!strcmp(opt, "cpuset"))
	  cpuset_opt = 1;
	else if (!strcmp(opt, "noprefix"))
	  noprefix_opt = 1;
      }
      /* a cgroup hierarchy without the cpuset controller is not interesting */
      if (!cpuset_opt)
	continue;
      if (noprefix_opt) {
	hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", mntent.mnt_dir);
	*cpuset_mntpnt = strdup(mntent.mnt_dir);
      } else {
	hwloc_debug("Found cgroup/cpuset mount point on %s\n", mntent.mnt_dir);
	*cgroup_mntpnt = strdup(mntent.mnt_dir);
      }
      break;
    }
  }

  free(buf);
  endmntent(fd);
}
/*
 * Linux cpusets may be managed directly or through cgroup.
 * If cgroup is used, tasks get a /proc/pid/cgroup which may contain a
 * single line %d:cpuset:<name>. If cpusets are used, they get a
 * /proc/pid/cpuset containing <name>.
 *
 * Return a strdup'ed copy of <name> for process pid (the current process
 * when pid is 0), or NULL if neither file provides one.
 */
static char *
hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
{
#define CPUSET_NAME_LEN 128
  char cpuset_name[CPUSET_NAME_LEN];
  FILE *file;
  int err;
  char *tmp;

  /* check whether a cgroup-cpuset is enabled */
  if (pid) {
    char path[] = "/proc/XXXXXXXXXX/cgroup";
    snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
    file = hwloc_fopen(path, "r", fsroot_fd);
  } else {
    file = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
  }

  if (file) {
    /* find a cpuset line */
#define CGROUP_LINE_LEN 256
    char line[CGROUP_LINE_LEN];
    while (fgets(line, sizeof(line), file)) {
      char *end, *colon = strchr(line, ':');
      if (colon && !strncmp(colon, ":cpuset:", 8)) {
	/* found a cgroup-cpuset line, return the name without the ending newline */
	fclose(file);
	end = strchr(colon, '\n');
	if (end)
	  *end = '\0';
	hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
	return strdup(colon+8);
      }
    }
    fclose(file);
  }

  /* check whether a cpuset is enabled */
  if (pid) {
    char path[] = "/proc/XXXXXXXXXX/cpuset";
    snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
    err = hwloc_read_path_by_length(path, cpuset_name, sizeof(cpuset_name), fsroot_fd);
  } else {
    err = hwloc_read_path_by_length("/proc/self/cpuset", cpuset_name, sizeof(cpuset_name), fsroot_fd);
  }
  if (err < 0) {
    /* found nothing */
    hwloc_debug("%s", "No cgroup or cpuset found\n");
    return NULL;
  }

  /* found a cpuset, return the name without the ending newline */
  tmp = strchr(cpuset_name, '\n');
  if (tmp)
    *tmp = '\0';
  hwloc_debug("Found cpuset %s\n", cpuset_name);
  return strdup(cpuset_name);
}
/*
 * Then, the cpuset description is available from either the cgroup or
 * the cpuset filesystem (usually mounted in / or /dev) where there
 * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
 *
 * Read attribute attr_name of cpuset cpuset_name into admin_enabled_cpus_set.
 * The cgroup mount point is used when given, the cpuset mount point
 * otherwise. The set is left untouched when no mount point is given or when
 * the file cannot be opened, and it is filled when the file cannot be parsed.
 */
static void
hwloc_admin_disable_set_from_cpuset(int root_fd,
				    const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
				    const char *attr_name,
				    hwloc_bitmap_t admin_enabled_cpus_set)
{
#define CPUSET_FILENAME_LEN 256
  char cpuset_filename[CPUSET_FILENAME_LEN];
  int fd;
  int err;

  if (cgroup_mntpnt) {
    /* try to read the cpuset from cgroup */
    snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
    hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
  } else if (cpuset_mntpnt) {
    /* try to read the cpuset directly */
    snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
    hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
  } else {
    /* no mount point at all: there is no filename to open,
     * do not pass the uninitialized buffer to hwloc_open() */
    return;
  }

  fd = hwloc_open(cpuset_filename, root_fd);
  if (fd < 0) {
    /* found no cpuset description, ignore it */
    hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
    return;
  }

  err = hwloc__read_fd_as_cpulist(fd, admin_enabled_cpus_set);
  close(fd);

  if (err < 0)
    /* unreadable list: do not restrict anything */
    hwloc_bitmap_fill(admin_enabled_cpus_set);
  else
    hwloc_debug_bitmap("cpuset includes %s\n", admin_enabled_cpus_set);
}
/* Parse a meminfo file (/proc/meminfo or a per-node meminfo).
 *
 * - *local_memory receives the "MemTotal: " value converted from kB to bytes.
 * - Unless onlytotal is set, *meminfo_hugepages_size receives the
 *   "Hugepagesize: " value converted from kB to bytes, and
 *   *meminfo_hugepages_count receives the "HugePages_Free: " value.
 *   Either pointer may be NULL to skip the corresponding key.
 *
 * Outputs whose key is missing are left untouched. Nothing is written if the
 * file cannot be read or if "MemTotal: " is missing.
 *
 * The huge page keys are looked up independently from the beginning of the
 * buffer, since /proc/meminfo lists HugePages_Free before Hugepagesize and
 * the per-node caller passes no size pointer.
 */
static void
hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
			 const char *path,
			 uint64_t *local_memory,
			 uint64_t *meminfo_hugepages_count,
			 uint64_t *meminfo_hugepages_size,
			 int onlytotal)
{
  char *tmp;
  char buffer[4096];
  unsigned long long number;

  if (hwloc_read_path_by_length(path, buffer, sizeof(buffer), data->root_fd) < 0)
    return;

  tmp = strstr(buffer, "MemTotal: "); /* MemTotal: %llu kB */
  if (!tmp)
    return;
  number = strtoull(tmp+10, NULL, 10);
  *local_memory = number << 10;

  if (onlytotal)
    return;

  if (meminfo_hugepages_size) {
    tmp = strstr(buffer, "Hugepagesize: "); /* Hugepagesize: %llu */
    if (tmp) {
      number = strtoull(tmp+14, NULL, 10);
      *meminfo_hugepages_size = number << 10;
    }
  }

  if (meminfo_hugepages_count) {
    tmp = strstr(buffer, "HugePages_Free: "); /* HugePages_Free: %llu */
    if (tmp) {
      number = strtoull(tmp+16, NULL, 10);
      *meminfo_hugepages_count = number;
    }
  }
}
#define SYSFS_NUMA_NODE_PATH_LEN 128
/* Fill memory->page_types[1..] from the "hugepages-<size>kB" subdirectories
 * of dirpath.
 *
 * For each subdirectory whose nr_hugepages file can be read, the page size
 * and count are stored in the next entry and the corresponding amount of
 * memory is subtracted from *remaining_local_memory.
 * memory->page_types_len is expected to hold the number of allocated entries
 * on entry (both callers in this file set it that way); it is replaced with
 * the number of entries actually used, slot 0 included. Nothing is modified
 * if dirpath cannot be opened.
 */
static void
hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
			   const char *dirpath,
			   struct hwloc_numanode_attr_s *memory,
			   uint64_t *remaining_local_memory)
{
  DIR *dir;
  struct dirent *dirent;
  unsigned long index_ = 1; /* slot 0 is not handled here */
  char line[64];
  char path[SYSFS_NUMA_NODE_PATH_LEN];

  dir = hwloc_opendir(dirpath, data->root_fd);
  if (dir) {
    while ((dirent = readdir(dir)) != NULL) {
      int err;
      if (strncmp(dirent->d_name, "hugepages-", 10))
        continue;
      if (index_ >= memory->page_types_len)
	/* more directories than entries allocated by the caller,
	 * stop instead of writing past the end of the array */
	break;
      memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
      err = snprintf(path, sizeof(path), "%s/%s/nr_hugepages", dirpath, dirent->d_name);
      if ((size_t) err < sizeof(path)
	  && !hwloc_read_path_by_length(path, line, sizeof(line), data->root_fd)) {
	/* these are the actual total amount of huge pages */
	memory->page_types[index_].count = strtoull(line, NULL, 0);
	*remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
	/* the slot is only consumed once its count was read */
	index_++;
      }
    }
    closedir(dir);
    memory->page_types_len = index_;
  }
}
/* Fill memory (total local memory and page types) from /proc/meminfo and,
 * when available, /sys/kernel/mm/hugepages.
 *
 * Page types are only reported when the page size is known, i.e. when the
 * topology is the current system or when HWLOC_DEBUG_PAGESIZE is set in the
 * environment. If the page_types array cannot be allocated, only the total
 * memory is reported.
 */
static void
hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
			      struct hwloc_linux_backend_data_s *data,
			      struct hwloc_numanode_attr_s *memory)
{
  /* both start at 0 since the meminfo parser leaves them untouched when a key is missing */
  uint64_t meminfo_hugepages_count = 0, meminfo_hugepages_size = 0;
  struct stat st;
  int has_sysfs_hugepages = 0;
  const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
  int types = 2;
  int err;

  err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
  if (!err) {
    /* one normal page type, plus one per subdirectory (link count minus 2) */
    types = 1 + st.st_nlink-2;
    if (types < 2)
      /* entries 0 and 1 are written below, always allocate both of them */
      types = 2;
    has_sysfs_hugepages = 1;
  }

  if (topology->is_thissystem || pagesize_env) {
    /* we cannot report any page_type info unless we have the page size.
     * we'll take it either from the system if local, or from the debug env variable
     */
    memory->page_types = calloc(types, sizeof(*memory->page_types));
    memory->page_types_len = memory->page_types ? (unsigned) types : 0;
  }

  if (topology->is_thissystem && memory->page_types) {
    /* Get the page and hugepage sizes from sysconf */
#if HAVE_DECL__SC_LARGE_PAGESIZE
    memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
#endif
    memory->page_types[0].size = data->pagesize; /* might be overwritten later by /proc/meminfo or sysfs */
  }

  hwloc_parse_meminfo_info(data, "/proc/meminfo",
			   &memory->local_memory,
			   &meminfo_hugepages_count, &meminfo_hugepages_size,
			   memory->page_types == NULL);

  if (memory->page_types) {
    uint64_t remaining_local_memory = memory->local_memory;
    if (has_sysfs_hugepages) {
      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
      hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
    } else {
      /* use what we found in meminfo */
      if (meminfo_hugepages_size) {
        memory->page_types[1].size = meminfo_hugepages_size;
        memory->page_types[1].count = meminfo_hugepages_count;
        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
      } else {
        memory->page_types_len = 1;
      }
    }

    if (pagesize_env) {
      /* We cannot get the pagesize if not thissystem, use the env-given one to experience the code during make check */
      memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
      /* If failed, use 4kB */
      if (!memory->page_types[0].size)
	memory->page_types[0].size = 4096;
    }
    assert(memory->page_types[0].size); /* from sysconf if local or from the env */
    /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
     * may be 0 if no hugepage support in the kernel */

    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
  }
}
/* Fill memory (total local memory and page types) for NUMA node number node
 * from <syspath>/node<node>/meminfo and, when available,
 * <syspath>/node<node>/hugepages.
 *
 * Page types are only reported when the topology is the current system.
 * If the page_types array cannot be allocated, only the total memory is
 * reported.
 */
static void
hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
			      struct hwloc_linux_backend_data_s *data,
			      const char *syspath, int node,
			      struct hwloc_numanode_attr_s *memory)
{
  char path[SYSFS_NUMA_NODE_PATH_LEN];
  char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
  uint64_t meminfo_hugepages_count = 0;
  uint64_t meminfo_hugepages_size = 0;
  struct stat st;
  int has_sysfs_hugepages = 0;
  int types = 2;
  int err;

  /* bounded formatting: syspath comes from the caller and the buffer is fixed-size */
  snprintf(path, sizeof(path), "%s/node%d/hugepages", syspath, node);
  err = hwloc_stat(path, &st, data->root_fd);
  if (!err) {
    /* one normal page type, plus one per subdirectory (link count minus 2) */
    types = 1 + st.st_nlink-2;
    if (types < 2)
      /* never allocate less than the normal entry and one huge entry */
      types = 2;
    has_sysfs_hugepages = 1;
  }

  if (topology->is_thissystem) {
    /* zero-initialized array, or NULL (and no page type reported) on allocation failure */
    memory->page_types = calloc(types, sizeof(*memory->page_types));
    memory->page_types_len = memory->page_types ? (unsigned) types : 0;
  }

  snprintf(meminfopath, sizeof(meminfopath), "%s/node%d/meminfo", syspath, node);
  hwloc_parse_meminfo_info(data, meminfopath,
			   &memory->local_memory,
			   &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
			   memory->page_types == NULL);

  if (memory->page_types) {
    uint64_t remaining_local_memory = memory->local_memory;
    if (has_sysfs_hugepages) {
      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
      hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
    } else {
      /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
       * hwloc_get_procfs_meminfo_info must have been called earlier */
      meminfo_hugepages_size = topology->machine_memory.page_types[1].size;
      /* use what we found in meminfo */
      if (meminfo_hugepages_size) {
        memory->page_types[1].count = meminfo_hugepages_count;
        memory->page_types[1].size = meminfo_hugepages_size;
        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
      } else {
        memory->page_types_len = 1;
      }
    }
    /* update what's remaining as normal pages */
    memory->page_types[0].size = data->pagesize;
    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
  }
}
/* Read the NUMA distance matrix from sysfs.
 *
 * For each of the nbnodes OS indexes listed in indexes[], parses
 * <path>/node<index>/distance and stores exactly nbnodes values in the
 * corresponding row of distances[] (row-major, nbnodes*nbnodes entries).
 *
 * Returns 0 on success, -1 if the buffer cannot be allocated, a file cannot
 * be read, or a file contains fewer than nbnodes values. distances[] may be
 * partially written on failure.
 */
static int
hwloc_parse_nodes_distances(const char *path, unsigned nbnodes, unsigned *indexes, uint64_t *distances, int fsroot_fd)
{
  size_t len = (10+1)*nbnodes;
  uint64_t *curdist = distances;
  char *string;
  unsigned i;

  string = malloc(len); /* space-separated %d */
  if (!string)
    goto out;

  for(i=0; i<nbnodes; i++) {
    unsigned osnode = indexes[i];
    char distancepath[SYSFS_NUMA_NODE_PATH_LEN];
    char *tmp, *next;
    unsigned found;

    /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
     * store them in slots X*N...X*N+N-1 */
    sprintf(distancepath, "%s/node%u/distance", path, osnode);
    if (hwloc_read_path_by_length(distancepath, string, len, fsroot_fd) < 0)
      goto out_with_string;

    tmp = string;
    found = 0;
    while (found < nbnodes) {
      unsigned distance = strtoul(tmp, &next, 0); /* stored as a %d */
      if (next == tmp)
        break;
      *curdist = (uint64_t) distance;
      curdist++;
      found++;
      /* the value ended right at the string terminator (no trailing separator):
       * stop here instead of stepping past the end of the string */
      if (*next == '\0')
        break;
      tmp = next+1;
    }
    if (found != nbnodes)
      goto out_with_string;
  }

  free(string);
  return 0;

 out_with_string:
  free(string);
 out:
  return -1;
}
/* Read one DMI attribute file and attach its content to obj as an info pair.
 *
 * path holds the DMI directory prefix in its first pathlen bytes; dmi_name is
 * appended there to build the file name. The first line of the file, if not
 * empty, is stored in obj under the key hwloc_name. Unreadable or empty
 * files are silently skipped.
 */
static void
hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
                           hwloc_obj_t obj,
                           char *path, unsigned pathlen,
                           const char *dmi_name, const char *hwloc_name)
{
  char value[64];
  char *eol;

  /* complete the file name right after the directory prefix */
  strcpy(&path[pathlen], dmi_name);

  if (hwloc_read_path_by_length(path, value, sizeof(value), data->root_fd) < 0)
    return;
  if ('\0' == value[0])
    return;

  /* keep only the first line */
  eol = strchr(value, '\n');
  if (NULL != eol)
    *eol = '\0';

  hwloc_debug("found %s '%s'\n", hwloc_name, value);
  hwloc_obj_add_info(obj, hwloc_name, value);
}
/* Attach the DMI identification strings (product, board, chassis, BIOS,
 * system vendor) to obj.
 *
 * The attributes are looked up in /sys/devices/virtual/dmi/id, or in
 * /sys/class/dmi/id when the former cannot be opened. Nothing is added if
 * neither directory exists.
 */
static void
hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
{
  /* DMI file name -> hwloc info name, in insertion order */
  static const struct {
    const char *dmi_name;
    const char *hwloc_name;
  } fields[] = {
    { "product_name",      "DMIProductName" },
    { "product_version",   "DMIProductVersion" },
    { "product_serial",    "DMIProductSerial" },
    { "product_uuid",      "DMIProductUUID" },
    { "board_vendor",      "DMIBoardVendor" },
    { "board_name",        "DMIBoardName" },
    { "board_version",     "DMIBoardVersion" },
    { "board_serial",      "DMIBoardSerial" },
    { "board_asset_tag",   "DMIBoardAssetTag" },
    { "chassis_vendor",    "DMIChassisVendor" },
    { "chassis_type",      "DMIChassisType" },
    { "chassis_version",   "DMIChassisVersion" },
    { "chassis_serial",    "DMIChassisSerial" },
    { "chassis_asset_tag", "DMIChassisAssetTag" },
    { "bios_vendor",       "DMIBIOSVendor" },
    { "bios_version",      "DMIBIOSVersion" },
    { "bios_date",         "DMIBIOSDate" },
    { "sys_vendor",        "DMISysVendor" }
  };
  char path[128];
  unsigned pathlen;
  unsigned k;
  DIR *dir;

  strcpy(path, "/sys/devices/virtual/dmi/id");
  dir = hwloc_opendir(path, data->root_fd);
  if (!dir) {
    /* fall back to the class directory */
    strcpy(path, "/sys/class/dmi/id");
    dir = hwloc_opendir(path, data->root_fd);
    if (!dir)
      return;
  }
  /* the directory was only opened to check that it exists */
  closedir(dir);

  pathlen = (unsigned) strlen(path);
  path[pathlen++] = '/';

  for (k = 0; k < sizeof(fields)/sizeof(fields[0]); k++)
    hwloc__get_dmi_id_one_info(data, obj, path, pathlen,
                               fields[k].dmi_name, fields[k].hwloc_name);
}
/***********************************
****** Device tree Discovery ******
***********************************/
/* Load the content of file <p>/<p1> into a newly allocated buffer.
 *
 * The buffer is sized from the file size reported by fstat() and filled with
 * a single read(). On success the number of bytes actually read is stored in
 * *bytes_read when bytes_read is not NULL.
 * Returns NULL if the file cannot be opened, stat'ed, allocated or read.
 * The returned pointer can be released with free().
 */
static void *
hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
{
  char fname[256];
  struct stat fs;
  char *content = NULL;
  int fd;

  snprintf(fname, sizeof(fname), "%s/%s", p, p1);
  fd = hwloc_open(fname, root_fd);
  if (fd == -1)
    return NULL;

  if (fstat(fd, &fs) == 0) {
    content = malloc(fs.st_size);
    if (content != NULL) {
      ssize_t nread = read(fd, content, fs.st_size);
      if (nread == -1) {
        /* nothing usable, don't return a half-initialized buffer */
        free(content);
        content = NULL;
      } else if (bytes_read != NULL) {
        *bytes_read = nread;
      }
    }
  }

  close(fd);
  return content;
}
/* Reads the entire file <p>/<p1> and returns it as a 0-terminated string.
 *
 * A terminator is appended unless the data read already ends with one.
 * An empty file yields an empty string rather than an unterminated
 * zero-length buffer, so the result is always safe to pass to str*().
 * Returns NULL if the file cannot be read or memory is exhausted.
 * Returned pointer can be freed by using free(). */
static char *
hwloc_read_str(const char *p, const char *p1, int root_fd)
{
  size_t cb = 0;
  char *ret = hwloc_read_raw(p, p1, &cb, root_fd);

  if (NULL == ret)
    return NULL;

  /* cb == 0 must be handled too: there is no last byte to look at and
   * the buffer has no room for a terminator yet */
  if ((0 == cb) || (0 != ret[cb-1])) {
    char *tmp = realloc(ret, cb + 1);
    if (!tmp) {
      free(ret);
      return NULL;
    }
    ret = tmp;
    ret[cb] = 0;
  }
  return ret;
}
/* Read file <p>/<p1> as a single 32bit big-endian value.
 *
 * The file content must be exactly sizeof(uint32_t) bytes long. On success
 * the value is byte-swapped into *buf and sizeof(*buf) is returned.
 * Otherwise *buf is left untouched, errno is set to EINVAL and -1 is
 * returned. */
static ssize_t
hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
{
  size_t nbytes = 0;
  uint32_t *raw = hwloc_read_raw(p, p1, &nbytes, root_fd);

  if (nbytes == sizeof(*buf)) {
    /* nbytes is only updated when a buffer was returned, so raw is valid here */
    *buf = htonl(*raw);
    free(raw);
    return sizeof(*buf);
  }

  errno = EINVAL;
  free(raw); /* raw is either NULL or contains useless things */
  return -1;
}
/* Growable array of the "cpu" and "cache" nodes found in the device-tree
 * cpus directory. */
typedef struct {
unsigned int n, allocated; /* n entries are in use out of the allocated slots of p */
struct {
hwloc_bitmap_t cpuset; /* copy of the cpuset given when adding the entry, NULL when none was given (cache nodes) */
uint32_t phandle; /* phandle of this node, (uint32_t)-1 if it could not be read */
uint32_t l2_cache; /* phandle of the next cache level this node points to, (uint32_t)-1 if none was read */
char *name; /* strdup()'ed name of the node */
} *p;
} device_tree_cpus_t;
/* Append one device-tree node to the cpus array, growing it when full.
 *
 * cpuset may be NULL (the entry is then stored without a cpuset); when not
 * NULL it is duplicated. name is duplicated as well.
 * If any allocation fails, the entry is ignored and the array is left
 * unchanged, so that stored entries always have a valid name and a cpuset
 * that is NULL only when the caller passed NULL.
 */
static void
add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
                          uint32_t l2_cache, uint32_t phandle, const char *name)
{
  hwloc_bitmap_t cpuset_copy = NULL;
  char *name_copy;

  if (cpus->n == cpus->allocated) {
    void *tmp;
    unsigned allocated;
    if (!cpus->allocated)
      allocated = 64;
    else
      allocated = 2 * cpus->allocated;
    tmp = realloc(cpus->p, allocated * sizeof(cpus->p[0]));
    if (!tmp)
      return; /* failed to realloc, ignore this entry */
    cpus->p = tmp;
    cpus->allocated = allocated;
  }

  name_copy = strdup(name);
  if (!name_copy)
    return; /* failed to duplicate the name, ignore this entry */

  if (NULL != cpuset) {
    cpuset_copy = hwloc_bitmap_dup(cpuset);
    if (!cpuset_copy) {
      /* storing a NULL cpuset would make this entry look like one added without cpuset */
      free(name_copy);
      return;
    }
  }

  cpus->p[cpus->n].phandle = phandle;
  cpus->p[cpus->n].cpuset = cpuset_copy;
  cpus->p[cpus->n].l2_cache = l2_cache;
  cpus->p[cpus->n].name = name_copy;
  ++cpus->n;
}
/* Collect the CPUs that sit below the cache identified by phandle.
 *
 * Every entry whose next-level-cache reference equals phandle is visited:
 * entries with a cpuset are OR'ed into cpuset, entries without one are
 * treated as intermediate caches, in which case *level is incremented and
 * the search recurses with that entry's own phandle.
 * Returns 0 if at least one cpuset was accumulated, -1 otherwise (including
 * when level or cpuset is NULL, or phandle is (uint32_t)-1). */
static int
look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
                                        uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
{
  unsigned int idx;
  int status = -1;

  if (!level || !cpuset || (uint32_t) -1 == phandle)
    return -1;

  for (idx = 0; idx < cpus->n; idx++) {
    if (cpus->p[idx].l2_cache != phandle)
      continue;

    if (NULL == cpus->p[idx].cpuset) {
      /* intermediate cache: one more level between us and the CPUs */
      (*level)++;
      if (look_powerpc_device_tree_discover_cache(cpus, cpus->p[idx].phandle,
                                                  level, cpuset) == 0)
        status = 0;
    } else {
      hwloc_bitmap_or(cpuset, cpuset, cpus->p[idx].cpuset);
      status = 0;
    }
  }

  return status;
}
/* Create and insert one cache object described by device-tree properties.
 *
 * Nothing is inserted when cache_size is 0, when no object type matches the
 * given level/ctype, when that type is filtered out of the topology, or when
 * the object cannot be allocated.
 * The associativity is derived as size / (sets * line size); it is reported
 * as 0 (unknown) when sets or line size is missing, or when sets is 1.
 */
static void
try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
                                    unsigned int level, hwloc_obj_cache_type_t ctype,
                                    uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
                                    hwloc_bitmap_t cpuset)
{
  struct hwloc_obj *c = NULL;
  hwloc_obj_type_t otype;

  if (0 == cache_size)
    return;

  otype = hwloc_cache_type_by_depth_type(level, ctype);
  if (otype == HWLOC_OBJ_TYPE_NONE)
    return;
  if (!hwloc_filter_check_keep_object_type(topology, otype))
    return;

  c = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
  if (!c)
    return; /* NOTE(review): other callers in this file test this result against NULL, do the same here */

  c->attr->cache.depth = level;
  c->attr->cache.linesize = cache_line_size;
  c->attr->cache.size = cache_size;
  c->attr->cache.type = ctype;
  if (cache_sets == 1)
    /* likely wrong, make it unknown */
    cache_sets = 0;
  if (cache_sets && cache_line_size) {
    /* multiply in 64 bits: a 32bit product could wrap around to 0 and
     * turn the division below into a division by zero */
    uint64_t set_bytes = (uint64_t) cache_sets * cache_line_size;
    c->attr->cache.associativity = (unsigned) (cache_size / set_bytes);
  } else {
    c->attr->cache.associativity = 0;
  }
  c->cpuset = hwloc_bitmap_dup(cpuset);
  hwloc_debug_2args_bitmap("cache (%s) depth %u has cpuset %s\n",
                           ctype == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (ctype == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
                           level, c->cpuset);
  hwloc_insert_object_by_cpuset(topology, c);
}
/* Read the cache properties stored in device-tree node directory cpu and
 * insert the matching cache object(s) at the given level for cpuset.
 *
 * Properties read (32bit big-endian, left at 0 when missing):
 *   d-cache-line-size, d-cache-size, d-cache-sets
 *   i-cache-line-size, i-cache-size, i-cache-sets
 * When a "cache-unified" entry exists in the node, a single unified cache is
 * built from the d-cache-* values. Otherwise an instruction cache (i-cache-*)
 * and a data cache (d-cache-*) are built.
 * Other properties (*-cache-block-size, *-tlb-*) are not looked at.
 */
static void
try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
                                   struct hwloc_linux_backend_data_s *data,
                                   const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
{
  uint32_t dline = 0, dsize = 0, dsets = 0;
  uint32_t iline = 0, isize = 0, isets = 0;
  char unified_path[1024];
  struct stat statbuf;
  int unified;

  /* the mere presence of cache-unified tells data and instruction caches are one */
  snprintf(unified_path, sizeof(unified_path), "%s/cache-unified", cpu);
  unified = (hwloc_stat(unified_path, &statbuf, data->root_fd) == 0);

  hwloc_read_unit32be(cpu, "d-cache-line-size", &dline, data->root_fd);
  hwloc_read_unit32be(cpu, "d-cache-size", &dsize, data->root_fd);
  hwloc_read_unit32be(cpu, "d-cache-sets", &dsets, data->root_fd);
  hwloc_read_unit32be(cpu, "i-cache-line-size", &iline, data->root_fd);
  hwloc_read_unit32be(cpu, "i-cache-size", &isize, data->root_fd);
  hwloc_read_unit32be(cpu, "i-cache-sets", &isets, data->root_fd);

  if (unified) {
    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_UNIFIED,
                                        dline, dsize, dsets, cpuset);
  } else {
    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
                                        iline, isize, isets, cpuset);
    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_DATA,
                                        dline, dsize, dsets, cpuset);
  }
}
/*
 * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
 * which provide NUMA nodes information without any details
 *
 * Walks the entries of /proc/device-tree/cpus:
 * - "cpu" nodes get a cpuset built from ibm,ppc-interrupt-server#s (or from
 *   "reg" when that property is missing), a Core object (unless filtered
 *   out) and their L1 cache(s);
 * - "cache" nodes are recorded, then turned into L2/L3/... caches once all
 *   nodes are known, by following the next-level-cache/phandle links.
 * Does nothing when the directory is missing or the architecture isn't Power.
 */
static void
look_powerpc_device_tree(struct hwloc_topology *topology,
                         struct hwloc_linux_backend_data_s *data)
{
  device_tree_cpus_t cpus;
  const char ofroot[] = "/proc/device-tree/cpus";
  unsigned int i;
  int root_fd = data->root_fd;
  DIR *dt = hwloc_opendir(ofroot, root_fd);
  struct dirent *dirent;

  if (NULL == dt)
    return;

  /* only works for Power so far, and not useful on ARM */
  if (data->arch != HWLOC_LINUX_ARCH_POWER) {
    closedir(dt);
    return;
  }

  cpus.n = 0;
  cpus.p = NULL;
  cpus.allocated = 0;

  while (NULL != (dirent = readdir(dt))) {
    char cpu[256];
    char *device_type;
    /* all three stay at (uint32_t)-1 when the property cannot be read */
    uint32_t regval = -1, l2_cache = -1, phandle = -1;
    int err;

    if ('.' == dirent->d_name[0])
      continue;

    err = snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
    if ((size_t) err >= sizeof(cpu))
      continue;

    device_type = hwloc_read_str(cpu, "device_type", root_fd);
    if (NULL == device_type)
      continue;

    hwloc_read_unit32be(cpu, "reg", &regval, root_fd);
    /* the link to the next cache level and the node handle exist under several names */
    if (hwloc_read_unit32be(cpu, "next-level-cache", &l2_cache, root_fd) == -1)
      hwloc_read_unit32be(cpu, "l2-cache", &l2_cache, root_fd);
    if (hwloc_read_unit32be(cpu, "phandle", &phandle, root_fd) == -1)
      if (hwloc_read_unit32be(cpu, "ibm,phandle", &phandle, root_fd) == -1)
        hwloc_read_unit32be(cpu, "linux,phandle", &phandle, root_fd);

    if (0 == strcmp(device_type, "cache")) {
      add_device_tree_cpus_node(&cpus, NULL, l2_cache, phandle, dirent->d_name);
    }
    else if (0 == strcmp(device_type, "cpu")) {
      /* Found CPU */
      hwloc_bitmap_t cpuset = NULL;
      size_t cb = 0;
      uint32_t *threads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s", &cb, root_fd);
      uint32_t nthreads = cb / sizeof(threads[0]);

      if (NULL != threads) {
        cpuset = hwloc_bitmap_alloc();
        for (i = 0; i < nthreads; ++i) {
          /* only keep threads that are known to the topology */
          if (hwloc_bitmap_isset(topology->levels[0][0]->complete_cpuset, ntohl(threads[i])))
            hwloc_bitmap_set(cpuset, ntohl(threads[i]));
        }
        free(threads);
      } else if ((unsigned int)-1 != regval) {
        /* Doesn't work on ARM because cpu "reg" do not start at 0.
         * We know the first cpu "reg" is the lowest. The others are likely
         * in order assuming the device-tree shows objects in order.
         */
        cpuset = hwloc_bitmap_alloc();
        hwloc_bitmap_set(cpuset, regval);
      }

      if (NULL == cpuset) {
        hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
      } else {
        struct hwloc_obj *core = NULL;
        add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);

        if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
          /* Add core */
          core = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, (unsigned) regval);
          /* NOTE(review): other callers in this file test this result against NULL, do the same here */
          if (core) {
            core->cpuset = hwloc_bitmap_dup(cpuset);
            hwloc_insert_object_by_cpuset(topology, core);
          }
        }

        /* Add L1 cache */
        try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);

        hwloc_bitmap_free(cpuset);
      }
    }
    free(device_type);
  }
  closedir(dt);

  /* No cores and L2 cache were found, exiting */
  if (0 == cpus.n) {
    hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot);
    return;
  }

#ifdef HWLOC_DEBUG
  for (i = 0; i < cpus.n; ++i) {
    hwloc_debug("%u: %s  ibm,phandle=%08X l2_cache=%08X ",
                i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
    if (NULL == cpus.p[i].cpuset) {
      hwloc_debug("%s\n", "no cpuset");
    } else {
      hwloc_debug_bitmap("cpuset %s\n", cpus.p[i].cpuset);
    }
  }
#endif

  /* Scan L2/L3/... caches */
  for (i = 0; i < cpus.n; ++i) {
    unsigned int level = 2;
    hwloc_bitmap_t cpuset;
    /* Skip real CPUs */
    if (NULL != cpus.p[i].cpuset)
      continue;

    /* Calculate cache level and CPU mask */
    cpuset = hwloc_bitmap_alloc();
    if (0 == look_powerpc_device_tree_discover_cache(&cpus,
                                                     cpus.p[i].phandle, &level, cpuset)) {
      char cpu[256];
      snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, cpus.p[i].name);
      try_add_cache_from_device_tree_cpu(topology, data, cpu, level, cpuset);
    }
    hwloc_bitmap_free(cpuset);
  }

  /* Do cleanup */
  for (i = 0; i < cpus.n; ++i) {
    hwloc_bitmap_free(cpus.p[i].cpuset);
    free(cpus.p[i].name);
  }
  free(cpus.p);
}
struct knl_hwdata {
char memory_mode[32];
char cluster_mode[32];
long long int mcdram_cache_size; /* mcdram_cache_* is valid only if size > 0 */
int mcdram_cache_associativity;
int mcdram_cache_inclusiveness;
int mcdram_cache_line_size;
};
/* Try to handle knl hwdata properties
 *
 * Fills hwdata either from the <dumped_hwdata_dirname>/knl_memoryside_cache
 * file or, when that file cannot be used, from a heuristic based on the
 * number and size of DDR and MCDRAM NUMA nodes.
 *
 * The HWLOC_KNL_HDH_FALLBACK environment variable controls the heuristic:
 * 1 forces it (the file is ignored), 0 disables it, anything else (or unset)
 * only uses it when the file cannot be read or has no valid version header.
 *
 * When the file is used but lacks some cache attributes, 0 is still returned
 * and hwdata->mcdram_cache_size is set to -1 to mark the cache as invalid.
 *
 * Returns 0 on success and -1 otherwise (heuristic disabled, or unable to
 * guess the configuration). */
static int hwloc_linux_try_handle_knl_hwdata_properties(struct hwloc_linux_backend_data_s *data,
                                                        struct knl_hwdata *hwdata,
                                                        unsigned DDR_nbnodes,
                                                        unsigned long DDR_numa_size,
                                                        unsigned MCDRAM_nbnodes,
                                                        unsigned long MCDRAM_numa_size)
{
  char *knl_cache_file;
  int version = 0;
  char buffer[512] = {0};
  char *data_beg = NULL;
  char * fallback_env = getenv("HWLOC_KNL_HDH_FALLBACK");
  int fallback = fallback_env ? atoi(fallback_env) : -1; /* by default, only fallback if needed */

  hwdata->memory_mode[0] = '\0';
  hwdata->cluster_mode[0] = '\0';
  hwdata->mcdram_cache_size = -1;
  hwdata->mcdram_cache_associativity = -1;
  hwdata->mcdram_cache_inclusiveness = -1;
  hwdata->mcdram_cache_line_size = -1;

  if (fallback == 1) {
    hwloc_debug("KNL dumped hwdata ignored, forcing fallback\n");
    goto fallback;
  }

  if (asprintf(&knl_cache_file, "%s/knl_memoryside_cache", data->dumped_hwdata_dirname) < 0)
    goto fallback;

  hwloc_debug("Reading knl cache data from: %s\n", knl_cache_file);
  if (hwloc_read_path_by_length(knl_cache_file, buffer, sizeof(buffer), data->root_fd) < 0) {
    hwloc_debug("Unable to open KNL data file `%s' (%s)\n", knl_cache_file, strerror(errno));
    free(knl_cache_file);
    goto fallback;
  }
  free(knl_cache_file);

  data_beg = &buffer[0];

  /* file must start with version information */
  if (sscanf(data_beg, "version: %d", &version) != 1) {
    fprintf(stderr, "Invalid knl_memoryside_cache header, expected \"version: <int>\".\n");
    goto fallback;
  }

  /* parse one "key: value" line at a time; an unterminated last line is ignored */
  while (1) {
    char *line_end = strstr(data_beg, "\n");
    if (!line_end)
      break;
    if (version >= 1) {
      /* compare the whole key including its colon, like the other keys below */
      if (!strncmp("cache_size:", data_beg, strlen("cache_size:"))) {
        sscanf(data_beg, "cache_size: %lld", &hwdata->mcdram_cache_size);
        hwloc_debug("read cache_size=%lld\n", hwdata->mcdram_cache_size);
      } else if (!strncmp("line_size:", data_beg, strlen("line_size:"))) {
        sscanf(data_beg, "line_size: %d", &hwdata->mcdram_cache_line_size);
        hwloc_debug("read line_size=%d\n", hwdata->mcdram_cache_line_size);
      } else if (!strncmp("inclusiveness:", data_beg, strlen("inclusiveness:"))) {
        sscanf(data_beg, "inclusiveness: %d", &hwdata->mcdram_cache_inclusiveness);
        hwloc_debug("read inclusiveness=%d\n", hwdata->mcdram_cache_inclusiveness);
      } else if (!strncmp("associativity:", data_beg, strlen("associativity:"))) {
        sscanf(data_beg, "associativity: %d\n", &hwdata->mcdram_cache_associativity);
        hwloc_debug("read associativity=%d\n", hwdata->mcdram_cache_associativity);
      }
    }
    if (version >= 2) {
      if (!strncmp("cluster_mode: ", data_beg, strlen("cluster_mode: "))) {
        size_t length;
        data_beg += strlen("cluster_mode: ");
        /* copy the rest of the line, truncated to the destination size */
        length = line_end-data_beg;
        if (length > sizeof(hwdata->cluster_mode)-1)
          length = sizeof(hwdata->cluster_mode)-1;
        memcpy(hwdata->cluster_mode, data_beg, length);
        hwdata->cluster_mode[length] = '\0';
        hwloc_debug("read cluster_mode=%s\n", hwdata->cluster_mode);
      } else if (!strncmp("memory_mode: ", data_beg, strlen("memory_mode: "))) {
        size_t length;
        data_beg += strlen("memory_mode: ");
        length = line_end-data_beg;
        if (length > sizeof(hwdata->memory_mode)-1)
          length = sizeof(hwdata->memory_mode)-1;
        memcpy(hwdata->memory_mode, data_beg, length);
        hwdata->memory_mode[length] = '\0';
        hwloc_debug("read memory_mode=%s\n", hwdata->memory_mode);
      }
    }

    data_beg = line_end + 1;
  }

  if (hwdata->mcdram_cache_size == -1
      || hwdata->mcdram_cache_line_size == -1
      || hwdata->mcdram_cache_associativity == -1
      || hwdata->mcdram_cache_inclusiveness == -1) {
    hwloc_debug("Incorrect file format cache_size=%lld line_size=%d associativity=%d inclusiveness=%d\n",
                hwdata->mcdram_cache_size,
                hwdata->mcdram_cache_line_size,
                hwdata->mcdram_cache_associativity,
                hwdata->mcdram_cache_inclusiveness);
    hwdata->mcdram_cache_size = -1; /* mark cache as invalid */
  }

  return 0;

 fallback:
  if (fallback == 0) {
    hwloc_debug("KNL hwdata fallback disabled\n");
    return -1;
  }

  hwloc_debug("Falling back to a heuristic\n");

  /* there can be 0 MCDRAM_nbnodes, but we must have at least one DDR node (not cpuless) */
  assert(DDR_nbnodes);
  /* there are either no MCDRAM nodes, or as many as DDR nodes */
  assert(!MCDRAM_nbnodes || MCDRAM_nbnodes == DDR_nbnodes);

  if (!MCDRAM_nbnodes && DDR_numa_size <= 16UL*1024*1024*1024) {
    /* We only found DDR numa nodes, but they are <=16GB.
     * It could be a DDR-less KNL where numa nodes are actually MCDRAM, we can't know for sure.
     * Both cases are unlikely, disable the heuristic for now.
     *
     * In theory we could check if DDR_numa_size == 8/12/16GB exactly (amount of MCDRAM numa size in H50/H25/Flat modes),
     * but that's never the case since some kilobytes are always stolen by the system.
     */
    hwloc_debug("Cannot guess if MCDRAM is in Cache or if the node is DDR-less (total NUMA node size %lu)\n",
                DDR_numa_size);
    return -1;
  }

  /* all commercial KNL/KNM have 16GB of MCDRAM */
  unsigned long total_cache_size = 16UL*1024*1024*1024 - MCDRAM_numa_size;

  if (!MCDRAM_nbnodes) {
    strcpy(hwdata->memory_mode, "Cache");
  } else {
    if (!total_cache_size)
      strcpy(hwdata->memory_mode, "Flat");
    else if (total_cache_size == 8UL*1024*1024*1024)
      strcpy(hwdata->memory_mode, "Hybrid50");
    else if (total_cache_size == 4UL*1024*1024*1024)
      strcpy(hwdata->memory_mode, "Hybrid25");
    else
      fprintf(stderr, "Unexpected KNL MCDRAM cache size %lu\n", total_cache_size);
  }
  if (DDR_nbnodes == 4) {
    strcpy(hwdata->cluster_mode, "SNC4");
  } else if (DDR_nbnodes == 2) {
    strcpy(hwdata->cluster_mode, "SNC2");
  } else if (DDR_nbnodes == 1) {
    /* either Quadrant, All2ALL or Hemisphere */
  } else {
    fprintf(stderr, "Unexpected number of KNL non-MCDRAM NUMA nodes %u\n", DDR_nbnodes);
  }

  /* the cache is evenly split between DDR nodes */
  hwdata->mcdram_cache_size = total_cache_size/DDR_nbnodes;
  hwdata->mcdram_cache_associativity = 1;
  hwdata->mcdram_cache_inclusiveness = 1;
  hwdata->mcdram_cache_line_size = 64;

  return 0;
}
/**************************************
****** Sysfs Topology Discovery ******
**************************************/
/* List the NUMA nodes found as "node<N>" entries in sysfs directory path.
 *
 * Returns a newly allocated array of the distinct OS indexes in increasing
 * order and stores its length in *nbnodesp. The caller must free() the
 * array.
 * Returns NULL (and leaves *nbnodesp untouched) if the directory cannot be
 * opened or memory cannot be allocated.
 */
static unsigned *
list_sysfsnode(struct hwloc_linux_backend_data_s *data,
               const char *path,
               unsigned *nbnodesp)
{
  DIR *dir;
  unsigned osnode, nbnodes;
  unsigned *indexes, index_;
  hwloc_bitmap_t nodeset;
  struct dirent *dirent;
  int weight;

  /* Get the list of nodes first */
  dir = hwloc_opendir(path, data->root_fd);
  if (!dir)
    return NULL;

  nodeset = hwloc_bitmap_alloc();
  if (!nodeset) {
    closedir(dir);
    return NULL;
  }

  while ((dirent = readdir(dir)) != NULL) {
    if (strncmp(dirent->d_name, "node", 4))
      continue;
    osnode = strtoul(dirent->d_name+4, NULL, 0);
    hwloc_bitmap_set(nodeset, osnode);
  }
  closedir(dir);

  /* count distinct indexes rather than directory entries, so that entries
   * parsing to the same index cannot leave bogus trailing slots in the array */
  weight = hwloc_bitmap_weight(nodeset);
  nbnodes = weight > 0 ? (unsigned) weight : 0;

  indexes = calloc(nbnodes, sizeof(*indexes));
  if (!indexes) {
    hwloc_bitmap_free(nodeset);
    return NULL;
  }

  /* we don't know if sysfs returns nodes in order, we can't merge these loops */

  /* Unsparsify node indexes.
   * We'll need them later because Linux groups sparse distances
   * and keeps them in order in the sysfs distance files.
   * It'll simplify things in the meantime.
   */
  index_ = 0;
  hwloc_bitmap_foreach_begin (osnode, nodeset) {
    indexes[index_] = osnode;
    index_++;
  } hwloc_bitmap_foreach_end();

  hwloc_bitmap_free(nodeset);

#ifdef HWLOC_DEBUG
  hwloc_debug("%s", "NUMA indexes: ");
  for (index_ = 0; index_ < nbnodes; index_++)
    hwloc_debug(" %u", indexes[index_]);
  hwloc_debug("%s", "\n");
#endif

  *nbnodesp = nbnodes;
  return indexes;
}
/* Discover NUMA nodes below sysfs directory path and insert them in the
 * topology.
 *
 * For each "node<N>" entry, an existing NUMA object with that OS index is
 * annotated, or a new one is created from the node cpumap. Memory attributes
 * are read for every node. Unless some node failed or there is a single
 * node, the distance matrix is read as well and registered once nodes are
 * inserted.
 *
 * On KNL, MCDRAM (CPU-less) nodes are tagged, a memory-side cache may be
 * added above DDR nodes and, unless disabled with HWLOC_KNL_NUMA_QUIRK=0,
 * each MCDRAM node is grouped with its closest DDR node in a "Cluster"
 * Group, in which case the distance matrix is dropped.
 *
 * *found receives the number of nodes that were successfully handled.
 * It is not modified when the node list cannot be built.
 * Always returns 0.
 */
static int
look_sysfsnode(struct hwloc_topology *topology,
               struct hwloc_linux_backend_data_s *data,
               const char *path, unsigned *found)
{
  unsigned osnode;
  unsigned nbnodes;
  hwloc_obj_t * nodes;
  unsigned *indexes;
  uint64_t * distances;
  hwloc_bitmap_t nodes_cpuset;
  struct knl_hwdata knl_hwdata;
  int failednodes = 0;
  unsigned i;
  /* KNL cluster bookkeeping: created Groups, and for each node (by position
   * in nodes[]) the index of its Group in knl_clusters[], or -1 */
  unsigned nr_knl_clusters = 0;
  hwloc_obj_t knl_clusters[4]= { NULL, NULL, NULL, NULL };
  int node_knl_cluster[8] = { -1, -1, -1, -1, -1, -1, -1, -1};
  const unsigned max_knl_clusters = sizeof(knl_clusters)/sizeof(knl_clusters[0]);
  const unsigned max_knl_nodes = sizeof(node_knl_cluster)/sizeof(node_knl_cluster[0]);

  /* NUMA nodes cannot be filtered out */
  indexes = list_sysfsnode(data, path, &nbnodes);
  if (!indexes)
    return 0;

  nodes = calloc(nbnodes,
                 sizeof(hwloc_obj_t));
  distances = malloc(nbnodes*nbnodes*sizeof(*distances));
  nodes_cpuset  = hwloc_bitmap_alloc();
  if (NULL == nodes_cpuset || NULL == nodes || NULL == distances) {
    free(nodes);
    free(indexes);
    free(distances);
    hwloc_bitmap_free(nodes_cpuset);
    nbnodes = 0;
    goto out;
  }

  /* Create NUMA objects */
  for (i = 0; i < nbnodes; i++) {
    hwloc_obj_t node;
    int annotate;

    osnode = indexes[i];

    node = hwloc_get_numanode_obj_by_os_index(topology, osnode);
    annotate = (node != NULL);
    if (!annotate) {
      /* create a new node */
      char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
      hwloc_bitmap_t cpuset;
      sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
      cpuset = hwloc__alloc_read_path_as_cpumask(nodepath, data->root_fd);
      if (!cpuset) {
        /* This NUMA object won't be inserted, we'll ignore distances */
        failednodes++;
        continue;
      }
      if (hwloc_bitmap_intersects(nodes_cpuset, cpuset)) {
        /* crazy BIOS with overlapping NUMA node cpusets, impossible on Linux so far */
        hwloc_debug_1arg_bitmap("node P#%u cpuset %s intersects with previous nodes, ignoring that node.\n", osnode, cpuset);
        hwloc_bitmap_free(cpuset);
        failednodes++;
        continue;
      } else {
        hwloc_bitmap_or(nodes_cpuset, nodes_cpuset, cpuset);
      }

      node = hwloc_alloc_setup_object(topology, HWLOC_OBJ_NUMANODE, osnode);
      if (!node) {
        /* NOTE(review): other callers in this file test this result against NULL;
         * treat the node as failed rather than dereferencing it */
        hwloc_bitmap_free(cpuset);
        failednodes++;
        continue;
      }
      node->cpuset = cpuset;
      node->nodeset = hwloc_bitmap_alloc();
      hwloc_bitmap_set(node->nodeset, osnode);
    }
    hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->attr->numanode);

    nodes[i] = node;
    hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
                            osnode, node->cpuset);
  }
  topology->support.discovery->numa = 1;
  topology->support.discovery->numa_memory = 1;

  hwloc_bitmap_free(nodes_cpuset);

  if (failednodes || nbnodes <= 1) {
    /* failed to read/create some nodes, don't bother reading/fixing
     * a distance matrix that would likely be wrong anyway.
     */
    free(distances);
    distances = NULL;
  }

  if (distances && hwloc_parse_nodes_distances(path, nbnodes, indexes, distances, data->root_fd) < 0) {
    free(distances);
    distances = NULL;
  }

  free(indexes);

  /* all nodes[] entries are non-NULL below since no node failed */
  if (data->is_knl && !failednodes) {
    char *env = getenv("HWLOC_KNL_NUMA_QUIRK");
    int noquirk = (env && !atoi(env)) || !distances || !hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP);
    int mscache;
    unsigned j, closest;
    unsigned long MCDRAM_numa_size, DDR_numa_size;
    unsigned MCDRAM_nbnodes, DDR_nbnodes;

    /* nodes without CPUs are MCDRAM, the others are DDR */
    DDR_numa_size = 0;
    DDR_nbnodes = 0;
    MCDRAM_numa_size = 0;
    MCDRAM_nbnodes = 0;
    for(i=0; i<nbnodes; i++)
      if (hwloc_bitmap_iszero(nodes[i]->cpuset)) {
        MCDRAM_numa_size += nodes[i]->attr->numanode.local_memory;
        MCDRAM_nbnodes++;
      } else {
        DDR_numa_size += nodes[i]->attr->numanode.local_memory;
        DDR_nbnodes++;
      }
    assert(DDR_nbnodes + MCDRAM_nbnodes == nbnodes);

    hwloc_linux_try_handle_knl_hwdata_properties(data, &knl_hwdata,
                                                 DDR_nbnodes, DDR_numa_size,
                                                 MCDRAM_nbnodes, MCDRAM_numa_size);
    mscache = knl_hwdata.mcdram_cache_size > 0 && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_L3CACHE);

    if (knl_hwdata.cluster_mode[0])
      hwloc_obj_add_info(topology->levels[0][0], "ClusterMode", knl_hwdata.cluster_mode);
    if (knl_hwdata.memory_mode[0])
      hwloc_obj_add_info(topology->levels[0][0], "MemoryMode", knl_hwdata.memory_mode);

    for(i=0; i<nbnodes; i++) {
      if (!hwloc_bitmap_iszero(nodes[i]->cpuset)) {
        /* DDR, see if there's a MCDRAM cache to add */
        if (mscache) {
          hwloc_obj_t cache = hwloc_alloc_setup_object(topology, HWLOC_OBJ_L3CACHE, HWLOC_UNKNOWN_INDEX);
          if (cache) {
            cache->attr->cache.depth = 3;
            cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
            cache->attr->cache.size = knl_hwdata.mcdram_cache_size;
            cache->attr->cache.linesize = knl_hwdata.mcdram_cache_line_size;
            cache->attr->cache.associativity = knl_hwdata.mcdram_cache_associativity;
            hwloc_obj_add_info(cache, "Inclusive", knl_hwdata.mcdram_cache_inclusiveness ? "1" : "0");
            cache->cpuset = hwloc_bitmap_dup(nodes[i]->cpuset);
            cache->nodeset = hwloc_bitmap_dup(nodes[i]->nodeset); /* only applies to DDR */
            cache->subtype = strdup("MemorySideCache");
            hwloc_insert_object_by_cpuset(topology, cache);
          }
        }
        /* nothing else to do for DDR */
        continue;
      }

      /* MCDRAM */
      nodes[i]->subtype = strdup("MCDRAM");

      if (noquirk)
        continue;

      /* DDR is the closest node with CPUs */
      closest = (unsigned)-1;
      for(j=0; j<nbnodes; j++) {
        if (j==i)
          continue;
        if (hwloc_bitmap_iszero(nodes[j]->cpuset))
          /* nodes without CPU, that's another MCDRAM, skip it */
          continue;
        if (closest == (unsigned)-1 || distances[i*nbnodes+j]<distances[i*nbnodes+closest])
          closest = j;
      }
      if (closest != (unsigned) -1) {
        /* Change MCDRAM cpuset to DDR cpuset for clarity.
         * Not actually useful if we insert with hwloc__attach_memory_object() below.
         * The cpuset will be updated by the core later anyway.
         */
        hwloc_bitmap_copy(nodes[i]->cpuset, nodes[closest]->cpuset);
        /* Add a Group for Cluster containing this MCDRAM + DDR,
         * unless that would overflow the fixed-size bookkeeping arrays
         * (the node is then inserted by cpuset like any other node).
         */
        if (nr_knl_clusters < max_knl_clusters
            && i < max_knl_nodes && closest < max_knl_nodes) {
          hwloc_obj_t cluster = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, HWLOC_UNKNOWN_INDEX);
          if (cluster) {
            hwloc_obj_add_other_obj_sets(cluster, nodes[i]);
            hwloc_obj_add_other_obj_sets(cluster, nodes[closest]);
            cluster->subtype = strdup("Cluster");
            cluster->attr->group.kind = HWLOC_GROUP_KIND_INTEL_KNL_SUBNUMA_CLUSTER;
            knl_clusters[nr_knl_clusters] = cluster;
            node_knl_cluster[i] = nr_knl_clusters;
            node_knl_cluster[closest] = nr_knl_clusters;
            nr_knl_clusters++;
          }
        }
      }
    }
    if (!noquirk) {
      /* drop the distance matrix, it contradicts the above NUMA layout groups */
      free(distances);
      distances = NULL;
    }
  }

  /* everything is ready for insertion now */

  /* insert knl clusters */
  if (data->is_knl) {
    for(i=0; i<nr_knl_clusters; i++) {
      knl_clusters[i] = hwloc_insert_object_by_cpuset(topology, knl_clusters[i]);
      /* failure or replace can be ignored */
    }
  }

  /* insert actual numa nodes */
  for (i = 0; i < nbnodes; i++) {
    hwloc_obj_t node = nodes[i];
    if (node) {
      hwloc_obj_t res_obj;
      /* only the first max_knl_nodes nodes have a slot in node_knl_cluster[] */
      if (data->is_knl && i < max_knl_nodes && node_knl_cluster[i] != -1) {
        /* directly attach to the existing cluster */
        hwloc_obj_t parent = knl_clusters[node_knl_cluster[i]];
        res_obj = hwloc__attach_memory_object(topology, parent, node, hwloc_report_os_error);
      } else {
        /* we don't know where to attach, let the core find or insert if needed */
        res_obj = hwloc__insert_object_by_cpuset(topology, NULL, node, hwloc_report_os_error);
      }
      if (res_obj != node)
        /* This NUMA node got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
         * This object disappeared, we'll ignore distances */
        failednodes++;
    }
  }

  if (failednodes) {
    free(distances);
    distances = NULL;
  }

  /* Inserted distances now that nodes are properly inserted */
  if (distances)
    hwloc_internal_distances_add(topology, nbnodes, nodes, distances,
                                 HWLOC_DISTANCES_KIND_FROM_OS|HWLOC_DISTANCES_KIND_MEANS_LATENCY,
                                 HWLOC_DISTANCES_ADD_FLAG_GROUP);
  else
    free(nodes);

 out:
  *found = nbnodes - failednodes;
  return 0;
}
/* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
static int
look_sysfscpu(struct hwloc_topology *topology,
struct hwloc_linux_backend_data_s *data,
const char *path,
struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
{
hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
#define CPU_TOPOLOGY_STR_LEN 128
char str[CPU_TOPOLOGY_STR_LEN];
DIR *dir;
int i,j;
unsigned caches_added, merge_buggy_core_siblings;
hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
int threadwithcoreid = data->is_amd_with_CU ? -1 : 0; /* -1 means we don't know yet if threads have their own coreids within thread_siblings */
/* fill the cpuset of interesting cpus */
dir = hwloc_opendir(path, data->root_fd);
if (!dir)
return -1;
else {
struct dirent *dirent;
cpuset = hwloc_bitmap_alloc();
while ((dirent = readdir(dir)) != NULL) {
unsigned long cpu;
char online[2];
if (strncmp(dirent->d_name, "cpu", 3))
continue;
cpu = strtoul(dirent->d_name+3, NULL, 0);
/* Maybe we don't have topology information but at least it exists */
hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
/* check whether this processor is online */
sprintf(str, "%s/cpu%lu/online", path, cpu);
if (hwloc_read_path_by_length(str, online, sizeof(online), data->root_fd) == 0) {
if (!atoi(online)) {
hwloc_debug("os proc %lu is offline\n", cpu);
continue;
}
}
/* check whether the kernel exports topology information for this cpu */
sprintf(str, "%s/cpu%lu/topology", path, cpu);
if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
cpu, path, cpu);
continue;
}
hwloc_bitmap_set(cpuset, cpu);
}
closedir(dir);
}
topology->support.discovery->pu = 1;
hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
hwloc_bitmap_weight(cpuset), cpuset);
merge_buggy_core_siblings = (data->arch == HWLOC_LINUX_ARCH_X86);
caches_added = 0;
hwloc_bitmap_foreach_begin(i, cpuset) {
hwloc_bitmap_t packageset, coreset, bookset, threadset;
int tmpint;
if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
/* look at the package */
sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
packageset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
if (packageset) {
hwloc_bitmap_and(packageset, packageset, cpuset);
if (hwloc_bitmap_first(packageset) == i) {
/* first cpu in this package, add the package */
struct hwloc_obj *package;
unsigned mypackageid;
mypackageid = (unsigned) -1;
sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i); /* contains %d at least up to 4.9 */
if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
mypackageid = (unsigned) tmpint;
if (merge_buggy_core_siblings) {
/* check for another package with same physical_package_id */
hwloc_obj_t curpackage = packages;
while (curpackage) {
if (curpackage->os_index == mypackageid) {
/* found another package with same physical_package_id but different core_siblings.
* looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
* merge these core_siblings to extend the existing first package object.
*/
static int reported = 0;
if (!reported && !hwloc_hide_errors()) {
char *a, *b;
hwloc_bitmap_asprintf(&a, curpackage->cpuset);
hwloc_bitmap_asprintf(&b, packageset);
fprintf(stderr, "****************************************************************************\n");
fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
mypackageid, a, b);
fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
fprintf(stderr, "* does not support this processor correctly.\n");
fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
fprintf(stderr, "*\n");
fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
fprintf(stderr, "* along with the files generated by the hwloc-gather-topology script.\n");
fprintf(stderr, "****************************************************************************\n");
reported = 1;
free(a);
free(b);
}
hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
goto package_done;
}
curpackage = curpackage->next_cousin;
}
}
/* no package with same physical_package_id, create a new one */
package = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, mypackageid);
package->cpuset = packageset;
hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
mypackageid, packageset);
/* add cpuinfo */
if (cpuinfo_Lprocs) {
for(j=0; j<(int) cpuinfo_numprocs; j++)
if ((int) cpuinfo_Lprocs[j].Pproc == i) {
hwloc__move_infos(&package->infos, &package->infos_count,
&cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
}
}
/* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
* we'll actually insert the tree at the end of the entire sysfs cpu loop.
*/
package->next_cousin = packages;
packages = package;
packageset = NULL; /* don't free it */
}
package_done:
hwloc_bitmap_free(packageset);
}
}
if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
/* look at the core */
sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
coreset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
if (coreset) {
unsigned mycoreid = (unsigned) -1;
int gotcoreid = 0; /* to avoid reading the coreid twice */
hwloc_bitmap_and(coreset, coreset, cpuset);
if (hwloc_bitmap_weight(coreset) > 1 && threadwithcoreid == -1) {
/* check if this is hyper-threading or different coreids */
unsigned siblingid, siblingcoreid;
mycoreid = (unsigned) -1;
sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
mycoreid = (unsigned) tmpint;
gotcoreid = 1;
siblingid = hwloc_bitmap_first(coreset);
if (siblingid == (unsigned) i)
siblingid = hwloc_bitmap_next(coreset, i);
siblingcoreid = (unsigned) -1;
sprintf(str, "%s/cpu%u/topology/core_id", path, siblingid); /* contains %d at least up to 4.9 */
if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
siblingcoreid = (unsigned) tmpint;
threadwithcoreid = (siblingcoreid != mycoreid);
}
if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
/* regular core */
struct hwloc_obj *core;
if (!gotcoreid) {
mycoreid = (unsigned) -1;
sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
mycoreid = (unsigned) tmpint;
}
core = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, mycoreid);
if (threadwithcoreid)
/* amd multicore compute-unit, create one core per thread */
hwloc_bitmap_only(coreset, i);
core->cpuset = coreset;
hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
mycoreid, core->cpuset);
hwloc_insert_object_by_cpuset(topology, core);
coreset = NULL; /* don't free it */
}
hwloc_bitmap_free(coreset);
}
}
if (hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_GROUP)) {
/* look at the books */
sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
bookset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
if (bookset) {
hwloc_bitmap_and(bookset, bookset, cpuset);
if (hwloc_bitmap_first(bookset) == i) {
struct hwloc_obj *book;
unsigned mybookid;
mybookid = (unsigned) -1;
sprintf(str, "%s/cpu%d/topology/book_id", path, i); /* contains %d at least up to 4.9 */
if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0) {
mybookid = (unsigned) tmpint;
book = hwloc_alloc_setup_object(topology, HWLOC_OBJ_GROUP, mybookid);
book->cpuset = bookset;
hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
mybookid, bookset);
book->subtype = strdup("Book");
book->attr->group.kind = HWLOC_GROUP_KIND_S390_BOOK;
hwloc_insert_object_by_cpuset(topology, book);
bookset = NULL; /* don't free it */
}
}
hwloc_bitmap_free(bookset);
}
}
/* PU cannot be filtered-out */
{
/* look at the thread */
struct hwloc_obj *thread = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, (unsigned) i);
threadset = hwloc_bitmap_alloc();
hwloc_bitmap_only(threadset, i);
thread->cpuset = threadset;
hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
i, threadset);
hwloc_insert_object_by_cpuset(topology, thread);
}
/* look at the caches */
for(j=0; j<10; j++) {
char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
hwloc_bitmap_t cacheset;
sprintf(str, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
cacheset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
if (cacheset) {
if (hwloc_bitmap_iszero(cacheset)) {
/* ia64 returning empty L3 and L2i? use the core set instead */
hwloc_bitmap_t tmpset;
sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
tmpset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
/* only use it if we actually got something */
if (tmpset) {
hwloc_bitmap_free(cacheset);
cacheset = tmpset;
}
}
hwloc_bitmap_and(cacheset, cacheset, cpuset);
if (hwloc_bitmap_first(cacheset) == i) {
unsigned kB;
unsigned linesize;
unsigned sets, lines_per_tag;
unsigned depth; /* 1 for L1, .... */
hwloc_obj_cache_type_t ctype = HWLOC_OBJ_CACHE_UNIFIED; /* default */
hwloc_obj_type_t otype;
struct hwloc_obj *cache;
/* get the cache level depth */
sprintf(str, "%s/cpu%d/cache/index%d/level", path, i, j); /* contains %u at least up to 4.9 */
if (hwloc_read_path_as_uint(str, &depth, data->root_fd) < 0) {
hwloc_bitmap_free(cacheset);
continue;
}
/* cache type */
sprintf(str, "%s/cpu%d/cache/index%d/type", path, i, j);
if (hwloc_read_path_by_length(str, str2, sizeof(str2), data->root_fd) == 0) {
if (!strncmp(str2, "Data", 4))
ctype = HWLOC_OBJ_CACHE_DATA;
else if (!strncmp(str2, "Unified", 7))
ctype = HWLOC_OBJ_CACHE_UNIFIED;
else if (!strncmp(str2, "Instruction", 11))
ctype = HWLOC_OBJ_CACHE_INSTRUCTION;
}
otype = hwloc_cache_type_by_depth_type(depth, ctype);
if (otype == HWLOC_OBJ_TYPE_NONE
|| !hwloc_filter_check_keep_object_type(topology, otype)) {
hwloc_bitmap_free(cacheset);
continue;
}
/* FIXME: if Bulldozer/Piledriver, add compute unit Groups when L2/L1i filtered-out */
/* FIXME: if KNL, add tile Groups when L2/L1i filtered-out */
/* get the cache size */
kB = 0;
sprintf(str, "%s/cpu%d/cache/index%d/size", path, i, j); /* contains %uK at least up to 4.9 */
hwloc_read_path_as_uint(str, &kB, data->root_fd);
/* KNL reports L3 with size=0 and full cpuset in cpuid.
* Let hwloc_linux_try_add_knl_mcdram_cache() detect it better.
*/
if (!kB && otype == HWLOC_OBJ_L3CACHE && data->is_knl) {
hwloc_bitmap_free(cacheset);
continue;
}
/* get the line size */
linesize = 0;
sprintf(str, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j); /* contains %u at least up to 4.9 */
hwloc_read_path_as_uint(str, &linesize, data->root_fd);
/* get the number of sets and lines per tag.
* don't take the associativity directly in "ways_of_associativity" because
* some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
*/
sets = 0;
sprintf(str, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j); /* contains %u at least up to 4.9 */
hwloc_read_path_as_uint(str, &sets, data->root_fd);
lines_per_tag = 1;
sprintf(str, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j); /* contains %u at least up to 4.9 */
hwloc_read_path_as_uint(str, &lines_per_tag, data->root_fd);
/* first cpu in this cache, add the cache */
cache = hwloc_alloc_setup_object(topology, otype, HWLOC_UNKNOWN_INDEX);
cache->attr->cache.size = ((uint64_t)kB) << 10;
cache->attr->cache.depth = depth;
cache->attr->cache.linesize = linesize;
cache->attr->cache.type = ctype;
if (!linesize || !lines_per_tag || !sets)
cache->attr->cache.associativity = 0; /* unknown */
else if (sets == 1)
cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
else
cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
cache->cpuset = cacheset;
hwloc_debug_1arg_bitmap("cache depth %u has cpuset %s\n",
depth, cacheset);
hwloc_insert_object_by_cpuset(topology, cache);
cacheset = NULL; /* don't free it */
++caches_added;
}
}
hwloc_bitmap_free(cacheset);
}
} hwloc_bitmap_foreach_end();
/* actually insert in the tree now that package cpusets have been fixed-up */
while (packages) {
hwloc_obj_t next = packages->next_cousin;
packages->next_cousin = NULL;
hwloc_insert_object_by_cpuset(topology, packages);
packages = next;
}
if (0 == caches_added)
look_powerpc_device_tree(topology, data);
hwloc_bitmap_free(cpuset);
return 0;
}
/****************************************
****** cpuinfo Topology Discovery ******
****************************************/
/* Convert one x86 /proc/cpuinfo "prefix: value" pair into a hwloc info entry.
 * Only the fields listed in the table below are recognized; anything else,
 * and recognized fields with an empty value, are silently ignored.
 * Always returns 0.
 */
static int
hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
			      struct hwloc_info_s **infos, unsigned *infos_count,
			      int is_global __hwloc_attribute_unused)
{
  static const struct {
    const char *field; /* field name in /proc/cpuinfo */
    const char *name;  /* name of the info entry to add */
  } known[] = {
    { "vendor_id",  "CPUVendor" },
    { "model name", "CPUModel" },
    { "model",      "CPUModelNumber" },
    { "cpu family", "CPUFamilyNumber" },
    { "stepping",   "CPUStepping" }
  };
  unsigned k;

  for (k = 0; k < sizeof(known)/sizeof(known[0]); k++) {
    if (strcmp(known[k].field, prefix))
      continue;
    /* exact match: at most one entry can apply */
    if (value[0])
      hwloc__add_info(infos, infos_count, known[k].name, value);
    break;
  }
  return 0;
}
/* Convert one ia64 /proc/cpuinfo "prefix: value" pair into a hwloc info entry.
 * Unrecognized fields and empty values are ignored. Always returns 0.
 */
static int
hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
			       struct hwloc_info_s **infos, unsigned *infos_count,
			       int is_global __hwloc_attribute_unused)
{
  static const struct {
    const char *field; /* field name in /proc/cpuinfo */
    const char *name;  /* name of the info entry to add */
  } known[] = {
    { "vendor",     "CPUVendor" },
    { "model name", "CPUModel" },
    { "model",      "CPUModelNumber" },
    { "family",     "CPUFamilyNumber" }
  };
  unsigned k;

  for (k = 0; k < sizeof(known)/sizeof(known[0]); k++) {
    if (strcmp(known[k].field, prefix))
      continue;
    /* exact match: at most one entry can apply */
    if (value[0])
      hwloc__add_info(infos, infos_count, known[k].name, value);
    break;
  }
  return 0;
}
/* Convert one ARM /proc/cpuinfo "prefix: value" pair into a hwloc info entry.
 * Unrecognized fields and empty values are ignored. Always returns 0.
 */
static int
hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
			      struct hwloc_info_s **infos, unsigned *infos_count,
			      int is_global __hwloc_attribute_unused)
{
  static const struct {
    const char *field; /* field name in /proc/cpuinfo */
    const char *name;  /* name of the info entry to add */
  } known[] = {
    { "Processor",        "CPUModel" }, /* old kernels with one Processor header */
    { "model name",       "CPUModel" }, /* new kernels with one model name per core */
    { "CPU implementer",  "CPUImplementer" },
    { "CPU architecture", "CPUArchitecture" },
    { "CPU variant",      "CPUVariant" },
    { "CPU part",         "CPUPart" },
    { "CPU revision",     "CPURevision" },
    { "Hardware",         "HardwareName" },
    { "Revision",         "HardwareRevision" },
    { "Serial",           "HardwareSerial" }
  };
  unsigned k;

  for (k = 0; k < sizeof(known)/sizeof(known[0]); k++) {
    if (strcmp(known[k].field, prefix))
      continue;
    /* exact match: at most one entry can apply */
    if (value[0])
      hwloc__add_info(infos, infos_count, known[k].name, value);
    break;
  }
  return 0;
}
/* Convert one POWER /proc/cpuinfo "prefix: value" pair into a hwloc info entry.
 *
 * is_global selects how a revision line is named: "PlatformRevision" when
 * non-zero, "CPURevision" otherwise.
 * Unrecognized fields and empty values are ignored. Always returns 0.
 */
static int
hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
			      struct hwloc_info_s **infos, unsigned *infos_count,
			      int is_global)
{
  const char *name = NULL; /* info name selected for this prefix, if any */
  int nodup = 0;           /* add with hwloc__add_info_nodup(..., 1) instead of hwloc__add_info() */

  /* common fields */
  if (!strcmp(prefix, "cpu"))
    name = "CPUModel";
  else if (!strcmp(prefix, "platform"))
    name = "PlatformName";
  else if (!strcmp(prefix, "model"))
    name = "PlatformModel";
  /* platform-specific fields */
  else if (!strcasecmp(prefix, "vendor"))
    name = "PlatformVendor";
  else if (!strcmp(prefix, "Board ID"))
    name = "PlatformBoardID";
  else if (!strcmp(prefix, "Board") || !strcasecmp(prefix, "Machine")) {
    /* machine and board are similar (and often more precise) than model above */
    name = "PlatformModel";
    nodup = 1;
  } else if (!strcasecmp(prefix, "Revision") || !strcmp(prefix, "Hardware rev"))
    name = is_global ? "PlatformRevision" : "CPURevision";
  else if (!strcmp(prefix, "SVR"))
    name = "SystemVersionRegister";
  else if (!strcmp(prefix, "PVR"))
    name = "ProcessorVersionRegister";
  /* don't match 'board*' because there's also "board l2" on some platforms */

  if (name && value[0]) {
    if (nodup)
      hwloc__add_info_nodup(infos, infos_count, name, value, 1);
    else
      hwloc__add_info(infos, infos_count, name, value);
  }
  return 0;
}
/*
* avr32: "chip type\t:" => OK
* blackfin: "model name\t:" => OK
* h8300: "CPU:" => OK
* m68k: "CPU:" => OK
* mips: "cpu model\t\t:" => OK
* openrisc: "CPU:" => OK
* sparc: "cpu\t\t:" => OK
* tile: "model name\t:" => OK
* unicore32: "Processor\t:" => OK
* alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:" => "cpu" overwritten by "cpu model", no processor indexes
* cris: "cpu\t\t:" + "cpu model\t:" => only "cpu"
* frv: "CPU-Core:" + "CPU:" => only "CPU"
* mn10300: "cpu core :" + "model name :" => only "model name"
* parisc: "cpu family\t:" + "cpu\t\t:" => only "cpu"
*
* not supported because of conflicts with other arch minor lines:
* m32r: "cpu family\t:" => KO (adding "cpu family" would break "blackfin")
* microblaze: "CPU-Family:" => KO
* sh: "cpu family\t:" + "cpu type\t:" => KO
* xtensa: "model\t\t:" => KO
*/
/* Fallback /proc/cpuinfo line parser for architectures without a dedicated one.
 * Any of the known "model-like" fields is stored as "CPUModel".
 * Unrecognized fields and empty values are ignored. Always returns 0.
 */
static int
hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
				  struct hwloc_info_s **infos, unsigned *infos_count,
				  int is_global __hwloc_attribute_unused)
{
  /* fields that must match exactly */
  static const char * const exact[] = {
    "model name", "Processor", "chip type", "cpu model"
  };
  int is_model = 0;
  unsigned k;

  for (k = 0; k < sizeof(exact)/sizeof(exact[0]) && !is_model; k++)
    is_model = !strcmp(exact[k], prefix);
  /* "cpu" is matched whatever its case ("cpu", "CPU", ...) */
  if (!is_model)
    is_model = !strcasecmp("cpu", prefix);

  /* keep the last one, assume it's more precise than the first one.
   * we should have the Architecture keypair for basic information anyway.
   */
  if (is_model && value[0])
    hwloc__add_info_nodup(infos, infos_count, "CPUModel", value, 1);
  return 0;
}
/* Parse the /proc/cpuinfo-like file at path (opened relative to data->root_fd).
 *
 * On success, returns the number of "processor" entries found (possibly 0)
 * and stores the array of per-processor entries in *Lprocs_p (NULL when no
 * entry was found). The array and the infos attached to its entries may be
 * released with hwloc_linux_free_cpuinfo().
 *
 * On error (file cannot be opened, invalid number, allocation failure),
 * returns -1 and sets *Lprocs_p to NULL; per-processor infos gathered so far
 * are released.
 *
 * Key/value lines found outside of any processor section are passed to the
 * architecture-specific parser with global_infos/global_infos_count. These
 * are never released here, even on error.
 */
static int
hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
			  const char *path,
			  struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
			  struct hwloc_info_s **global_infos, unsigned *global_infos_count)
{
  FILE *fd;
  char str[128]; /* vendor/model can be very long */
  char *endptr;
  unsigned allocated_Lprocs = 0;
  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
  unsigned numprocs = 0;
  unsigned i;
  int curproc = -1; /* index in Lprocs of the current processor section, -1 when outside of any section */
  int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_info_s **, unsigned *, int) = NULL;

  if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
    {
      hwloc_debug("could not open %s\n", path);
      *Lprocs_p = NULL;
      return -1;
    }

  /* architecture specific or default routine for parsing cpumodel,
   * it does not depend on the line being parsed.
   */
  switch (data->arch) {
  case HWLOC_LINUX_ARCH_X86:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
    break;
  case HWLOC_LINUX_ARCH_ARM:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
    break;
  case HWLOC_LINUX_ARCH_POWER:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
    break;
  case HWLOC_LINUX_ARCH_IA64:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
    break;
  default:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
  }

# define PROCESSOR "processor"
# define PACKAGEID "physical id" /* the longest one */
# define COREID "core id"
  hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
  while (fgets(str, (int) sizeof(str), fd)!=NULL) {
    unsigned long Ppkg, Pcore, Pproc;
    char *end, *dot, *prefix, *value;
    int noend = 0;

    /* remove the ending \n */
    end = strchr(str, '\n');
    if (end)
      *end = 0;
    else
      noend = 1; /* line was truncated, its remainder is dropped below */
    /* if empty line, skip and reset curproc */
    if (!*str) {
      curproc = -1;
      continue;
    }
    /* skip lines with no dot */
    dot = strchr(str, ':');
    if (!dot)
      continue;
    /* skip lines not starting with a letter */
    if ((*str > 'z' || *str < 'a')
	&& (*str > 'Z' || *str < 'A'))
      continue;

    /* mark the end of the prefix.
     * the backward scan cannot go before str since str[0] is a letter.
     */
    prefix = str;
    end = dot;
    while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
    *end = 0;
    /* find beginning of value, its end is already marked */
    value = dot+1 + strspn(dot+1, " \t");

    /* defines for parsing numbers */
#   define getprocnb_begin(field, var)					\
    if (!strcmp(field,prefix)) {					\
      var = strtoul(value,&endptr,0);					\
      if (endptr==value) {						\
	hwloc_debug("no number in "field" field of %s\n", path);	\
	goto err;							\
      } else if (var==ULONG_MAX) {					\
	hwloc_debug("too big "field" number in %s\n", path);		\
	goto err;							\
      }									\
      hwloc_debug(field " %lu\n", var)
#   define getprocnb_end()						\
    }
    /* actually parse numbers */
    getprocnb_begin(PROCESSOR, Pproc);
    /* grow the array before accounting for the new entry,
     * so that all numprocs entries are initialized when reaching the error path.
     */
    if (numprocs == allocated_Lprocs) {
      struct hwloc_linux_cpuinfo_proc * tmp;
      unsigned newalloc = allocated_Lprocs ? 2*allocated_Lprocs : 8;
      tmp = realloc(Lprocs, newalloc * sizeof(*Lprocs));
      if (!tmp)
	goto err;
      Lprocs = tmp;
      allocated_Lprocs = newalloc;
    }
    curproc = numprocs++;
    Lprocs[curproc].Pproc = Pproc;
    Lprocs[curproc].Pcore = -1;
    Lprocs[curproc].Ppkg = -1;
    Lprocs[curproc].Lcore = -1;
    Lprocs[curproc].Lpkg = -1;
    Lprocs[curproc].infos = NULL;
    Lprocs[curproc].infos_count = 0;
    getprocnb_end() else
    getprocnb_begin(PACKAGEID, Ppkg);
    /* ignore a package id found outside of any processor section,
     * there is no entry to attach it to.
     */
    if (curproc >= 0)
      Lprocs[curproc].Ppkg = Ppkg;
    getprocnb_end() else
    getprocnb_begin(COREID, Pcore);
    /* same for a core id found outside of any processor section */
    if (curproc >= 0)
      Lprocs[curproc].Pcore = Pcore;
    getprocnb_end() else {
      /* we can't assume that we already got a processor index line:
       * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
       * tile has a global section with model name before the list of processor lines.
       */
      parse_cpuinfo_func(prefix, value,
			 curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
			 curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
			 curproc < 0);
    }

    if (noend) {
      /* ignore end of line */
      if (fscanf(fd,"%*[^\n]") == EOF)
	break;
      getc(fd);
    }
  }
  fclose(fd);

  *Lprocs_p = Lprocs;
  return numprocs;

 err:
  fclose(fd);
  /* release the infos attached to the entries parsed so far, not only the array */
  for (i = 0; i < numprocs; i++)
    hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
  free(Lprocs);
  *Lprocs_p = NULL;
  return -1;
}
static void
hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
struct hwloc_info_s *global_infos, unsigned global_infos_count)
{
if (Lprocs) {
unsigned i;
for(i=0; i<numprocs; i++) {
hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
}
free(Lprocs);
}
hwloc__free_infos(global_infos, global_infos_count);
}
/* Build PU, Package and Core objects from parsed /proc/cpuinfo entries.
 *
 * Lprocs is an array of numprocs per-processor entries (as produced by
 * hwloc_linux_parse_cpuinfo()). Their Lpkg and Lcore fields are filled here,
 * and the infos of the first processor of each package are moved into the
 * corresponding Package object.
 *
 * Packages (resp. Cores) are only created if every processor has a package
 * (resp. core) id and the object type is not filtered-out.
 *
 * Returns 0 on success, or -1 if the temporary arrays cannot be allocated,
 * in which case nothing is inserted in the topology.
 */
static int
look_cpuinfo(struct hwloc_topology *topology,
	     struct hwloc_linux_cpuinfo_proc * Lprocs,
	     unsigned numprocs)
{
  /* P for physical/OS index, L for logical (i.e. in the order we get them, not in the final hwloc logical order) */
  unsigned *Lcore_to_Pcore;
  unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
  unsigned *Lpkg_to_Ppkg;
  unsigned numpkgs=0;
  unsigned numcores=0;
  unsigned long Lproc;
  unsigned missingpkg;
  unsigned missingcore;
  unsigned i,j;

  /* initialize misc arrays, there can be at most numprocs entries */
  Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
  Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
  Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
  /* malloc(0) may legitimately return NULL, only fail if entries are actually needed */
  if (numprocs && (!Lcore_to_Pcore || !Lcore_to_Ppkg || !Lpkg_to_Ppkg)) {
    free(Lcore_to_Pcore);
    free(Lcore_to_Ppkg);
    free(Lpkg_to_Ppkg);
    return -1;
  }
  for (i = 0; i < numprocs; i++) {
    Lcore_to_Pcore[i] = -1;
    Lcore_to_Ppkg[i] = -1;
    Lpkg_to_Ppkg[i] = -1;
  }

  /* create PU objects */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    unsigned long Pproc = Lprocs[Lproc].Pproc;
    hwloc_obj_t obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PU, (unsigned)Pproc);
    obj->cpuset = hwloc_bitmap_alloc();
    hwloc_bitmap_only(obj->cpuset, Pproc);
    hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
			     Lproc, Pproc, obj->cpuset);
    hwloc_insert_object_by_cpuset(topology, obj);
  }

  topology->support.discovery->pu = 1;

  hwloc_debug("%s", "\n * Topology summary *\n");
  hwloc_debug("%u processors)\n", numprocs);

  /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    long Ppkg = Lprocs[Lproc].Ppkg;
    if (Ppkg != -1) {
      unsigned long Pproc = Lprocs[Lproc].Pproc;
      /* look for an already-known package with the same physical id */
      for (i=0; i<numpkgs; i++)
	if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
	  break;
      Lprocs[Lproc].Lpkg = i;
      hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, (unsigned long) Ppkg);
      if (i==numpkgs) {
	/* first processor of a new package */
	Lpkg_to_Ppkg[numpkgs] = Ppkg;
	numpkgs++;
      }
    }
  }
  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
   * provide bogus information. We should rather drop it. */
  missingpkg=0;
  for(j=0; j<numprocs; j++)
    if (Lprocs[j].Ppkg == -1) {
      missingpkg=1;
      break;
    }
  /* create package objects */
  hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
  if (!missingpkg && numpkgs>0
      && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_PACKAGE)) {
    for (i = 0; i < numpkgs; i++) {
      struct hwloc_obj *obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
      int doneinfos = 0; /* only the infos of the first processor of the package are moved */
      obj->cpuset = hwloc_bitmap_alloc();
      for(j=0; j<numprocs; j++)
	if ((unsigned) Lprocs[j].Lpkg == i) {
	  hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
	  if (!doneinfos) {
	    hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
	    doneinfos = 1;
	  }
	}
      hwloc_debug_1arg_bitmap("Package %u has cpuset %s\n", i, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    hwloc_debug("%s", "\n");
  }

  /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    long Pcore = Lprocs[Lproc].Pcore;
    if (Pcore != -1) {
      /* a core is identified by both its core id and its package id */
      for (i=0; i<numcores; i++)
	if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
	  break;
      Lprocs[Lproc].Lcore = i;
      if (i==numcores) {
	/* first processor of a new core */
	Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
	Lcore_to_Pcore[numcores] = Pcore;
	numcores++;
      }
    }
  }
  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
   * provide bogus information. We should rather drop it. */
  missingcore=0;
  for(j=0; j<numprocs; j++)
    if (Lprocs[j].Pcore == -1) {
      missingcore=1;
      break;
    }
  /* create Core objects */
  hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
  if (!missingcore && numcores>0
      && hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_CORE)) {
    for (i = 0; i < numcores; i++) {
      struct hwloc_obj *obj = hwloc_alloc_setup_object(topology, HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
      obj->cpuset = hwloc_bitmap_alloc();
      for(j=0; j<numprocs; j++)
	if ((unsigned) Lprocs[j].Lcore == i)
	  hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
      hwloc_debug_1arg_bitmap("Core %u has cpuset %s\n", i, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    hwloc_debug("%s", "\n");
  }

  free(Lcore_to_Pcore);
  free(Lcore_to_Ppkg);
  free(Lpkg_to_Ppkg);
  return 0;
}
/*************************************
****** Main Topology Discovery ******
*************************************/
/* Read /proc/elog and, if it starts with "Card <something>:", add that
 * non-empty <something> as the "MICSerialNumber" info of the root object.
 * Does nothing if the file cannot be read or does not have this format.
 */
static void
hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
{
  char buf[64];
  char *serial, *colon;

  if (hwloc_read_path_by_length("/proc/elog", buf, sizeof(buf), data->root_fd) >= 0
      && !strncmp(buf, "Card ", 5)) {
    /* the serial number lies between the "Card " header and the next colon */
    serial = buf + 5;
    colon = strchr(serial, ':');
    if (colon) {
      *colon = '\0';
      if (serial[0])
	hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", serial);
    }
  }
}
static void
hwloc_gather_system_info(struct hwloc_topology *topology,
struct hwloc_linux_backend_data_s *data)
{
FILE *file;
char line[128]; /* enough for utsname fields */
const char *env;
/* initialize to something sane, in case !is_thissystem and we can't find things in /proc/hwloc-nofile-info */
memset(&data->utsname, 0, sizeof(data->utsname));
data->fallback_nbprocessors = -1; /* unknown yet */
data->pagesize = 4096;
/* read thissystem info */
if (topology->is_thissystem) {
uname(&data->utsname);
data->fallback_nbprocessors = hwloc_fallback_nbprocessors(topology); /* errors managed in hwloc_linux_fallback_pu_level() */
data->pagesize = hwloc_getpagesize();
}
/* overwrite with optional /proc/hwloc-nofile-info */
file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
if (file) {
while (fgets(line, sizeof(line), file)) {
char *tmp = strchr(line, '\n');
if (!strncmp("OSName: ", line, 8)) {
if (tmp)
*tmp = '\0';
strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
} else if (!strncmp("OSRelease: ", line, 11)) {
if (tmp)
*tmp = '\0';
strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
} else if (!strncmp("OSVersion: ", line, 11)) {
if (tmp)
*tmp = '\0';
strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
} else if (!strncmp("HostName: ", line, 10)) {
if (tmp)
*tmp = '\0';
strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
} else if (!strncmp("Architecture: ", line, 14)) {
if (tmp)
*tmp = '\0';
strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
} else if (!strncmp("FallbackNbProcessors: ", line, 22)) {
if (tmp)
*tmp = '\0';
data->fallback_nbprocessors = atoi(line+22);
} else if (!strncmp("PageSize: ", line, 10)) {
if (tmp)
*tmp = '\0';
data->pagesize = strtoull(line+10, NULL, 10);
} else {
hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
/* ignored */
}
}
fclose(file);
}
env = getenv("HWLOC_DUMP_NOFILE_INFO");
if (env && *env) {
file = fopen(env, "w");
if (file) {
if (*data->utsname.sysname)
fprintf(file, "OSName: %s\n", data->utsname.sysname);
if (*data->utsname.release)
fprintf(file, "OSRelease: %s\n", data->utsname.release);
if (*data->utsname.version)
fprintf(file, "OSVersion: %s\n", data->utsname.version);
if (*data->utsname.nodename)
fprintf(file, "HostName: %s\n", data->utsname.nodename);
if (*data->utsname.machine)
fprintf(file, "Architecture: %s\n", data->utsname.machine);
fprintf(file, "FallbackNbProcessors: %d\n", data->fallback_nbprocessors);
fprintf(file, "PageSize: %llu\n", (unsigned long long) data->pagesize);
fclose(file);
}
}
/* detect arch for quirks, using configure #defines if possible, or uname */
#if (defined HWLOC_X86_32_ARCH) || (defined HWLOC_X86_64_ARCH) /* does not cover KNC */
if (topology->is_thissystem)
data->arch = HWLOC_LINUX_ARCH_X86;
#endif
if (data->arch == HWLOC_LINUX_ARCH_UNKNOWN && *data->utsname.machine) {
if (!strcmp(data->utsname.machine, "x86_64")
|| (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
|| !strcmp(data->utsname.machine, "k1om"))
data->arch = HWLOC_LINUX_ARCH_X86;
else if (!strncmp(data->utsname.machine, "arm", 3))
data->arch = HWLOC_LINUX_ARCH_ARM;
else if (!strncmp(data->utsname.machine, "ppc", 3)
|| !strncmp(data->utsname.machine, "power", 5))
data->arch = HWLOC_LINUX_ARCH_POWER;
else if (!strcmp(data->utsname.machine, "ia64"))
data->arch = HWLOC_LINUX_ARCH_IA64;
}
}
/* Try to load a hardwired topology for machines whose Linux topology
 * information is known to be broken.
 *
 * Returns the value of the matching hardwired loader when the platform is
 * recognized, or -1 when nothing matches, when /proc/cpuinfo cannot be read,
 * or when HWLOC_NO_HARDWIRED_TOPOLOGY is set in the environment.
 */
static int
hwloc_linux_try_hardwired_cpuinfo(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  char firstline[128];

  if (getenv("HWLOC_NO_HARDWIRED_TOPOLOGY") != NULL)
    return -1;

  /* Fujitsu K-computer, FX10, and FX100 use specific processors
   * whose Linux topology support is broken until 4.1 (acc455cffa75070d55e74fc7802b49edbc080e92)
   * and existing machines will likely never be fixed by kernel upgrade.
   */
  if (strcmp(data->utsname.machine, "s64fx") != 0)
    return -1;

  /* /proc/cpuinfo starts with one of these lines:
   * "cpu : Fujitsu SPARC64 VIIIfx"
   * "cpu : Fujitsu SPARC64 XIfx"
   * "cpu : Fujitsu SPARC64 IXfx"
   */
  if (hwloc_read_path_by_length("/proc/cpuinfo", firstline, sizeof(firstline), data->root_fd) < 0
      || strncmp(firstline, "cpu\t", 4) != 0)
    return -1;

  if (strstr(firstline, "Fujitsu SPARC64 VIIIfx") != NULL)
    return hwloc_look_hardwired_fujitsu_k(topology);
  if (strstr(firstline, "Fujitsu SPARC64 IXfx") != NULL)
    return hwloc_look_hardwired_fujitsu_fx10(topology);
  /* NOTE(review): this pattern is spelled "FUJITSU" while the sample lines
   * above say "Fujitsu" -- confirm against an actual FX100 /proc/cpuinfo.
   */
  if (strstr(firstline, "FUJITSU SPARC64 XIfx") != NULL)
    return hwloc_look_hardwired_fujitsu_fx100(topology);

  return -1;
}
/* Restrict topology->allowed_cpuset and topology->allowed_nodeset according
 * to the Linux cpuset/cgroup of topology->pid, if a cpuset or cgroup mount
 * point is found under root_path.
 *
 * *cpuset_namep receives the cpuset name that was read, or NULL if there is
 * no mount point or no name could be read.
 */
static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep)
{
  char *cpuset_mntpnt, *cgroup_mntpnt;
  char *name;

  hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, root_path);
  if (!cgroup_mntpnt && !cpuset_mntpnt) {
    /* neither cgroup nor cpuset is mounted, nothing to apply */
    *cpuset_namep = NULL;
    return;
  }

  name = hwloc_read_linux_cpuset_name(root_fd, topology->pid);
  if (name) {
    hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, name, "cpus", topology->allowed_cpuset);
    hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, name, "mems", topology->allowed_nodeset);
  }
  free(cgroup_mntpnt);
  free(cpuset_mntpnt);

  *cpuset_namep = name;
}
/* Set up a flat PU level from data->fallback_nbprocessors.
 * If that count is not at least 1, it is forced to 1 and PU discovery is
 * not reported as supported.
 */
static void
hwloc_linux_fallback_pu_level(struct hwloc_backend *backend)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  struct hwloc_topology *topology = backend->topology;

  if (data->fallback_nbprocessors < 1) {
    /* unknown or invalid count, assume a single processor */
    data->fallback_nbprocessors = 1;
  } else {
    topology->support.discovery->pu = 1;
  }

  hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
}
/*
 * Discovery callback of the Linux backend.
 *
 * Steps, in order:
 *  - locate the sysfs directories exposing CPU topology and NUMA node files;
 *  - record whether the root object already has PUs and/or NUMA nodes
 *    (e.g. filled by an earlier backend);
 *  - gather platform information and parse /proc/cpuinfo;
 *  - set the KNL and AMD-with-compute-units quirk flags on x86;
 *  - gather the cpuset/cgroup restrictions;
 *  - unless PUs already exist, discover CPUs through a hardwired model,
 *    /proc/cpuinfo or sysfs, falling back to a flat PU level on failure;
 *  - gather machine memory attributes and NUMA nodes;
 *  - annotate the root object (DMI, backend name, cgroup name, uname).
 *
 * Always returns 0.
 */
static int
hwloc_look_linuxfs(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  unsigned nbnodes;
  char *cpuset_name;
  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
  struct hwloc_info_s *global_infos = NULL;
  unsigned global_infos_count = 0;
  int numprocs;
  int already_pus;
  int already_numanodes;
  const char *sysfs_cpu_path;
  const char *sysfs_node_path;
  int err;

  /* look for sysfs cpu path containing at least one of core_siblings and thread_siblings */
  if (!hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd)
      || !hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd))
    sysfs_cpu_path = "/sys/bus/cpu/devices";
  else if (!hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd)
      || !hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd))
    sysfs_cpu_path = "/sys/devices/system/cpu";
  else
    sysfs_cpu_path = NULL;
  /* never pass NULL to a %s conversion, that is undefined behavior */
  hwloc_debug("Found sysfs cpu files under %s\n", sysfs_cpu_path ? sysfs_cpu_path : "(none)");

  /* look for sysfs node path */
  if (!hwloc_access("/sys/bus/node/devices/node0/cpumap", R_OK, data->root_fd))
    sysfs_node_path = "/sys/bus/node/devices";
  else if (!hwloc_access("/sys/devices/system/node/node0/cpumap", R_OK, data->root_fd))
    sysfs_node_path = "/sys/devices/system/node";
  else
    sysfs_node_path = NULL;
  hwloc_debug("Found sysfs node files under %s\n", sysfs_node_path ? sysfs_node_path : "(none)");

  already_pus = (topology->levels[0][0]->complete_cpuset != NULL
                 && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_cpuset));
  /* if there are PUs, still look at memory information
   * since x86 misses NUMA node information (unless the processor supports topoext)
   * memory size.
   */
  already_numanodes = (topology->levels[0][0]->complete_nodeset != NULL
                       && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_nodeset));
  /* if there are already NUMA nodes, we'll just annotate them with memory information,
   * which requires the NUMA level to be connected.
   */
  if (already_numanodes)
    hwloc_topology_reconnect(topology, 0);

  /* allocate root sets in case not done yet */
  hwloc_alloc_root_sets(topology->levels[0][0]);

  /*********************************
   * Platform information for later
   */
  hwloc_gather_system_info(topology, data);

  /**********************
   * /proc/cpuinfo
   */
  numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
  if (numprocs < 0)
    /* parsing failed, behave as if cpuinfo contained no processor */
    numprocs = 0;

  /**************************
   * detect model for quirks
   */
  if (data->arch == HWLOC_LINUX_ARCH_X86 && numprocs > 0) {
    unsigned i;
    const char *cpuvendor = NULL, *cpufamilynumber = NULL, *cpumodelnumber = NULL;
    /* only the first processor entry is inspected */
    for(i=0; i<Lprocs[0].infos_count; i++) {
      if (!strcmp(Lprocs[0].infos[i].name, "CPUVendor")) {
        cpuvendor = Lprocs[0].infos[i].value;
      } else if (!strcmp(Lprocs[0].infos[i].name, "CPUFamilyNumber")) {
        cpufamilynumber = Lprocs[0].infos[i].value;
      } else if (!strcmp(Lprocs[0].infos[i].name, "CPUModelNumber")) {
        cpumodelnumber = Lprocs[0].infos[i].value;
      }
    }
    if (cpuvendor && !strcmp(cpuvendor, "GenuineIntel")
        && cpufamilynumber && !strcmp(cpufamilynumber, "6")
        && cpumodelnumber && (!strcmp(cpumodelnumber, "87")
        || !strcmp(cpumodelnumber, "133")))
      data->is_knl = 1;
    if (cpuvendor && !strcmp(cpuvendor, "AuthenticAMD")
        && cpufamilynumber
        && (!strcmp(cpufamilynumber, "21")
            || !strcmp(cpufamilynumber, "22")))
      data->is_amd_with_CU = 1;
  }

  /**********************
   * Gather the list of admin-disabled cpus and mems
   */
  hwloc_linux__get_allowed_resources(topology, data->root_path, data->root_fd, &cpuset_name);

  /**********************
   * CPU information
   */

  /* Don't rediscover CPU resources if already done */
  if (already_pus)
    goto cpudone;

  /* Gather the list of cpus now */
  err = hwloc_linux_try_hardwired_cpuinfo(backend);
  if (!err)
    goto cpudone;

  /* setup root info */
  hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
                    &global_infos, &global_infos_count);

  if (getenv("HWLOC_LINUX_USE_CPUINFO") || !sysfs_cpu_path) {
    /* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
     * or not containing anything interesting */
    if (numprocs > 0)
      err = look_cpuinfo(topology, Lprocs, numprocs);
    else
      err = -1;
    if (err < 0)
      hwloc_linux_fallback_pu_level(backend);
    look_powerpc_device_tree(topology, data);

  } else {
    /* sysfs */
    if (look_sysfscpu(topology, data, sysfs_cpu_path, Lprocs, numprocs) < 0)
      /* sysfs but we failed to read cpu topology, fallback */
      hwloc_linux_fallback_pu_level(backend);
  }

 cpudone:

  /*********************
   * Memory information
   */

  /* Get the machine memory attributes */
  hwloc_get_procfs_meminfo_info(topology, data, &topology->machine_memory);

  /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
  if (sysfs_node_path)
    look_sysfsnode(topology, data, sysfs_node_path, &nbnodes);
  else
    nbnodes = 0;

  /**********************
   * Misc
   */

  /* Gather DMI info */
  hwloc__get_dmi_id_info(data, topology->levels[0][0]);

  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
  if (cpuset_name) {
    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
    free(cpuset_name);
  }

  hwloc__linux_get_mic_sn(topology, data);

  /* data->utsname was filled with real uname or \0, we can safely pass it */
  hwloc_add_uname_info(topology, &data->utsname);

  /* release what /proc/cpuinfo parsing allocated (including infos that were not moved to the root) */
  hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
  return 0;
}
/****************************************
***** Linux PCI backend callbacks ******
****************************************/
/*
* backend callback for retrieving the location of a pci device
*/
/*
 * Backend callback: fill cpuset with the CPUs local to the PCI device
 * identified by busid, as listed in its sysfs local_cpus file.
 * Returns 0 when a non-empty cpuset was read, -1 otherwise.
 */
static int
hwloc_linux_backend_get_pci_busid_cpuset(struct hwloc_backend *backend,
                                         struct hwloc_pcidev_attr_s *busid, hwloc_bitmap_t cpuset)
{
  struct hwloc_linux_backend_data_s *priv = backend->private_data;
  char sysfs_path[256];

  snprintf(sysfs_path, sizeof(sysfs_path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
           busid->domain, busid->bus,
           busid->dev, busid->func);

  if (hwloc__read_path_as_cpumask(sysfs_path, cpuset, priv->root_fd))
    return -1;

  /* an empty mask carries no locality information, report it as a failure */
  return hwloc_bitmap_iszero(cpuset) ? -1 : 0;
}
/*******************************
******* Linux component *******
*******************************/
/*
 * Backend callback releasing everything hwloc_linux_component_instantiate()
 * stored in the private data: the root path and root file descriptor (only
 * when built with openat support), the udev context if any, and the private
 * data structure itself.
 */
static void
hwloc_linux_backend_disable(struct hwloc_backend *backend)
{
  struct hwloc_linux_backend_data_s *priv = backend->private_data;

#ifdef HAVE_OPENAT
  free(priv->root_path);
  close(priv->root_fd);
#endif

#ifdef HWLOC_HAVE_LIBUDEV
  if (priv->udev) {
    udev_unref(priv->udev);
  }
#endif

  free(priv);
}
/*
 * Instantiate the Linux discovery backend.
 *
 * Allocates the backend and its private data, installs the discover,
 * get_pci_busid_cpuset and disable callbacks, and opens the filesystem root:
 * "/" by default, or the directory given by the HWLOC_FSROOT environment
 * variable. A non-"/" root marks the backend as not describing this system
 * and requires openat support (otherwise errno is set to ENOSYS).
 * The directory where dumped hardware data is looked up comes from
 * HWLOC_DUMPED_HWDATA_DIR, or defaults to RUNSTATEDIR "/hwloc/".
 *
 * Returns the new backend, or NULL on error.
 */
static struct hwloc_backend *
hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
                                  const void *_data1 __hwloc_attribute_unused,
                                  const void *_data2 __hwloc_attribute_unused,
                                  const void *_data3 __hwloc_attribute_unused)
{
  struct hwloc_backend *backend;
  struct hwloc_linux_backend_data_s *data;
  const char * fsroot_path;
  int flags, root = -1;

  backend = hwloc_backend_alloc(component);
  if (!backend)
    goto out;

  data = malloc(sizeof(*data));
  if (!data) {
    errno = ENOMEM;
    goto out_with_backend;
  }

  backend->private_data = data;
  backend->discover = hwloc_look_linuxfs;
  backend->get_pci_busid_cpuset = hwloc_linux_backend_get_pci_busid_cpuset;
  backend->disable = hwloc_linux_backend_disable;

  /* default values */
  data->arch = HWLOC_LINUX_ARCH_UNKNOWN;
  data->is_knl = 0;
  data->is_amd_with_CU = 0;
  data->is_real_fsroot = 1;
  data->root_path = NULL;
  fsroot_path = getenv("HWLOC_FSROOT");
  if (!fsroot_path)
    fsroot_path = "/";

#ifdef HAVE_OPENAT
  root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
  if (root < 0)
    goto out_with_data;

  if (strcmp(fsroot_path, "/")) {
    backend->is_thissystem = 0;
    data->is_real_fsroot = 0;
    data->root_path = strdup(fsroot_path);
    if (!data->root_path) {
      /* a NULL root_path means "no alternate root", so do not continue
       * with an alternate root whose path could not be saved */
      close(root);
      root = -1;
      errno = ENOMEM;
      goto out_with_data;
    }
  }

  /* Since this fd stays open after hwloc returns, mark it as
     close-on-exec so that children don't inherit it.  Stevens says
     that we should GETFD before we SETFD, so we do. */
  flags = fcntl(root, F_GETFD, 0);
  if (-1 == flags ||
      -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
    close(root);
    root = -1;
    goto out_with_data;
  }
#else
  /* without openat, only the real root can be browsed */
  if (strcmp(fsroot_path, "/")) {
    errno = ENOSYS;
    goto out_with_data;
  }
#endif
  data->root_fd = root;

#ifdef HWLOC_HAVE_LIBUDEV
  data->udev = NULL;
  if (data->is_real_fsroot) {
    /* udev is only queried when browsing the real filesystem root */
    data->udev = udev_new();
  }
#endif

  data->dumped_hwdata_dirname = getenv("HWLOC_DUMPED_HWDATA_DIR");
  if (!data->dumped_hwdata_dirname)
    data->dumped_hwdata_dirname = (char *) RUNSTATEDIR "/hwloc/";

  return backend;

 out_with_data:
#ifdef HAVE_OPENAT
  free(data->root_path);
#endif
  free(data);
 out_with_backend:
  free(backend);
 out:
  return NULL;
}
/*
 * Discovery-component descriptor of the Linux backend, referenced by
 * hwloc_linux_component below.
 * This is a positional initializer: the exact meaning of each slot is given
 * by the declaration of struct hwloc_disc_component (not visible here), so
 * the order of the values must follow that declaration.
 * NOTE(review): the slots look like type, name, excluded types, instantiate
 * callback, priority, enabled-by-default flag and a link pointer -- confirm
 * against the struct declaration.
 */
static struct hwloc_disc_component hwloc_linux_disc_component = {
HWLOC_DISC_COMPONENT_TYPE_CPU,
"linux",
HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
hwloc_linux_component_instantiate,
50,
1,
NULL
};
/*
 * Generic component descriptor exporting the Linux discovery component.
 * Positional initializer whose slot meanings come from the declaration of
 * struct hwloc_component (not visible here); the last slot points to the
 * discovery-component descriptor defined above.
 * NOTE(review): the two NULL slots are presumably optional init/finalize
 * callbacks and the 0 a flags field -- confirm against the struct declaration.
 */
const struct hwloc_component hwloc_linux_component = {
HWLOC_COMPONENT_ABI,
NULL, NULL,
HWLOC_COMPONENT_TYPE_DISC,
0,
&hwloc_linux_disc_component
};
#ifdef HWLOC_HAVE_LINUXIO
/***********************************
******* Linux I/O component *******
***********************************/
/* Flags for the osdev_flags argument of hwloc_linuxfs_find_osdev_parent().
 * Without FLAG_VIRTUAL, devices whose sysfs link contains "/virtual/" are ignored.
 * Without FLAG_USB, devices whose sysfs link contains "/usb" are ignored.
 */
#define HWLOC_LINUXFS_FIND_OSDEV_FLAG_VIRTUAL (1<<0)
#define HWLOC_LINUXFS_FIND_OSDEV_FLAG_USB (1<<1)
/*
 * Find the object under which the OS device described by the sysfs entry
 * osdevpath (e.g. /sys/class/<class>/<name>) should be attached.
 *
 * The sysfs link of the device is read and, unless allowed by osdev_flags,
 * virtual and USB devices are rejected. Then, in order:
 *  - the last PCI bus id found in the link gives a PCI parent, or a normal
 *    parent found by PCI affinity;
 *  - the device numa_node file gives the normal parent of that NUMA node;
 *  - the device local_cpus file gives a parent by cpuset;
 *  - the root object is used as a fallback.
 *
 * Returns NULL when the link cannot be read or when the device is filtered
 * out, a valid parent object otherwise.
 */
static hwloc_obj_t
hwloc_linuxfs_find_osdev_parent(struct hwloc_backend *backend, int root_fd,
                                const char *osdevpath, unsigned osdev_flags)
{
  struct hwloc_topology *topology = backend->topology;
  char path[256], buf[10];
  int fd;
  int foundpci;
  unsigned pcidomain = 0, pcibus = 0, pcidev = 0, pcifunc = 0;
  unsigned _pcidomain, _pcibus, _pcidev, _pcifunc;
  hwloc_bitmap_t cpuset;
  const char *tmp;
  hwloc_obj_t parent;
  int err;

  /* keep one byte for the ending \0 that is written below */
  err = hwloc_readlink(osdevpath, path, sizeof(path)-1, root_fd);
  if (err < 0) {
    /* /sys/class/<class>/<name> is a directory instead of a symlink on old kernels (at least around 2.6.18 and 2.6.25).
     * The link to parse can be found in /sys/class/<class>/<name>/device instead, at least for "/pci..."
     */
    char olddevpath[256];
    snprintf(olddevpath, sizeof(olddevpath), "%s/device", osdevpath);
    err = hwloc_readlink(olddevpath, path, sizeof(path)-1, root_fd);
    if (err < 0)
      return NULL;
  }
  path[err] = '\0';

  if (!(osdev_flags & HWLOC_LINUXFS_FIND_OSDEV_FLAG_VIRTUAL)) {
    if (strstr(path, "/virtual/"))
      return NULL;
  }

  if (!(osdev_flags & HWLOC_LINUXFS_FIND_OSDEV_FLAG_USB)) {
    if (strstr(path, "/usb"))
      return NULL;
  }

  /* skip the "/pci<domain>:<bus>" component, the bus ids follow it */
  tmp = strstr(path, "/pci");
  if (!tmp)
    goto nopci;
  tmp = strchr(tmp+4, '/');
  if (!tmp)
    goto nopci;
  tmp++;

  /* iterate through busid to find the last one (previous ones are bridges) */
  foundpci = 0;
  while (1) {
    /* tmp points to the first character of a path component */
    if (sscanf(tmp, "%x:%x:%x.%x", &_pcidomain, &_pcibus, &_pcidev, &_pcifunc) == 4) {
      pcidomain = _pcidomain;
    } else if (sscanf(tmp, "%x:%x.%x", &_pcibus, &_pcidev, &_pcifunc) == 3) {
      /* busid without domain */
      pcidomain = 0;
    } else {
      /* not a busid, the previous component (if any) was the last PCI device */
      break;
    }
    foundpci = 1;
    pcibus = _pcibus;
    pcidev = _pcidev;
    pcifunc = _pcifunc;

    /* move to the next component, without assuming a fixed busid width
     * and without walking past the end of the string */
    tmp = strchr(tmp, '/');
    if (!tmp)
      break;
    tmp++;
  }

  if (foundpci) {
    /* attach to a PCI parent */
    parent = hwloc_pcidisc_find_by_busid(topology, pcidomain, pcibus, pcidev, pcifunc);
    if (parent)
      return parent;
    /* attach to a normal (non-I/O) parent found by PCI affinity */
    parent = hwloc_pcidisc_find_busid_parent(topology, pcidomain, pcibus, pcidev, pcifunc);
    if (parent)
      return parent;
  }

 nopci:
  /* attach directly near the right NUMA node */
  snprintf(path, sizeof(path), "%s/device/numa_node", osdevpath);
  fd = hwloc_open(path, root_fd);
  if (fd >= 0) {
    /* keep one byte to terminate the string before converting it */
    err = read(fd, buf, sizeof(buf)-1);
    close(fd);
    if (err > 0) {
      int node;
      buf[err] = '\0';
      node = atoi(buf);
      if (node >= 0) {
        parent = hwloc_get_numanode_obj_by_os_index(topology, (unsigned) node);
        if (parent) {
          /* don't attach I/O under numa node, attach to the same normal parent */
          while (hwloc__obj_type_is_memory(parent->type))
            parent = parent->parent;
          return parent;
        }
      }
    }
  }

  /* attach directly to the right cpuset */
  snprintf(path, sizeof(path), "%s/device/local_cpus", osdevpath);
  cpuset = hwloc__alloc_read_path_as_cpumask(path, root_fd);
  if (cpuset) {
    parent = hwloc_find_insert_io_parent_by_complete_cpuset(topology, cpuset);
    hwloc_bitmap_free(cpuset);
    if (parent)
      return parent;
  }

  /* FIXME: {numa_node,local_cpus} may be missing when the device link points to a subdirectory.
   * For instance, device of scsi blocks may point to foo/ata1/host0/target0:0:0/0:0:0:0/ instead of foo/
   * In such case, we should look for device/../../../../{numa_node,local_cpus} instead of device/{numa_node,local_cpus}
   * Not needed yet since scsi blocks use the PCI locality above.
   */

  /* fallback to the root object */
  return hwloc_get_root_obj(topology);
}
/*
 * Create an OS device object of the given osdev type, named after name
 * (the string is duplicated), and insert it below pcidev.
 * Returns the new object.
 */
static hwloc_obj_t
hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
{
  struct hwloc_topology *topo = backend->topology;
  struct hwloc_obj *osdev;

  osdev = hwloc_alloc_setup_object(topo, HWLOC_OBJ_OS_DEVICE, HWLOC_UNKNOWN_INDEX);
  osdev->attr->osdev.type = type;
  osdev->name = strdup(name);

  /* insert_object_by_parent() doesn't merge during insert, so the object stays valid */
  hwloc_insert_object_by_parent(topo, pcidev, osdev);
  return osdev;
}
/*
 * Annotate the block OS device obj from its sysfs directory osdevpath:
 *  - "Size" (in kB), "SectorSize" and "LinuxDeviceID" info attributes;
 *  - "Vendor", "Model", "Revision" and "SerialNumber", taken from libudev
 *    when a udev context exists, or from /run/udev/data/b<major>:<minor>
 *    under root_fd otherwise;
 *  - the object subtype ("Disk", "NVDIMM", "Tape" or "Removable Media Device").
 * Missing files are silently ignored.
 */
static void
hwloc_linuxfs_block_class_fillinfos(struct hwloc_backend *backend __hwloc_attribute_unused, int root_fd,
                                    struct hwloc_obj *obj, const char *osdevpath)
{
#ifdef HWLOC_HAVE_LIBUDEV
  struct hwloc_linux_backend_data_s *data = backend->private_data;
#endif
  FILE *file;
  char path[296]; /* osdevpath <= 256 */
  char line[128];
  char vendor[64] = "";
  char model[64] = "";
  char serial[64] = "";
  char revision[64] = "";
  char blocktype[64] = "";
  unsigned sectorsize = 0;
  unsigned major_id, minor_id;
  char *tmp;

  snprintf(path, sizeof(path), "%s/size", osdevpath);
  if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
    unsigned long long sectors = strtoull(line, NULL, 10);
    /* linux always reports size in 512-byte units, we want kB */
    snprintf(line, sizeof(line), "%llu", sectors / 2);
    hwloc_obj_add_info(obj, "Size", line);
  }

  snprintf(path, sizeof(path), "%s/queue/hw_sector_size", osdevpath);
  if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
    sectorsize = strtoul(line, NULL, 10);
  }

  /* pmem have device/devtype containing "nd_btt" (sectors)
   * or "nd_namespace_io" (byte-granularity).
   * Note that device/sector_size in btt devices includes integrity metadata
   * (512/4096 block + 0/N) while queue/hw_sector_size above is the user sectorsize
   * without metadata.
   */
  snprintf(path, sizeof(path), "%s/device/devtype", osdevpath);
  if (!hwloc_read_path_by_length(path, line, sizeof(line), root_fd)) {
    if (!strncmp(line, "nd_", 3)) {
      strcpy(blocktype, "NVDIMM"); /* Save the blocktype now since udev reports "" so far */
      /* the value read from sysfs may end with a newline,
       * remove it so that the exact comparison below can match */
      tmp = strchr(line, '\n');
      if (tmp)
        *tmp = '\0';
      if (!strcmp(line, "nd_namespace_io"))
        sectorsize = 1;
    }
  }
  if (sectorsize) {
    snprintf(line, sizeof(line), "%u", sectorsize);
    hwloc_obj_add_info(obj, "SectorSize", line);
  }

  snprintf(path, sizeof(path), "%s/dev", osdevpath);
  if (hwloc_read_path_by_length(path, line, sizeof(line), root_fd) < 0)
    goto done;
  if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
    goto done;
  tmp = strchr(line, '\n');
  if (tmp)
    *tmp = '\0';
  hwloc_obj_add_info(obj, "LinuxDeviceID", line);

#ifdef HWLOC_HAVE_LIBUDEV
  if (data->udev) {
    struct udev_device *dev;
    const char *prop;
    dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
    if (!dev)
      goto done;
    prop = udev_device_get_property_value(dev, "ID_VENDOR");
    if (prop) {
      strncpy(vendor, prop, sizeof(vendor));
      vendor[sizeof(vendor)-1] = '\0';
    }
    prop = udev_device_get_property_value(dev, "ID_MODEL");
    if (prop) {
      strncpy(model, prop, sizeof(model));
      model[sizeof(model)-1] = '\0';
    }
    prop = udev_device_get_property_value(dev, "ID_REVISION");
    if (prop) {
      strncpy(revision, prop, sizeof(revision));
      revision[sizeof(revision)-1] = '\0';
    }
    prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
    if (prop) {
      strncpy(serial, prop, sizeof(serial));
      serial[sizeof(serial)-1] = '\0';
    }
    prop = udev_device_get_property_value(dev, "ID_TYPE");
    if (prop) {
      strncpy(blocktype, prop, sizeof(blocktype));
      blocktype[sizeof(blocktype)-1] = '\0';
    }

    udev_device_unref(dev);
  } else
    /* fallback to reading files, works with any fsroot */
#endif
 {
  snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
  file = hwloc_fopen(path, "r", root_fd);
  if (!file)
    goto done;

  while (NULL != fgets(line, sizeof(line), file)) {
    tmp = strchr(line, '\n');
    if (tmp)
      *tmp = '\0';
    if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
      strncpy(vendor, line+strlen("E:ID_VENDOR="), sizeof(vendor));
      vendor[sizeof(vendor)-1] = '\0';
    } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
      strncpy(model, line+strlen("E:ID_MODEL="), sizeof(model));
      model[sizeof(model)-1] = '\0';
    } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
      strncpy(revision, line+strlen("E:ID_REVISION="), sizeof(revision));
      revision[sizeof(revision)-1] = '\0';
    } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
      strncpy(serial, line+strlen("E:ID_SERIAL_SHORT="), sizeof(serial));
      serial[sizeof(serial)-1] = '\0';
    } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
      strncpy(blocktype, line+strlen("E:ID_TYPE="), sizeof(blocktype));
      blocktype[sizeof(blocktype)-1] = '\0';
    }
  }
  fclose(file);
 }

 done:
  /* clear fake "ATA" vendor name */
  if (!strcasecmp(vendor, "ATA"))
    *vendor = '\0';
  /* overwrite vendor name from model when possible */
  if (!*vendor) {
    if (!strncasecmp(model, "wd", 2))
      strcpy(vendor, "Western Digital");
    else if (!strncasecmp(model, "st", 2))
      strcpy(vendor, "Seagate");
    else if (!strncasecmp(model, "samsung", 7))
      strcpy(vendor, "Samsung");
    else if (!strncasecmp(model, "sandisk", 7))
      strcpy(vendor, "SanDisk");
    else if (!strncasecmp(model, "toshiba", 7))
      strcpy(vendor, "Toshiba");
  }

  if (*vendor)
    hwloc_obj_add_info(obj, "Vendor", vendor);
  if (*model)
    hwloc_obj_add_info(obj, "Model", model);
  if (*revision)
    hwloc_obj_add_info(obj, "Revision", revision);
  if (*serial)
    hwloc_obj_add_info(obj, "SerialNumber", serial);

  if (!strcmp(blocktype, "disk") || !strncmp(obj->name, "nvme", 4))
    obj->subtype = strdup("Disk");
  else if (!strcmp(blocktype, "NVDIMM")) /* FIXME: set by us above, to workaround udev returning "" so far */
    obj->subtype = strdup("NVDIMM");
  else if (!strcmp(blocktype, "tape"))
    obj->subtype = strdup("Tape");
  else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
    obj->subtype = strdup("Removable Media Device");
  else {
    /* generic, usb mass storage/rbc, usb mass storage/scsi */
  }
}
/*
 * Walk /sys/class/block and create one block OS device per entry that is
 * not a partition and for which a parent can be found.
 * Always returns 0, including when the class directory does not exist.
 */
static int
hwloc_linuxfs_lookup_block_class(struct hwloc_backend *backend, unsigned osdev_flags)
{
  struct hwloc_linux_backend_data_s *priv = backend->private_data;
  const int root_fd = priv->root_fd;
  struct dirent *entry;
  DIR *classdir;

  classdir = hwloc_opendir("/sys/class/block", root_fd);
  if (!classdir)
    return 0;

  for (entry = readdir(classdir); entry != NULL; entry = readdir(classdir)) {
    const char *devname = entry->d_name;
    char devpath[256];
    struct stat st;
    hwloc_obj_t parent, osdev;
    int len;

    if (!strcmp(devname, ".") || !strcmp(devname, ".."))
      continue;

    /* entries having a "partition" file are partitions, skip them */
    len = snprintf(devpath, sizeof(devpath), "/sys/class/block/%s/partition", devname);
    if ((size_t) len < sizeof(devpath)
        && hwloc_stat(devpath, &st, root_fd) >= 0)
      continue;

    len = snprintf(devpath, sizeof(devpath), "/sys/class/block/%s", devname);
    if ((size_t) len >= sizeof(devpath))
      continue;

    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, devpath, osdev_flags);
    if (parent) {
      /* USB device are created here but removed later when USB PCI devices get filtered out
       * (unless WHOLE_IO is enabled).
       */
      osdev = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_BLOCK, devname);
      hwloc_linuxfs_block_class_fillinfos(backend, root_fd, osdev, devpath);
    }
  }

  closedir(classdir);
  return 0;
}
/*
 * Annotate the network OS device obj from its sysfs directory osdevpath:
 * "Address" is the content of the address file, and, when the device has an
 * infiniband subdirectory, "Port" is the value of dev_id plus one.
 */
static void
hwloc_linuxfs_net_class_fillinfos(int root_fd,
                                  struct hwloc_obj *obj, const char *osdevpath)
{
  char attrpath[296]; /* osdevpath <= 256 */
  char addr[128];
  char hexid[16];
  char portstr[16];
  struct stat ibstat;
  unsigned long port;
  char *end;

  snprintf(attrpath, sizeof(attrpath), "%s/address", osdevpath);
  if (hwloc_read_path_by_length(attrpath, addr, sizeof(addr), root_fd) == 0) {
    end = strchr(addr, '\n');
    if (end)
      *end = 0;
    hwloc_obj_add_info(obj, "Address", addr);
  }

  /* the port is only reported for devices with an infiniband directory */
  snprintf(attrpath, sizeof(attrpath), "%s/device/infiniband", osdevpath);
  if (hwloc_stat(attrpath, &ibstat, root_fd))
    return;

  snprintf(attrpath, sizeof(attrpath), "%s/dev_id", osdevpath);
  if (hwloc_read_path_by_length(attrpath, hexid, sizeof(hexid), root_fd))
    return;

  port = strtoul(hexid, &end, 0);
  if (end == hexid)
    /* nothing could be converted */
    return;

  snprintf(portstr, sizeof(portstr), "%lu", port+1);
  hwloc_obj_add_info(obj, "Port", portstr);
}
/*
 * Walk /sys/class/net and create one network OS device per entry for which
 * a parent can be found.
 * Always returns 0, including when the class directory does not exist.
 */
static int
hwloc_linuxfs_lookup_net_class(struct hwloc_backend *backend, unsigned osdev_flags)
{
  struct hwloc_linux_backend_data_s *priv = backend->private_data;
  const int root_fd = priv->root_fd;
  struct dirent *entry;
  DIR *classdir;

  classdir = hwloc_opendir("/sys/class/net", root_fd);
  if (!classdir)
    return 0;

  for (entry = readdir(classdir); entry != NULL; entry = readdir(classdir)) {
    const char *ifname = entry->d_name;
    char devpath[256];
    hwloc_obj_t parent, osdev;
    int len;

    if (!strcmp(ifname, ".") || !strcmp(ifname, ".."))
      continue;

    /* skip names that do not fit in the path buffer */
    len = snprintf(devpath, sizeof(devpath), "/sys/class/net/%s", ifname);
    if ((size_t) len >= sizeof(devpath))
      continue;

    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, devpath, osdev_flags);
    if (parent) {
      osdev = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_NETWORK, ifname);
      hwloc_linuxfs_net_class_fillinfos(root_fd, osdev, devpath);
    }
  }

  closedir(classdir);
  return 0;
}
/*
 * Annotate the OpenFabrics OS device obj from its sysfs directory osdevpath:
 * "NodeGUID" and "SysImageGUID", then for each port i (starting at 1, until
 * a port has no state file) "Port<i>State", "Port<i>LID", "Port<i>LMC" and
 * one "Port<i>GID<j>" per GID whose second half is not all zeroes.
 * Values are cut at the first character that is not expected in them.
 */
static void
hwloc_linuxfs_infiniband_class_fillinfos(int root_fd,
                                         struct hwloc_obj *obj, const char *osdevpath)
{
  char path[296]; /* osdevpath <= 256 */
  char guidvalue[20];
  unsigned i,j;

  snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
  if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
    size_t len;
    len = strspn(guidvalue, "0123456789abcdefx:");
    guidvalue[len] = '\0';
    hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
  }

  snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
  if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
    size_t len;
    len = strspn(guidvalue, "0123456789abcdefx:");
    guidvalue[len] = '\0';
    hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
  }

  for(i=1; ; i++) {
    char statevalue[2];
    char lidvalue[11];
    char gidvalue[40];

    snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
    if (!hwloc_read_path_by_length(path, statevalue, sizeof(statevalue), root_fd)) {
      char statename[32];
      statevalue[1] = '\0'; /* only keep the first byte/digit */
      snprintf(statename, sizeof(statename), "Port%uState", i);
      hwloc_obj_add_info(obj, statename, statevalue);
    } else {
      /* no such port */
      break;
    }

    snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
    if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
      char lidname[32];
      size_t len;
      len = strspn(lidvalue, "0123456789abcdefx");
      lidvalue[len] = '\0';
      snprintf(lidname, sizeof(lidname), "Port%uLID", i);
      hwloc_obj_add_info(obj, lidname, lidvalue);
    }

    snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
    if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
      char lidname[32];
      size_t len;
      len = strspn(lidvalue, "0123456789");
      lidvalue[len] = '\0';
      snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
      hwloc_obj_add_info(obj, lidname, lidvalue);
    }

    for(j=0; ; j++) {
      snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
      if (!hwloc_read_path_by_length(path, gidvalue, sizeof(gidvalue), root_fd)) {
        char gidname[32];
        size_t len;
        len = strspn(gidvalue, "0123456789abcdefx:");
        gidvalue[len] = '\0';
        /* only look at the second half when the string is long enough to have one,
         * so that bytes after the terminating \0 are never read */
        if (len < 20 || strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
          /* only keep initialized GIDs */
          snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
          hwloc_obj_add_info(obj, gidname, gidvalue);
        }
      } else {
        /* no such GID */
        break;
      }
    }
  }
}
/*
 * Walk /sys/class/infiniband and create one OpenFabrics OS device per entry
 * for which a parent can be found. Entries whose name starts with "scif"
 * are ignored.
 * Always returns 0, including when the class directory does not exist.
 */
static int
hwloc_linuxfs_lookup_infiniband_class(struct hwloc_backend *backend, unsigned osdev_flags)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  DIR *dir;
  struct dirent *dirent;

  dir = hwloc_opendir("/sys/class/infiniband", root_fd);
  if (!dir)
    return 0;

  while ((dirent = readdir(dir)) != NULL) {
    char path[256];
    hwloc_obj_t obj, parent;
    int err;

    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
      continue;

    /* blocklist scif* fake devices */
    if (!strncmp(dirent->d_name, "scif", 4))
      continue;

    err = snprintf(path, sizeof(path), "/sys/class/infiniband/%s", dirent->d_name);
    /* snprintf() returns the untruncated length without the ending \0,
     * hence a return value equal to the buffer size already means truncation */
    if ((size_t) err >= sizeof(path))
      continue;
    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, path, osdev_flags);
    if (!parent)
      continue;

    obj = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_OPENFABRICS, dirent->d_name);

    hwloc_linuxfs_infiniband_class_fillinfos(root_fd, obj, path);
  }

  closedir(dir);

  return 0;
}
/*
 * Mark the OS device obj as a "MIC" and annotate it from its sysfs directory
 * osdevpath: "MICFamily", "MICSKU" and "MICSerialNumber" are copied as text
 * (without the trailing newline), "MICActiveCores" and "MICMemorySize" are
 * parsed as hexadecimal and reported in decimal.
 */
static void
hwloc_linuxfs_mic_class_fillinfos(int root_fd,
                                  struct hwloc_obj *obj, const char *osdevpath)
{
  char attrpath[296]; /* osdevpath <= 256 */
  char text[64];
  char number[20];
  char *eol;
  unsigned long value;

  obj->subtype = strdup("MIC");

  snprintf(attrpath, sizeof(attrpath), "%s/family", osdevpath);
  if (hwloc_read_path_by_length(attrpath, text, sizeof(text), root_fd) == 0) {
    eol = strchr(text, '\n');
    if (eol)
      *eol = 0;
    hwloc_obj_add_info(obj, "MICFamily", text);
  }

  snprintf(attrpath, sizeof(attrpath), "%s/sku", osdevpath);
  if (hwloc_read_path_by_length(attrpath, text, sizeof(text), root_fd) == 0) {
    eol = strchr(text, '\n');
    if (eol)
      *eol = 0;
    hwloc_obj_add_info(obj, "MICSKU", text);
  }

  snprintf(attrpath, sizeof(attrpath), "%s/serialnumber", osdevpath);
  if (hwloc_read_path_by_length(attrpath, text, sizeof(text), root_fd) == 0) {
    eol = strchr(text, '\n');
    if (eol)
      *eol = 0;
    hwloc_obj_add_info(obj, "MICSerialNumber", text);
  }

  /* the two counters below are converted from hexadecimal to decimal */
  snprintf(attrpath, sizeof(attrpath), "%s/active_cores", osdevpath);
  if (hwloc_read_path_by_length(attrpath, number, sizeof(number), root_fd) == 0) {
    value = strtoul(number, NULL, 16);
    snprintf(number, sizeof(number), "%lu", value);
    hwloc_obj_add_info(obj, "MICActiveCores", number);
  }

  snprintf(attrpath, sizeof(attrpath), "%s/memsize", osdevpath);
  if (hwloc_read_path_by_length(attrpath, number, sizeof(number), root_fd) == 0) {
    value = strtoul(number, NULL, 16);
    snprintf(number, sizeof(number), "%lu", value);
    hwloc_obj_add_info(obj, "MICMemorySize", number);
  }
}
/*
 * Walk /sys/class/mic and create one co-processor OS device per "mic<N>"
 * entry for which a parent can be found.
 * Always returns 0, including when the class directory does not exist.
 */
static int
hwloc_linuxfs_lookup_mic_class(struct hwloc_backend *backend, unsigned osdev_flags)
{
  struct hwloc_linux_backend_data_s *priv = backend->private_data;
  const int root_fd = priv->root_fd;
  struct dirent *entry;
  unsigned micidx;
  DIR *classdir;

  classdir = hwloc_opendir("/sys/class/mic", root_fd);
  if (!classdir)
    return 0;

  for (entry = readdir(classdir); entry != NULL; entry = readdir(classdir)) {
    char devpath[256];
    hwloc_obj_t parent, osdev;

    if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
      continue;
    /* only entries named mic<number> are devices */
    if (sscanf(entry->d_name, "mic%u", &micidx) != 1)
      continue;

    /* the sysfs path is rebuilt from the parsed index */
    snprintf(devpath, sizeof(devpath), "/sys/class/mic/mic%u", micidx);
    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, devpath, osdev_flags);
    if (parent) {
      osdev = hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_COPROC, entry->d_name);
      hwloc_linuxfs_mic_class_fillinfos(root_fd, osdev, devpath);
    }
  }

  closedir(classdir);
  return 0;
}
/*
 * Walk /sys/class/drm and create one GPU OS device per entry that has a
 * "dev" file and for which a parent can be found.
 * Always returns 0, including when the class directory does not exist.
 */
static int
hwloc_linuxfs_lookup_drm_class(struct hwloc_backend *backend, unsigned osdev_flags)
{
  struct hwloc_linux_backend_data_s *priv = backend->private_data;
  const int root_fd = priv->root_fd;
  struct dirent *entry;
  DIR *classdir;

  classdir = hwloc_opendir("/sys/class/drm", root_fd);
  if (!classdir)
    return 0;

  for (entry = readdir(classdir); entry != NULL; entry = readdir(classdir)) {
    const char *devname = entry->d_name;
    char devpath[256];
    struct stat st;
    hwloc_obj_t parent;
    int len;

    if (!strcmp(devname, ".") || !strcmp(devname, ".."))
      continue;

    /* only keep main devices, not subdevices for outputs */
    len = snprintf(devpath, sizeof(devpath), "/sys/class/drm/%s/dev", devname);
    if ((size_t) len < sizeof(devpath)
        && hwloc_stat(devpath, &st, root_fd) < 0)
      continue;

    /* Most drivers expose a card%d device.
     * Some (free?) drivers also expose render%d.
     * Old kernels also have a controlD%d. On recent kernels, it's a symlink to card%d (deprecated?).
     * There can also exist some output-specific files such as card0-DP-1.
     *
     * All these aren't very useful compared to CUDA/OpenCL/...
     * Hence the DRM class is only enabled when KEEP_ALL.
     *
     * FIXME: We might want to filter everything out but card%d.
     * Maybe look at the driver (read the end of /sys/class/drm/<name>/device/driver symlink),
     * to decide whether card%d could be useful (likely not for NVIDIA).
     */

    len = snprintf(devpath, sizeof(devpath), "/sys/class/drm/%s", devname);
    if ((size_t) len >= sizeof(devpath))
      continue;

    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, devpath, osdev_flags);
    if (parent)
      hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_GPU, devname);
  }

  closedir(classdir);
  return 0;
}
/*
 * Walk /sys/class/dma and create one DMA OS device per entry for which a
 * parent can be found.
 * Always returns 0, including when the class directory does not exist.
 */
static int
hwloc_linuxfs_lookup_dma_class(struct hwloc_backend *backend, unsigned osdev_flags)
{
  struct hwloc_linux_backend_data_s *priv = backend->private_data;
  const int root_fd = priv->root_fd;
  struct dirent *entry;
  DIR *classdir;

  classdir = hwloc_opendir("/sys/class/dma", root_fd);
  if (!classdir)
    return 0;

  for (entry = readdir(classdir); entry != NULL; entry = readdir(classdir)) {
    const char *devname = entry->d_name;
    char devpath[256];
    hwloc_obj_t parent;
    int len;

    if (!strcmp(devname, ".") || !strcmp(devname, ".."))
      continue;

    /* skip names that do not fit in the path buffer */
    len = snprintf(devpath, sizeof(devpath), "/sys/class/dma/%s", devname);
    if ((size_t) len >= sizeof(devpath))
      continue;

    parent = hwloc_linuxfs_find_osdev_parent(backend, root_fd, devpath, osdev_flags);
    if (parent)
      hwloc_linux_add_os_device(backend, parent, HWLOC_OBJ_OSDEV_DMA, devname);
  }

  closedir(classdir);
  return 0;
}
struct hwloc_firmware_dmi_mem_device_header {
unsigned char type;
unsigned char length;
unsigned char handle[2];
unsigned char phy_mem_handle[2];
unsigned char mem_err_handle[2];
unsigned char tot_width[2];
unsigned char dat_width[2];
unsigned char size[2];
unsigned char ff;
unsigned char dev_set;
unsigned char dev_loc_str_num;
unsigned char bank_loc_str_num;
unsigned char mem_type;
unsigned char type_detail[2];
unsigned char speed[2];
unsigned char manuf_str_num;
unsigned char serial_str_num;
unsigned char asset_tag_str_num;
unsigned char part_num_str_num;
/* don't include the following fields since we don't need them,
* some old implementations may miss them.
*/
};
/*
 * Tell whether a DMI string carries actual information.
 * Returns 0 for an empty string or a string made only of spaces
 * (at least Dell uses the latter for empty memory slots), 1 otherwise.
 */
static int check_dmi_entry(const char *buffer)
{
  const char *cur = buffer;

  /* skip leading spaces, what remains is empty only for useless strings */
  while (*cur == ' ')
    cur++;

  return *cur != '\0';
}
/*
 * Parse the strings of one raw DMI memory-device entry and, when at least
 * one actual device information (vendor, serial number, asset tag or part
 * number) is found, insert a Misc object of subtype "MemoryModule" carrying
 * these infos below the root object.
 *
 * idx is the index given to the new object, path is only used in the error
 * message, fd is the opened raw entry and header its already-read header.
 * The strings are stored one after the other after header->length bytes,
 * each ended by \0, and are numbered from 1.
 *
 * Returns 1 when an object was inserted, 0 otherwise.
 */
static int
hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
                                        unsigned idx, const char *path, FILE *fd,
                                        struct hwloc_firmware_dmi_mem_device_header *header)
{
  unsigned slen;
  char buffer[256]; /* enough for memory device strings, or at least for each of them */
  unsigned foff; /* offset in raw file */
  unsigned boff; /* offset in buffer read from raw file */
  unsigned i;
  struct hwloc_info_s *infos = NULL;
  unsigned infos_count = 0;
  hwloc_obj_t misc;
  int foundinfo = 0;

  /* start after the header */
  foff = header->length;
  i = 1;
  while (1) {
    /* read one buffer */
    if (fseek(fd, foff, SEEK_SET) < 0)
      break;
    if (!fgets(buffer, sizeof(buffer), fd))
      break;
    /* read string at the beginning of the buffer */
    boff = 0;
    while (1) {
      /* stop on empty string */
      if (!buffer[boff])
        goto done;
      /* stop if this string goes to the end of the buffer */
      slen = strlen(buffer+boff);
      if (boff + slen+1 == sizeof(buffer))
        break;
      /* string didn't get truncated, should be OK */
      if (i == header->manuf_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
          foundinfo = 1;
        }
      } else if (i == header->serial_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
          foundinfo = 1;
        }
      } else if (i == header->asset_tag_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
          foundinfo = 1;
        }
      } else if (i == header->part_num_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
          foundinfo = 1;
        }
      } else if (i == header->dev_loc_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
          /* only a location, not an actual info about the device */
        }
      } else if (i == header->bank_loc_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
          /* only a location, not an actual info about the device */
        }
      } else {
        /* a string that no header field refers to, stop here */
        goto done;
      }
      /* next string in buffer */
      boff += slen+1;
      i++;
    }
    /* couldn't read a single full string from that buffer, we're screwed */
    if (!boff) {
      fprintf(stderr, "hwloc could not read a DMI firmware entry #%u in %s\n",
              i, path);
      break;
    }
    /* reread buffer after previous string */
    foff += boff;
  }

done:
  if (!foundinfo) {
    /* found no actual info about the device. if there's only location info, the slot may be empty */
    goto out_with_infos;
  }

  misc = hwloc_alloc_setup_object(topology, HWLOC_OBJ_MISC, idx);
  if (!misc)
    goto out_with_infos;

  misc->subtype = strdup("MemoryModule");

  hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
  /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
   * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
   * with the vendor, and it's hard to be 100% sure 'B' is second socket.
   * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
   * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
   */
  hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
  return 1;

 out_with_infos:
  hwloc__free_infos(infos, infos_count);
  return 0;
}
/*
 * Scan the firmware DMI entries named "17-<n>" under /sys/firmware/dmi/entries/
 * (opened relative to data->root_fd), for n = 0, 1, 2, ...
 *
 * For every entry whose raw file starts with a complete header, the already
 * opened stream is handed to hwloc__get_firmware_dmi_memory_info_one() so it
 * can continue reading after the header.
 *
 * The scan stops at the first entry that cannot be opened, whose header cannot
 * be read entirely, or whose advertised length is smaller than the header
 * structure (invalid, or too old entry/spec that doesn't contain what we need).
 *
 * Always returns 0.
 */
static int
hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
                                    struct hwloc_linux_backend_data_s *data)
{
  char path[128];
  unsigned entry;

  for (entry = 0; ; entry++) {
    struct hwloc_firmware_dmi_mem_device_header header;
    FILE *file;
    int usable;

    snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", entry);
    file = hwloc_fopen(path, "r", data->root_fd);
    if (!file)
      break;

    /* the header must be read in full and must announce at least its own size */
    usable = (fread(&header, sizeof(header), 1, file) == 1)
             && (header.length >= sizeof(header));
    if (usable)
      hwloc__get_firmware_dmi_memory_info_one(topology, entry, path, file, &header);

    /* single close point for both the success and the failure cases */
    fclose(file);
    if (!usable)
      break;
  }

  return 0;
}
#ifdef HWLOC_HAVE_LINUXPCI
#define HWLOC_PCI_REVISION_ID 0x08
#define HWLOC_PCI_CAP_ID_EXP 0x10
#define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
/*
 * Discover PCI devices and bridges by listing /sys/bus/pci/devices/
 * (resolved relative to the backend's root_fd).
 *
 * For each entry named like a PCI bus ID, the function reads the config space
 * and several sysfs attributes, applies the PCI device / bridge type filters,
 * allocates an object, fills its pcidev attributes and inserts it in a
 * temporary tree ordered by bus ID. Once the directory has been fully read,
 * the tree is handed to hwloc_pcidisc_tree_attach().
 *
 * Returns 0 in all cases, including when the directory cannot be opened.
 */
static int
hwloc_linuxfs_pci_look_pcidevices(struct hwloc_backend *backend)
{
struct hwloc_linux_backend_data_s *data = backend->private_data;
struct hwloc_topology *topology = backend->topology;
hwloc_obj_t tree = NULL;
int root_fd = data->root_fd;
DIR *dir;
struct dirent *dirent;
/* We could lookup /sys/devices/pci.../.../busid1/.../busid2 recursively
 * to build the hierarchy of bridges/devices directly.
 * But that would require readdirs in all bridge sysfs subdirectories.
 * Do a single readdir in the linear list in /sys/bus/pci/devices/...
 * and build the hierarchy manually instead.
 */
dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
if (!dir)
return 0;
while ((dirent = readdir(dir)) != NULL) {
#define CONFIG_SPACE_CACHESIZE 256
unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
unsigned domain, bus, dev, func;
unsigned short class_id;
hwloc_obj_type_t type;
hwloc_obj_t obj;
struct hwloc_pcidev_attr_s *attr;
unsigned offset;
char path[64];
char value[16];
size_t ret;
int fd, err;
/* only consider entries whose name parses as domain:bus:dev.func, skip everything else */
if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
continue;
/* initialize the config space in case we fail to read it (missing permissions, etc). */
memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
/* every sysfs path below is only used when snprintf() did not truncate it */
if ((size_t) err < sizeof(path)) {
/* don't use hwloc_read_path_by_length() because we don't want the ending \0 */
fd = hwloc_open(path, root_fd);
if (fd >= 0) {
ret = read(fd, config_space_cache, CONFIG_SPACE_CACHESIZE);
(void) ret; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
close(fd);
}
}
/* the class stays "not defined" unless the sysfs attribute can be read */
class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
if ((size_t) err < sizeof(path)
&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
/* the sysfs value is parsed as hexadecimal and its low 8 bits are dropped
 * NOTE(review): presumably the programming-interface byte — confirm.
 */
class_id = strtoul(value, NULL, 16) >> 8;
/* decide between PCI device and bridge from the class and the config space */
type = hwloc_pcidisc_check_bridge_type(class_id, config_space_cache);
/* filtered? */
if (type == HWLOC_OBJ_PCI_DEVICE) {
enum hwloc_type_filter_e filter;
hwloc_topology_get_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, &filter);
if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
continue;
if (filter == HWLOC_TYPE_FILTER_KEEP_IMPORTANT
&& !hwloc_filter_check_pcidev_subtype_important(class_id))
continue;
} else if (type == HWLOC_OBJ_BRIDGE) {
enum hwloc_type_filter_e filter;
hwloc_topology_get_type_filter(topology, HWLOC_OBJ_BRIDGE, &filter);
if (filter == HWLOC_TYPE_FILTER_KEEP_NONE)
continue;
/* HWLOC_TYPE_FILTER_KEEP_IMPORTANT filtered later in the core */
}
obj = hwloc_alloc_setup_object(topology, type, HWLOC_UNKNOWN_INDEX);
/* allocation failure: stop scanning, objects collected so far are still attached below */
if (!obj)
break;
attr = &obj->attr->pcidev;
attr->domain = domain;
attr->bus = bus;
attr->dev = dev;
attr->func = func;
/* default (unknown) values */
attr->vendor_id = 0;
attr->device_id = 0;
attr->class_id = class_id;
attr->revision = 0;
attr->subvendor_id = 0;
attr->subdevice_id = 0;
attr->linkspeed = 0;
/* each ID below keeps its default of 0 when its sysfs attribute cannot be read */
err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
if ((size_t) err < sizeof(path)
&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
attr->vendor_id = strtoul(value, NULL, 16);
err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
if ((size_t) err < sizeof(path)
&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
attr->device_id = strtoul(value, NULL, 16);
err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
if ((size_t) err < sizeof(path)
&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
attr->subvendor_id = strtoul(value, NULL, 16);
err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
if ((size_t) err < sizeof(path)
&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
attr->subdevice_id = strtoul(value, NULL, 16);
/* bridge specific attributes */
if (type == HWLOC_OBJ_BRIDGE) {
/* NOTE(review): on failure the object is neither inserted nor freed here;
 * presumably hwloc_pcidisc_setup_bridge_attr() releases it itself — confirm.
 */
if (hwloc_pcidisc_setup_bridge_attr(obj, config_space_cache) < 0)
continue;
}
/* get the revision */
attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
/* try to get the link speed */
offset = hwloc_pcidisc_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE) {
hwloc_pcidisc_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
} else {
/* if not available from config-space (extended part is root-only), look in sysfs files added in 4.13 */
float speed = 0.f;
unsigned width = 0;
err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/current_link_speed", dirent->d_name);
if ((size_t) err < sizeof(path)
&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
speed = hwloc_linux_pci_link_speed_from_string(value);
err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/current_link_width", dirent->d_name);
if ((size_t) err < sizeof(path)
&& !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
width = atoi(value);
/* stays 0 when either sysfs file is missing
 * NOTE(review): the division by 8 presumably converts bits to bytes — confirm the units
 * returned by hwloc_linux_pci_link_speed_from_string().
 */
attr->linkspeed = speed*width/8;
}
/* queue the object in the temporary tree, it is attached to the topology after the loop */
hwloc_pcidisc_tree_insert_by_busid(&tree, obj);
}
closedir(dir);
hwloc_pcidisc_tree_attach(backend->topology, tree);
return 0;
}
static hwloc_obj_t
hwloc_linuxfs_pci_find_pcislot_obj(struct hwloc_obj *tree,
unsigned domain, unsigned bus, unsigned dev)
{
for ( ; tree; tree = tree->next_sibling) {
if (tree->type == HWLOC_OBJ_PCI_DEVICE
|| (tree->type == HWLOC_OBJ_BRIDGE
&& tree->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI)) {
if (tree->attr->pcidev.domain == domain
&& tree->attr->pcidev.bus == bus
&& tree->attr->pcidev.dev == dev
&& tree->attr->pcidev.func == 0)
/* that's the right bus id */
return tree;
if (tree->attr->pcidev.domain > domain
|| (tree->attr->pcidev.domain == domain
&& tree->attr->pcidev.bus > bus))
/* bus id too high, won't find anything later */
return NULL;
if (tree->type == HWLOC_OBJ_BRIDGE
&& tree->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
&& tree->attr->bridge.downstream.pci.domain == domain
&& tree->attr->bridge.downstream.pci.secondary_bus <= bus
&& tree->attr->bridge.downstream.pci.subordinate_bus >= bus)
/* not the right bus id, but it's included in the bus below that bridge */
return hwloc_linuxfs_pci_find_pcislot_obj(tree->io_first_child, domain, bus, dev);
} else if (tree->type == HWLOC_OBJ_BRIDGE
&& tree->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_PCI
&& tree->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI
/* non-PCI to PCI bridge, just look at the subordinate bus */
&& tree->attr->bridge.downstream.pci.domain == domain
&& tree->attr->bridge.downstream.pci.secondary_bus <= bus
&& tree->attr->bridge.downstream.pci.subordinate_bus >= bus) {
/* contains our bus, recurse */
return hwloc_linuxfs_pci_find_pcislot_obj(tree->io_first_child, domain, bus, dev);
}
}
return NULL;
}
/*
 * Annotate PCI objects with the name of the physical slot they sit in.
 *
 * Every entry of /sys/bus/pci/slots/ (resolved relative to the backend's
 * root_fd) provides an "address" file. When that address has the
 * domain:bus:dev form, the matching object and its following siblings sharing
 * the same domain/bus/dev get a "PCISlot" info whose value is the slot
 * directory name.
 *
 * Always returns 0, even when the slots directory does not exist.
 */
static int
hwloc_linuxfs_pci_look_pcislots(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  struct dirent *entry;
  DIR *slots;

  slots = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
  if (!slots)
    return 0;

  while ((entry = readdir(slots)) != NULL) {
    char path[64];
    char address[64];
    unsigned domain, bus, dev;
    hwloc_obj_t obj;
    int len;

    if (entry->d_name[0] == '.')
      continue;

    len = snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", entry->d_name);
    if ((size_t) len >= sizeof(path))
      continue;
    if (hwloc_read_path_by_length(path, address, sizeof(address), root_fd))
      continue;
    /* may also be %x:%x without a device number but that's only for hotplug
     * when nothing is plugged, ignore those
     */
    if (sscanf(address, "%x:%x:%x", &domain, &bus, &dev) != 3)
      continue;

    /* Apply the slot to that device and its siblings with same domain/bus/dev ID.
     * Make sure that siblings are still PCI and on the same bus
     * (optional bridge filtering can put different things together).
     */
    for (obj = hwloc_linuxfs_pci_find_pcislot_obj(hwloc_get_root_obj(topology)->io_first_child,
                                                  domain, bus, dev);
         obj != NULL;
         obj = obj->next_sibling) {
      int has_pci_busid = (obj->type == HWLOC_OBJ_PCI_DEVICE)
        || (obj->type == HWLOC_OBJ_BRIDGE
            && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);

      if (!has_pci_busid)
        break;
      if (!(obj->attr->pcidev.domain == domain
            && obj->attr->pcidev.bus == bus
            && obj->attr->pcidev.dev == dev))
        break;

      hwloc_obj_add_info(obj, "PCISlot", entry->d_name);
    }
  }

  closedir(slots);
  return 0;
}
#endif /* HWLOC_HAVE_LINUXPCI */
/*
 * Discovery callback of the "linuxio" backend.
 *
 * Depending on the type filters configured for PCI devices, bridges, OS
 * devices and Misc objects, it runs the sysfs PCI discovery (only when
 * HWLOC_HAVE_LINUXPCI is defined), the OS device class lookups and the
 * firmware DMI memory lookup.
 *
 * The filesystem root is borrowed from the main linux backend: its
 * private_data is looked up in topology->backends and stored in
 * backend->private_data.
 *
 * Returns 0 on success (including when all four filters are KEEP_NONE and
 * nothing is done), or -1 when the linux backend private data cannot be found.
 */
static int
hwloc_look_linuxfs_io(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  struct hwloc_linux_backend_data_s *data = NULL;
  struct hwloc_backend *other;
  enum hwloc_type_filter_e pfilter, bfilter, ofilter, mfilter;
  int root_fd = -1;
#ifdef HWLOC_HAVE_LINUXPCI
  struct hwloc_obj *child;
  int pci_already_found;
#endif

  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_PCI_DEVICE, &pfilter);
  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_BRIDGE, &bfilter);
  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_OS_DEVICE, &ofilter);
  hwloc_topology_get_type_filter(topology, HWLOC_OBJ_MISC, &mfilter);

  /* nothing to discover when every relevant object type is filtered out */
  if (bfilter == HWLOC_TYPE_FILTER_KEEP_NONE
      && pfilter == HWLOC_TYPE_FILTER_KEEP_NONE
      && ofilter == HWLOC_TYPE_FILTER_KEEP_NONE
      && mfilter == HWLOC_TYPE_FILTER_KEEP_NONE)
    return 0;

  /* hackily find the linux backend to steal its private_data (for fsroot) */
  for (other = topology->backends; other != NULL; other = other->next) {
    if (other->component == &hwloc_linux_disc_component) {
      data = other->private_data;
      break;
    }
  }
  if (!data) {
    hwloc_debug("linuxio failed to find linux backend private_data, aborting its discovery()\n");
    return -1;
  }

  backend->private_data = data;
  root_fd = data->root_fd;
  hwloc_debug("linuxio backend stole linux backend root_fd %d\n", root_fd);

  if (!(bfilter == HWLOC_TYPE_FILTER_KEEP_NONE
        && pfilter == HWLOC_TYPE_FILTER_KEEP_NONE)) {
#ifdef HWLOC_HAVE_LINUXPCI
    /* don't rediscover PCI devices if another backend did it
     * (they are attached to root until later in the core discovery)
     */
    pci_already_found = 0;
    for_each_io_child(child, hwloc_get_root_obj(topology)) {
      if (child->type == HWLOC_OBJ_PCI_DEVICE
          || (child->type == HWLOC_OBJ_BRIDGE
              && child->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI)) {
        hwloc_debug("%s", "PCI objects already added, ignoring linuxio PCI discovery.\n");
        pci_already_found = 1;
        break;
      }
    }

    if (!pci_already_found)
      hwloc_linuxfs_pci_look_pcidevices(backend);

    /* slots are looked up no matter which backend discovered the PCI objects */
    hwloc_linuxfs_pci_look_pcislots(backend);
#endif /* HWLOC_HAVE_LINUXPCI */
  }

  if (ofilter != HWLOC_TYPE_FILTER_KEEP_NONE) {
    unsigned osdev_flags = 0;

    if (getenv("HWLOC_VIRTUAL_LINUX_OSDEV"))
      osdev_flags |= HWLOC_LINUXFS_FIND_OSDEV_FLAG_VIRTUAL;
    if (ofilter == HWLOC_TYPE_FILTER_KEEP_ALL)
      osdev_flags |= HWLOC_LINUXFS_FIND_OSDEV_FLAG_USB;

    hwloc_linuxfs_lookup_block_class(backend, osdev_flags);
    hwloc_linuxfs_lookup_net_class(backend, osdev_flags);
    hwloc_linuxfs_lookup_infiniband_class(backend, osdev_flags);
    hwloc_linuxfs_lookup_mic_class(backend, osdev_flags);

    /* the drm and dma classes are skipped when only important objects are kept */
    if (ofilter != HWLOC_TYPE_FILTER_KEEP_IMPORTANT) {
      hwloc_linuxfs_lookup_drm_class(backend, osdev_flags);
      hwloc_linuxfs_lookup_dma_class(backend, osdev_flags);
    }
  }

  if (mfilter != HWLOC_TYPE_FILTER_KEEP_NONE)
    hwloc__get_firmware_dmi_memory_info(topology, data);

  return 0;
}
/*
 * Instantiate callback of the "linuxio" discovery component.
 *
 * Allocates a backend for the given component and registers
 * hwloc_look_linuxfs_io() as its discover callback. The three opaque data
 * arguments are unused.
 *
 * Returns the new backend, or NULL when the allocation fails.
 */
static struct hwloc_backend *
hwloc_linuxio_component_instantiate(struct hwloc_disc_component *component,
                                    const void *_data1 __hwloc_attribute_unused,
                                    const void *_data2 __hwloc_attribute_unused,
                                    const void *_data3 __hwloc_attribute_unused)
{
  struct hwloc_backend *io_backend = hwloc_backend_alloc(component);

  if (io_backend != NULL) {
    /* backend->is_thissystem should be what the linux backend has,
     * but it's actually useless since both backends will change the main
     * topology->is_thissystem in the same way.
     *
     * backend->private_data will point to the main linux private_data after load(),
     * once the main linux component is instantiated for sure.
     * It remains valid until the main linux component gets disabled during
     * topology destroy.
     */
    io_backend->discover = hwloc_look_linuxfs_io;
  }

  return io_backend;
}
/*
 * Discovery component descriptor for "linuxio", instantiated through
 * hwloc_linuxio_component_instantiate().
 * NOTE(review): the initializer is positional; the per-field notes below are
 * inferred from the values and must be checked against the declaration of
 * struct hwloc_disc_component.
 */
static struct hwloc_disc_component hwloc_linuxio_disc_component = {
/* presumably the type of this component — confirm */
HWLOC_DISC_COMPONENT_TYPE_MISC,
/* component name */
"linuxio",
/* presumably the component type(s) this one excludes — confirm */
HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
hwloc_linuxio_component_instantiate,
19, /* after pci */
/* presumably "enabled by default" — confirm */
1,
/* presumably a list link filled in at registration — confirm */
NULL
};
/*
 * Generic component descriptor exposing the "linuxio" discovery component
 * defined above.
 * NOTE(review): the initializer is positional; the meaning of the two NULL
 * entries and of the 0 is not visible here — check struct hwloc_component.
 */
const struct hwloc_component hwloc_linuxio_component = {
HWLOC_COMPONENT_ABI,
NULL, NULL,
/* this is a discovery component, its data is the disc component below */
HWLOC_COMPONENT_TYPE_DISC,
0,
&hwloc_linuxio_disc_component
};
#endif /* HWLOC_HAVE_LINUXIO */