/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "event.h"
#include <ucm/mmap/mmap.h>
#include <ucm/malloc/malloc_hook.h>
#include <ucm/util/sys.h>
#include <ucs/arch/cpu.h>
#include <ucs/datastruct/khash.h>
#include <ucs/sys/compiler.h>
#include <ucs/sys/module.h>
#include <ucs/type/init_once.h>
#include <ucs/type/spinlock.h>
#include <sys/mman.h>
#include <pthread.h>
#include <sys/shm.h>
#include <sys/ipc.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
UCS_LIST_HEAD(ucm_event_installer_list);
static ucs_spinlock_t ucm_kh_lock;
#define ucm_ptr_hash(_ptr) kh_int64_hash_func((uintptr_t)(_ptr))
KHASH_INIT(ucm_ptr_size, const void*, size_t, 1, ucm_ptr_hash, kh_int64_hash_equal)
static pthread_rwlock_t ucm_event_lock = PTHREAD_RWLOCK_INITIALIZER;
static ucs_list_link_t ucm_event_handlers;
static int ucm_external_events = 0;
static khash_t(ucm_ptr_size) ucm_shmat_ptrs;
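
/* Return the size of the SysV shared memory segment 'shmid', or 0 if it
 * cannot be queried */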
static size_t ucm_shm_size(int shmid)
{
struct shmid_ds ds;
int ret;
ret = shmctl(shmid, IPC_STAT, &ds);
if (ret < 0) {
return 0;
}
return ds.shm_segsz;
}
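
/*
 * Fallback handler which performs the intercepted call itself. It runs at
 * priority 0, so handlers with a negative priority can service the call
 * first: the original implementation is invoked only if the result still
 * holds its "failed" sentinel.
 */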
static void ucm_event_call_orig(ucm_event_type_t event_type, ucm_event_t *event,
void *arg)
{
switch (event_type) {
case UCM_EVENT_MMAP:
if (event->mmap.result == MAP_FAILED) {
event->mmap.result = ucm_orig_mmap(event->mmap.address,
event->mmap.size,
event->mmap.prot,
event->mmap.flags,
event->mmap.fd,
event->mmap.offset);
}
break;
case UCM_EVENT_MUNMAP:
if (event->munmap.result == -1) {
event->munmap.result = ucm_orig_munmap(event->munmap.address,
event->munmap.size);
}
break;
case UCM_EVENT_MREMAP:
if (event->mremap.result == MAP_FAILED) {
event->mremap.result = ucm_orig_mremap(event->mremap.address,
event->mremap.old_size,
event->mremap.new_size,
event->mremap.flags);
}
break;
case UCM_EVENT_SHMAT:
if (event->shmat.result == MAP_FAILED) {
event->shmat.result = ucm_orig_shmat(event->shmat.shmid,
event->shmat.shmaddr,
event->shmat.shmflg);
}
break;
case UCM_EVENT_SHMDT:
if (event->shmdt.result == -1) {
event->shmdt.result = ucm_orig_shmdt(event->shmdt.shmaddr);
}
break;
case UCM_EVENT_SBRK:
if (event->sbrk.result == MAP_FAILED) {
event->sbrk.result = ucm_orig_sbrk(event->sbrk.increment);
}
break;
case UCM_EVENT_MADVISE:
if (event->madvise.result == -1) {
event->madvise.result = ucm_orig_madvise(event->madvise.addr,
event->madvise.length,
event->madvise.advice);
}
break;
default:
ucm_warn("Got unknown event %d", event_type);
break;
}
}
/*
 * Built-in handler which invokes the original implementation. The callback
 * list is initialized below so that this handler is initially its only
 * element.
 */
static ucm_event_handler_t ucm_event_orig_handler = {
.list = UCS_LIST_INITIALIZER(&ucm_event_handlers, &ucm_event_handlers),
.events = UCM_EVENT_MMAP | UCM_EVENT_MUNMAP | UCM_EVENT_MREMAP |
UCM_EVENT_SHMAT | UCM_EVENT_SHMDT | UCM_EVENT_SBRK |
UCM_EVENT_MADVISE, /* All events */
.priority = 0, /* Between negative and positive handlers */
.cb = ucm_event_call_orig
};
static ucs_list_link_t ucm_event_handlers =
UCS_LIST_INITIALIZER(&ucm_event_orig_handler.list,
&ucm_event_orig_handler.list);
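
/* Deliver an event to every handler whose mask includes it, in ascending
 * priority order. The caller must hold the event lock, see ucm_event_enter().
 */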
void ucm_event_dispatch(ucm_event_type_t event_type, ucm_event_t *event)
{
ucm_event_handler_t *handler;
ucs_list_for_each(handler, &ucm_event_handlers, list) {
if (handler->events & event_type) {
handler->cb(event_type, event, handler->arg);
}
}
}
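
/* Acquire the event lock with the given pthread_rwlock function, retrying on
 * EAGAIN (POSIX allows pthread_rwlock_rdlock() to fail with EAGAIN when the
 * maximum number of read locks is exceeded). Any other error is fatal. */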
#define ucm_event_lock(_lock_func) \
{ \
int ret; \
do { \
ret = _lock_func(&ucm_event_lock); \
} while (ret == EAGAIN); \
if (ret != 0) { \
ucm_fatal("%s() failed: %s", #_lock_func, strerror(ret)); \
} \
}
void ucm_event_enter(void)
{
    ucm_event_lock(pthread_rwlock_rdlock);
}

void ucm_event_enter_exclusive(void)
{
    ucm_event_lock(pthread_rwlock_wrlock);
}

void ucm_event_leave(void)
{
    pthread_rwlock_unlock(&ucm_event_lock);
}
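
/*
 * The ucm_* wrappers below are the entry points called by the installed
 * hooks. Each wrapper dispatches a VM_UNMAPPED notification *before* an
 * address range may be released or replaced, and a VM_MAPPED notification
 * once new memory is known to be mapped, so handlers never see a range
 * reused without first seeing its unmap.
 */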
void *ucm_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
{
ucm_event_t event;
ucm_trace("ucm_mmap(addr=%p length=%lu prot=0x%x flags=0x%x fd=%d offset=%ld)",
addr, length, prot, flags, fd, offset);
ucm_event_enter();
if ((flags & MAP_FIXED) && (addr != NULL)) {
ucm_dispatch_vm_munmap(addr, length);
}
event.mmap.result = MAP_FAILED;
event.mmap.address = addr;
event.mmap.size = length;
event.mmap.prot = prot;
event.mmap.flags = flags;
event.mmap.fd = fd;
event.mmap.offset = offset;
ucm_event_dispatch(UCM_EVENT_MMAP, &event);
if (event.mmap.result != MAP_FAILED) {
        /* Report the mapping with the originally requested length, even if a
         * handler modified event.mmap.size */
ucm_dispatch_vm_mmap(event.mmap.result, length);
}
ucm_event_leave();
return event.mmap.result;
}
int ucm_munmap(void *addr, size_t length)
{
ucm_event_t event;
ucm_event_enter();
ucm_trace("ucm_munmap(addr=%p length=%lu)", addr, length);
ucm_dispatch_vm_munmap(addr, length);
event.munmap.result = -1;
event.munmap.address = addr;
event.munmap.size = length;
ucm_event_dispatch(UCM_EVENT_MUNMAP, &event);
ucm_event_leave();
return event.munmap.result;
}
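
/* ucm_vm_mmap()/ucm_vm_munmap() let the application report mappings it
 * created or destroyed by itself, without going through the installed hooks
 * (see ucm_set_external_event()) */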
void ucm_vm_mmap(void *addr, size_t length)
{
ucm_event_enter();
ucm_trace("ucm_vm_mmap(addr=%p length=%lu)", addr, length);
ucm_dispatch_vm_mmap(addr, length);
ucm_event_leave();
}
void ucm_vm_munmap(void *addr, size_t length)
{
ucm_event_enter();
ucm_trace("ucm_vm_munmap(addr=%p length=%lu)", addr, length);
ucm_dispatch_vm_munmap(addr, length);
ucm_event_leave();
}
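
/* mremap may move the mapping, so conservatively report the entire old range
 * as unmapped up front, and the entire new range as mapped on success */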
void *ucm_mremap(void *old_address, size_t old_size, size_t new_size, int flags)
{
ucm_event_t event;
ucm_event_enter();
ucm_trace("ucm_mremap(old_address=%p old_size=%lu new_size=%ld flags=0x%x)",
old_address, old_size, new_size, flags);
ucm_dispatch_vm_munmap(old_address, old_size);
event.mremap.result = MAP_FAILED;
event.mremap.address = old_address;
event.mremap.old_size = old_size;
event.mremap.new_size = new_size;
event.mremap.flags = flags;
ucm_event_dispatch(UCM_EVENT_MREMAP, &event);
if (event.mremap.result != MAP_FAILED) {
        /* Report the mapping with the originally requested new_size, even if
         * a handler modified event.mremap.new_size */
ucm_dispatch_vm_mmap(event.mremap.result, new_size);
}
ucm_event_leave();
return event.mremap.result;
}
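
/*
 * With SHM_REMAP, an existing mapping at the attach address is silently
 * replaced, so report it as unmapped first (rounding the address down to an
 * SHMLBA boundary when SHM_RND is set). On success, remember the attach
 * address and segment size so that ucm_shmdt() can report the correct size
 * when the segment is detached.
 */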
void *ucm_shmat(int shmid, const void *shmaddr, int shmflg)
{
uintptr_t attach_addr;
ucm_event_t event;
khiter_t iter;
size_t size;
int result;
ucm_event_enter();
ucm_trace("ucm_shmat(shmid=%d shmaddr=%p shmflg=0x%x)",
shmid, shmaddr, shmflg);
size = ucm_shm_size(shmid);
if ((shmflg & SHM_REMAP) && (shmaddr != NULL)) {
attach_addr = (uintptr_t)shmaddr;
if (shmflg & SHM_RND) {
attach_addr -= attach_addr % SHMLBA;
}
ucm_dispatch_vm_munmap((void*)attach_addr, size);
}
event.shmat.result = MAP_FAILED;
event.shmat.shmid = shmid;
event.shmat.shmaddr = shmaddr;
event.shmat.shmflg = shmflg;
ucm_event_dispatch(UCM_EVENT_SHMAT, &event);
ucs_spin_lock(&ucm_kh_lock);
if (event.shmat.result != MAP_FAILED) {
        iter = kh_put(ucm_ptr_size, &ucm_shmat_ptrs, event.shmat.result,
                      &result);
if (result != -1) {
kh_value(&ucm_shmat_ptrs, iter) = size;
}
ucs_spin_unlock(&ucm_kh_lock);
ucm_dispatch_vm_mmap(event.shmat.result, size);
} else {
ucs_spin_unlock(&ucm_kh_lock);
}
ucm_event_leave();
return event.shmat.result;
}
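
/* Use the segment size recorded by ucm_shmat(); if the address is unknown
 * (e.g. the segment was attached before the hooks were installed), fall back
 * to ucm_get_shm_seg_size() to discover it */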
int ucm_shmdt(const void *shmaddr)
{
ucm_event_t event;
khiter_t iter;
size_t size;
ucm_event_enter();
ucm_debug("ucm_shmdt(shmaddr=%p)", shmaddr);
ucs_spin_lock(&ucm_kh_lock);
iter = kh_get(ucm_ptr_size, &ucm_shmat_ptrs, shmaddr);
if (iter != kh_end(&ucm_shmat_ptrs)) {
size = kh_value(&ucm_shmat_ptrs, iter);
kh_del(ucm_ptr_size, &ucm_shmat_ptrs, iter);
} else {
size = ucm_get_shm_seg_size(shmaddr);
}
ucs_spin_unlock(&ucm_kh_lock);
ucm_dispatch_vm_munmap((void*)shmaddr, size);
event.shmdt.result = -1;
event.shmdt.shmaddr = shmaddr;
ucm_event_dispatch(UCM_EVENT_SHMDT, &event);
ucm_event_leave();
return event.shmdt.result;
}
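
/* A negative increment releases the tail of the heap, so report it as
 * unmapped before dispatching. On successful growth the new region ends at
 * the updated break, so its start is the new break minus the increment. */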
void *ucm_sbrk(intptr_t increment)
{
ucm_event_t event;
ucm_event_enter();
ucm_trace("ucm_sbrk(increment=%+ld)", increment);
if (increment < 0) {
ucm_dispatch_vm_munmap(UCS_PTR_BYTE_OFFSET(ucm_orig_sbrk(0), increment),
-increment);
}
event.sbrk.result = MAP_FAILED;
event.sbrk.increment = increment;
ucm_event_dispatch(UCM_EVENT_SBRK, &event);
if ((increment > 0) && (event.sbrk.result != MAP_FAILED)) {
ucm_dispatch_vm_mmap(UCS_PTR_BYTE_OFFSET(ucm_orig_sbrk(0), -increment),
increment);
}
ucm_event_leave();
return event.sbrk.result;
}
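
/* brk() takes an absolute break address; translate it into a relative
 * increment and reuse the SBRK event path. Only available with BISTRO hooks;
 * otherwise the call fails. */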
int ucm_brk(void *addr)
{
#if UCM_BISTRO_HOOKS
void *old_addr;
intptr_t increment;
ucm_event_t event;
old_addr = ucm_brk_syscall(0);
    /* if addr is NULL the call just queries the current break, so the
     * increment is 0 */
increment = addr ? ((intptr_t)addr - (intptr_t)old_addr) : 0;
ucm_event_enter();
ucm_trace("ucm_brk(addr=%p)", addr);
if (increment < 0) {
ucm_dispatch_vm_munmap(UCS_PTR_BYTE_OFFSET(old_addr, increment),
-increment);
}
    event.sbrk.result    = MAP_FAILED;
event.sbrk.increment = increment;
ucm_event_dispatch(UCM_EVENT_SBRK, &event);
if ((increment > 0) && (event.sbrk.result != MAP_FAILED)) {
ucm_dispatch_vm_mmap(old_addr, increment);
}
ucm_event_leave();
return event.sbrk.result == MAP_FAILED ? -1 : 0;
#else
return -1;
#endif
}
int ucm_madvise(void *addr, size_t length, int advice)
{
ucm_event_t event;
ucm_event_enter();
ucm_trace("ucm_madvise(addr=%p length=%zu advice=%d)", addr, length, advice);
    /* madvise(MADV_DONTNEED) and madvise(MADV_FREE) release pages back to the
     * OS, so treat them as unmaps */
if ((advice == MADV_DONTNEED)
#if HAVE_DECL_MADV_REMOVE
|| (advice == MADV_REMOVE)
#endif
#if HAVE_DECL_POSIX_MADV_DONTNEED
|| (advice == POSIX_MADV_DONTNEED)
#endif
#if HAVE_DECL_MADV_FREE
|| (advice == MADV_FREE)
#endif
) {
ucm_dispatch_vm_munmap(addr, length);
}
event.madvise.result = -1;
event.madvise.addr = addr;
event.madvise.length = length;
event.madvise.advice = advice;
ucm_event_dispatch(UCM_EVENT_MADVISE, &event);
ucm_event_leave();
return event.madvise.result;
}
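
/* Insert the handler keeping the list sorted by ascending priority; among
 * handlers of equal priority, later registrations run later */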
void ucm_event_handler_add(ucm_event_handler_t *handler)
{
ucm_event_handler_t *elem;
ucm_event_enter_exclusive();
ucs_list_for_each(elem, &ucm_event_handlers, list) {
if (handler->priority < elem->priority) {
ucs_list_insert_before(&elem->list, &handler->list);
ucm_event_leave();
return;
}
}
ucs_list_add_tail(&ucm_event_handlers, &handler->list);
ucm_event_leave();
}
void ucm_event_handler_remove(ucm_event_handler_t *handler)
{
ucm_event_enter_exclusive();
ucs_list_del(&handler->list);
ucm_event_leave();
}
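
/* UCM_EVENT_VM_MAPPED/UNMAPPED are aggregate events, triggered by several
 * native calls (mmap, mremap, shmat, sbrk, ...). Translate them into the
 * native event masks whose hooks actually need to be installed. */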
static int ucm_events_to_native_events(int events)
{
int native_events;
native_events = events & ~(UCM_EVENT_VM_MAPPED | UCM_EVENT_VM_UNMAPPED |
UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE);
if (events & UCM_EVENT_VM_MAPPED) {
native_events |= UCM_NATIVE_EVENT_VM_MAPPED;
}
if (events & UCM_EVENT_VM_UNMAPPED) {
native_events |= UCM_NATIVE_EVENT_VM_UNMAPPED;
}
return native_events;
}
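
/* Once hooks are installed, this library must never be unloaded - patched
 * entry points would otherwise be left jumping into unmapped code. Hence the
 * one-time ucm_prevent_dl_unload() call below. */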
static ucs_status_t ucm_event_install(int events)
{
static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER;
UCS_MODULE_FRAMEWORK_DECLARE(ucm);
ucm_event_installer_t *event_installer;
int native_events, malloc_events;
ucs_status_t status;
UCS_INIT_ONCE(&init_once) {
ucm_prevent_dl_unload();
}
    /* Replace aggregate events with the native events that trigger them */
native_events = ucm_events_to_native_events(events);
/* TODO lock */
status = ucm_mmap_install(native_events);
if (status != UCS_OK) {
ucm_debug("failed to install mmap events");
goto out_unlock;
}
ucm_debug("mmap hooks are ready");
malloc_events = events & ~(UCM_EVENT_MEM_TYPE_ALLOC |
UCM_EVENT_MEM_TYPE_FREE);
status = ucm_malloc_install(malloc_events);
if (status != UCS_OK) {
ucm_debug("failed to install malloc events");
goto out_unlock;
}
ucm_debug("malloc hooks are ready");
/* Call extra event installers */
UCS_MODULE_FRAMEWORK_LOAD(ucm, UCS_MODULE_LOAD_FLAG_NODELETE);
ucs_list_for_each(event_installer, &ucm_event_installer_list, list) {
status = event_installer->install(events);
if (status != UCS_OK) {
goto out_unlock;
}
}
status = UCS_OK;
out_unlock:
return status;
}
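
/*
 * Example (a minimal sketch; the callback name is illustrative and error
 * handling is elided). A client would typically register for unmap
 * notifications as follows:
 *
 *   static void vm_unmapped_cb(ucm_event_type_t event_type, ucm_event_t *event,
 *                              void *arg)
 *   {
 *       // event->vm_unmapped.address and event->vm_unmapped.size describe
 *       // the region being released
 *   }
 *
 *   ucm_set_event_handler(UCM_EVENT_VM_UNMAPPED, 0, vm_unmapped_cb, NULL);
 */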
ucs_status_t ucm_set_event_handler(int events, int priority,
ucm_event_callback_t cb, void *arg)
{
ucm_event_installer_t *event_installer;
ucm_event_handler_t *handler;
ucs_status_t status;
int flags;
if (events & ~(UCM_EVENT_MMAP|UCM_EVENT_MUNMAP|UCM_EVENT_MREMAP|
UCM_EVENT_SHMAT|UCM_EVENT_SHMDT|
UCM_EVENT_SBRK|
UCM_EVENT_MADVISE|
UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED|
UCM_EVENT_MEM_TYPE_ALLOC|UCM_EVENT_MEM_TYPE_FREE|
UCM_EVENT_FLAG_NO_INSTALL|
UCM_EVENT_FLAG_EXISTING_ALLOC)) {
return UCS_ERR_INVALID_PARAM;
}
if (!ucm_global_opts.enable_events) {
return UCS_ERR_UNSUPPORTED;
}
/* separate event flags from real events */
flags = events & (UCM_EVENT_FLAG_NO_INSTALL |
UCM_EVENT_FLAG_EXISTING_ALLOC);
events &= ~flags;
if (!(flags & UCM_EVENT_FLAG_NO_INSTALL) && (events & ~ucm_external_events)) {
status = ucm_event_install(events & ~ucm_external_events);
if (status != UCS_OK) {
return status;
}
}
handler = malloc(sizeof(*handler));
if (handler == NULL) {
return UCS_ERR_NO_MEMORY;
}
handler->events = events;
handler->priority = priority;
handler->cb = cb;
handler->arg = arg;
ucm_event_handler_add(handler);
if (flags & UCM_EVENT_FLAG_EXISTING_ALLOC) {
ucs_list_for_each(event_installer, &ucm_event_installer_list, list) {
event_installer->get_existing_alloc(handler);
}
}
ucm_debug("added user handler (func=%p arg=%p) for events=0x%x prio=%d", cb,
arg, events, priority);
return UCS_OK;
}
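
/* Mark 'events' as reported by the application itself (via ucm_vm_mmap() and
 * ucm_vm_munmap()), so that ucm_set_event_handler() skips installing hooks
 * for them */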
void ucm_set_external_event(int events)
{
ucm_event_enter_exclusive();
ucm_external_events |= events;
ucm_event_leave();
}
void ucm_unset_external_event(int events)
{
ucm_event_enter_exclusive();
ucm_external_events &= ~events;
ucm_event_leave();
}
void ucm_unset_event_handler(int events, ucm_event_callback_t cb, void *arg)
{
ucm_event_handler_t *elem, *tmp;
UCS_LIST_HEAD(gc_list);
ucm_event_enter_exclusive();
ucs_list_for_each_safe(elem, tmp, &ucm_event_handlers, list) {
if ((cb == elem->cb) && (arg == elem->arg)) {
elem->events &= ~events;
if (elem->events == 0) {
ucs_list_del(&elem->list);
ucs_list_add_tail(&gc_list, &elem->list);
}
}
}
ucm_event_leave();
    /* Do not release the memory while holding the event lock - it may deadlock */
ucs_list_for_each_safe(elem, tmp, &gc_list, list) {
free(elem);
}
}
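
/* Check whether the hooks required for 'events' are installed and
 * functional */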
ucs_status_t ucm_test_events(int events)
{
return ucm_mmap_test_installed_events(ucm_events_to_native_events(events));
}
UCS_STATIC_INIT {
ucs_spinlock_init(&ucm_kh_lock);
kh_init_inplace(ucm_ptr_size, &ucm_shmat_ptrs);
}
UCS_STATIC_CLEANUP {
ucs_status_t status;
kh_destroy_inplace(ucm_ptr_size, &ucm_shmat_ptrs);
status = ucs_spinlock_destroy(&ucm_kh_lock);
if (status != UCS_OK) {
ucm_warn("ucs_spinlock_destroy() failed (%d)", status);
}
}