/**
* Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED.
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* See file LICENSE for terms.
*/
#include "gdr_copy_md.h"
#include <string.h>
#include <limits.h>
#include <ucs/debug/log.h>
#include <ucs/sys/sys.h>
#include <ucs/sys/math.h>
#include <ucs/debug/memtrack.h>
#include <ucs/type/class.h>
#include <ucs/profile/profile.h>
#include <ucm/api/ucm.h>
#include <uct/cuda/base/cuda_iface.h>
#define UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN 65536
static ucs_config_field_t uct_gdr_copy_md_config_table[] = {
{"", "", NULL,
ucs_offsetof(uct_gdr_copy_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)},
{"RCACHE", "try", "Enable using memory registration cache",
ucs_offsetof(uct_gdr_copy_md_config_t, enable_rcache), UCS_CONFIG_TYPE_TERNARY},
{"", "RCACHE_ADDR_ALIGN=" UCS_PP_MAKE_STRING(UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN), NULL,
ucs_offsetof(uct_gdr_copy_md_config_t, rcache),
UCS_CONFIG_TYPE_TABLE(uct_md_config_rcache_table)},
{"MEM_REG_OVERHEAD", "16us", "Memory registration overhead", /* TODO take default from device */
ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.overhead), UCS_CONFIG_TYPE_TIME},
{"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */
ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.growth), UCS_CONFIG_TYPE_TIME},
{NULL}
};
/**
 * Fill in the attributes of the GDR-copy memory domain.
 *
 * @param [in]  md       Memory domain being queried (not inspected here).
 * @param [out] md_attr  Attribute structure to populate.
 *
 * @return Always UCS_OK.
 */
static ucs_status_t uct_gdr_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr)
{
    /* Every bit of the local CPU mask is set */
    memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus));

    /* Registration cost is reported as zero */
    md_attr->reg_cost.growth      = 0;
    md_attr->reg_cost.overhead    = 0;

    md_attr->rkey_packed_size     = sizeof(uct_gdr_copy_key_t);

    /* Registration only (max_alloc is 0), limited to CUDA memory */
    md_attr->cap.max_reg          = ULONG_MAX;
    md_attr->cap.max_alloc        = 0;
    md_attr->cap.detect_mem_types = 0;
    md_attr->cap.access_mem_type  = UCS_MEMORY_TYPE_CUDA;
    md_attr->cap.reg_mem_types    = UCS_BIT(UCS_MEMORY_TYPE_CUDA);
    md_attr->cap.flags            = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY;

    return UCS_OK;
}
/**
 * Serialize a memory handle into a remote key buffer.
 *
 * The packed key carries the device virtual address reported by GDRCopy
 * (info.va), the BAR mapping pointer and the GDRCopy memory handle.
 *
 * @param [in]  md           Memory domain (unused).
 * @param [in]  memh         Memory handle, a uct_gdr_copy_mem_t.
 * @param [out] rkey_buffer  Destination, written as a uct_gdr_copy_key_t.
 *
 * @return Always UCS_OK.
 */
static ucs_status_t uct_gdr_copy_mkey_pack(uct_md_h md, uct_mem_h memh,
                                           void *rkey_buffer)
{
    uct_gdr_copy_mem_t *src = (uct_gdr_copy_mem_t *)memh;
    uct_gdr_copy_key_t *dst = (uct_gdr_copy_key_t *)rkey_buffer;

    dst->mh      = src->mh;
    dst->bar_ptr = src->bar_ptr;
    dst->vaddr   = src->info.va;

    return UCS_OK;
}
/**
 * Unpack a remote key produced by uct_gdr_copy_mkey_pack().
 *
 * A heap copy of the packed key is allocated and returned as the rkey value;
 * it is released by uct_gdr_copy_rkey_release().
 *
 * @param [in]  component    Component the key belongs to (unused).
 * @param [in]  rkey_buffer  Packed key, read as a uct_gdr_copy_key_t.
 * @param [out] rkey_p       Filled with the address of the allocated key.
 * @param [out] handle_p     Always set to NULL.
 *
 * @return UCS_OK on success, UCS_ERR_NO_MEMORY if the allocation fails.
 */
static ucs_status_t uct_gdr_copy_rkey_unpack(uct_component_t *component,
                                             const void *rkey_buffer,
                                             uct_rkey_t *rkey_p, void **handle_p)
{
    /* The packed buffer is only read: keep it const instead of casting the
     * qualifier away */
    const uct_gdr_copy_key_t *packed = rkey_buffer;
    uct_gdr_copy_key_t *key;

    key = ucs_malloc(sizeof(*key), "uct_gdr_copy_key_t");
    if (NULL == key) {
        ucs_error("failed to allocate memory for uct_gdr_copy_key_t");
        return UCS_ERR_NO_MEMORY;
    }

    key->vaddr   = packed->vaddr;
    key->bar_ptr = packed->bar_ptr;
    key->mh      = packed->mh;

    *handle_p = NULL;
    *rkey_p   = (uintptr_t)key;
    return UCS_OK;
}
/**
 * Release a remote key created by uct_gdr_copy_rkey_unpack().
 *
 * @param [in] component  Component the key belongs to (unused).
 * @param [in] rkey       Address of the heap-allocated key, freed here.
 * @param [in] handle     Expected to be NULL (unpack never sets a handle).
 *
 * @return Always UCS_OK.
 */
static ucs_status_t uct_gdr_copy_rkey_release(uct_component_t *component,
                                              uct_rkey_t rkey, void *handle)
{
    void *key_mem = (void *)rkey;

    ucs_assert(NULL == handle);
    ucs_free(key_mem);

    return UCS_OK;
}
/**
 * Pin a device buffer with GDRCopy, map it, and query the mapping info.
 *
 * A zero @a length is not an error: the handle is zero-filled and nothing is
 * pinned or mapped. On any failure, the steps already completed are undone
 * (unmap, then unpin) before returning.
 *
 * @param [in]  uct_md    Memory domain, a uct_gdr_copy_md_t.
 * @param [in]  address   Start of the buffer to register.
 * @param [in]  length    Number of bytes to register.
 * @param [in]  flags     Registration flags (currently not used).
 * @param [out] mem_hndl  Handle filled with mh, bar_ptr, reg_size and info.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if any GDRCopy call fails.
 */
UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_reg_internal,
                 (uct_md, address, length, flags, mem_hndl),
                 uct_md_h uct_md, void *address, size_t length,
                 unsigned flags, uct_gdr_copy_mem_t *mem_hndl)
{
    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
    CUdeviceptr d_ptr     = ((CUdeviceptr )(char *) address);
    int ret;

    if (!length) {
        memset(mem_hndl, 0, sizeof(*mem_hndl));
        return UCS_OK;
    }

    ret = gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mem_hndl->mh);
    if (ret) {
        /* length is a size_t, so it is printed with %zu */
        ucs_error("gdr_pin_buffer failed. length :%zu ret:%d", length, ret);
        goto err;
    }

    ret = gdr_map(md->gdrcpy_ctx, mem_hndl->mh, &mem_hndl->bar_ptr, length);
    if (ret) {
        ucs_error("gdr_map failed. length :%zu ret:%d", length, ret);
        goto unpin_buffer;
    }

    mem_hndl->reg_size = length;

    ret = gdr_get_info(md->gdrcpy_ctx, mem_hndl->mh, &mem_hndl->info);
    if (ret) {
        ucs_error("gdr_get_info failed. ret:%d", ret);
        goto unmap_buffer;
    }

    ucs_trace("registered memory:%p..%p length:%zu info.va:0x%"PRIx64" bar_ptr:%p",
              address, UCS_PTR_BYTE_OFFSET(address, length), length,
              mem_hndl->info.va, mem_hndl->bar_ptr);

    return UCS_OK;

unmap_buffer:
    /* The mapping was created with exactly 'length' bytes */
    ret = gdr_unmap(md->gdrcpy_ctx, mem_hndl->mh, mem_hndl->bar_ptr, length);
    if (ret) {
        ucs_warn("gdr_unmap failed. unpin_size:%zu ret:%d", length, ret);
    }
unpin_buffer:
    ret = gdr_unpin_buffer(md->gdrcpy_ctx, mem_hndl->mh);
    if (ret) {
        ucs_warn("gdr_unpin_buffer failed. ret:%d", ret);
    }
err:
    return UCS_ERR_IO_ERROR;
}
/**
 * Undo a registration made by uct_gdr_copy_mem_reg_internal(): unmap the
 * BAR mapping and unpin the buffer.
 *
 * @param [in] uct_md    Memory domain, a uct_gdr_copy_md_t.
 * @param [in] mem_hndl  Handle filled by uct_gdr_copy_mem_reg_internal().
 *
 * @return UCS_OK on success (including the zero-length case),
 *         UCS_ERR_IO_ERROR if unmapping or unpinning fails.
 */
UCS_PROFILE_FUNC(ucs_status_t, uct_gdr_copy_mem_dereg_internal,
                 (uct_md, mem_hndl),
                 uct_md_h uct_md, uct_gdr_copy_mem_t *mem_hndl)
{
    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
    int ret;

    /* A zero-length registration leaves the handle zero-filled with nothing
     * pinned or mapped, so there is nothing to hand back to GDRCopy */
    if (mem_hndl->reg_size == 0) {
        return UCS_OK;
    }

    ret = gdr_unmap(md->gdrcpy_ctx, mem_hndl->mh, mem_hndl->bar_ptr, mem_hndl->reg_size);
    if (ret) {
        ucs_error("gdr_unmap failed. unpin_size:%lu ret:%d", mem_hndl->reg_size, ret);
        return UCS_ERR_IO_ERROR;
    }

    ret = gdr_unpin_buffer(md->gdrcpy_ctx, mem_hndl->mh);
    if (ret) {
        ucs_error("gdr_unpin_buffer failed. ret:%d", ret);
        return UCS_ERR_IO_ERROR;
    }

    ucs_trace("deregistered memory. info.va:0x%"PRIx64" bar_ptr:%p",
              mem_hndl->info.va, mem_hndl->bar_ptr);
    return UCS_OK;
}
/**
 * Register a buffer without the registration cache.
 *
 * The requested range is widened to GPU_PAGE_SIZE boundaries and then
 * registered through uct_gdr_copy_mem_reg_internal(), always with flags 0.
 *
 * @param [in]  uct_md   Memory domain.
 * @param [in]  address  Start of the buffer.
 * @param [in]  length   Buffer length in bytes.
 * @param [in]  flags    Registration flags (not forwarded).
 * @param [out] memh_p   Filled with the newly allocated memory handle.
 *
 * @return UCS_OK, UCS_ERR_NO_MEMORY if the handle cannot be allocated, or
 *         the status returned by the internal registration.
 */
static ucs_status_t uct_gdr_copy_mem_reg(uct_md_h uct_md, void *address, size_t length,
                                         unsigned flags, uct_mem_h *memh_p)
{
    uct_gdr_copy_mem_t *handle;
    ucs_status_t status;
    void *start;
    void *end;

    handle = ucs_malloc(sizeof(uct_gdr_copy_mem_t), "gdr_copy handle");
    if (handle == NULL) {
        ucs_error("failed to allocate memory for gdr_copy_mem_t");
        return UCS_ERR_NO_MEMORY;
    }

    /* Expand [address, address + length) outwards to GPU page boundaries */
    start = ucs_align_down_pow2_ptr(address, GPU_PAGE_SIZE);
    end   = ucs_align_up_pow2_ptr(UCS_PTR_BYTE_OFFSET(address, length), GPU_PAGE_SIZE);
    ucs_assert_always(start <= end);

    status = uct_gdr_copy_mem_reg_internal(uct_md, start,
                                           UCS_PTR_BYTE_DIFF(start, end),
                                           0, handle);
    if (status == UCS_OK) {
        *memh_p = handle;
        return UCS_OK;
    }

    /* Registration failed: the handle is not handed out, release it */
    ucs_free(handle);
    return status;
}
/**
 * Deregister a handle created by uct_gdr_copy_mem_reg() and free it.
 *
 * The handle is freed even when the internal deregistration fails; the
 * failure is logged as a warning and its status is returned.
 *
 * @param [in] uct_md  Memory domain.
 * @param [in] memh    Memory handle, a uct_gdr_copy_mem_t.
 *
 * @return Status of uct_gdr_copy_mem_dereg_internal().
 */
static ucs_status_t uct_gdr_copy_mem_dereg(uct_md_h uct_md, uct_mem_h memh)
{
    uct_gdr_copy_mem_t *handle = memh;
    ucs_status_t result        = uct_gdr_copy_mem_dereg_internal(uct_md, handle);

    if (result != UCS_OK) {
        ucs_warn("failed to deregister memory handle");
    }

    ucs_free(handle);
    return result;
}
/**
 * Report the memory domain resources of this component.
 *
 * GDRCopy is probed by opening and immediately closing a context. If it
 * cannot be opened an empty resource list is returned; otherwise the query
 * is delegated to uct_cuda_base_query_md_resources().
 *
 * @param [in]  component        Component being queried.
 * @param [out] resources_p      Filled with the resource array.
 * @param [out] num_resources_p  Filled with the number of resources.
 */
static ucs_status_t
uct_gdr_copy_query_md_resources(uct_component_t *component,
                                uct_md_resource_desc_t **resources_p,
                                unsigned *num_resources_p)
{
    gdr_t probe = gdr_open();

    if (probe != NULL) {
        gdr_close(probe);
        return uct_cuda_base_query_md_resources(component, resources_p,
                                                num_resources_p);
    }

    ucs_debug("could not open gdr copy. disabling gdr copy resource");
    return uct_md_query_empty_md_resource(resources_p, num_resources_p);
}
/**
 * Close the memory domain: destroy the registration cache if one exists,
 * close the GDRCopy context and free the domain structure.
 *
 * @param [in] uct_md  Memory domain, a uct_gdr_copy_md_t.
 */
static void uct_gdr_copy_md_close(uct_md_h uct_md)
{
    uct_gdr_copy_md_t *gdr_md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
    int rc;

    if (gdr_md->rcache != NULL) {
        ucs_rcache_destroy(gdr_md->rcache);
    }

    rc = gdr_close(gdr_md->gdrcpy_ctx);
    if (rc != 0) {
        /* A failed close is only reported; teardown continues */
        ucs_warn("failed to close gdrcopy. ret:%d", rc);
    }

    ucs_free(gdr_md);
}
/*
 * Memory domain operations used when the registration cache is not in use:
 * every mem_reg/mem_dereg call registers or deregisters directly.
 */
static uct_md_ops_t md_ops = {
    .close              = uct_gdr_copy_md_close,
    .query              = uct_gdr_copy_md_query,
    .mkey_pack          = uct_gdr_copy_mkey_pack,
    .mem_reg            = uct_gdr_copy_mem_reg,
    .mem_dereg          = uct_gdr_copy_mem_dereg,
    .detect_memory_type = ucs_empty_function_return_unsupported,
};
/**
 * Recover the registration cache region that embeds a memory handle.
 *
 * @param [in] memh  Pointer to the memh member of a
 *                   uct_gdr_copy_rcache_region_t.
 *
 * @return The enclosing region.
 */
static inline uct_gdr_copy_rcache_region_t*
uct_gdr_copy_rache_region_from_memh(uct_mem_h memh)
{
    uct_gdr_copy_rcache_region_t *owner;

    owner = ucs_container_of(memh, uct_gdr_copy_rcache_region_t, memh);
    return owner;
}
/**
 * Register a buffer through the registration cache.
 *
 * The region is obtained from the cache with ucs_rcache_get(); a pointer to
 * @a flags is passed as the user argument of that call. The returned handle
 * is the memh member embedded in the region.
 *
 * @param [in]  uct_md   Memory domain, a uct_gdr_copy_md_t.
 * @param [in]  address  Start of the buffer.
 * @param [in]  length   Buffer length in bytes.
 * @param [in]  flags    Registration flags.
 * @param [out] memh_p   Filled with the handle embedded in the region.
 *
 * @return UCS_OK, or the error returned by ucs_rcache_get().
 */
static ucs_status_t
uct_gdr_copy_mem_rcache_reg(uct_md_h uct_md, void *address, size_t length,
                            unsigned flags, uct_mem_h *memh_p)
{
    uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
    uct_gdr_copy_rcache_region_t *gdr_region;
    ucs_rcache_region_t *rregion;
    ucs_status_t status;

    status = ucs_rcache_get(md->rcache, address, length, PROT_READ|PROT_WRITE,
                            &flags, &rregion);
    if (status != UCS_OK) {
        return status;
    }

    ucs_assert(rregion->refcount > 0);

    gdr_region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t);
    *memh_p    = &gdr_region->memh;
    return UCS_OK;
}
/**
 * Release a handle obtained from uct_gdr_copy_mem_rcache_reg().
 *
 * The region that embeds the handle is returned to the registration cache
 * with ucs_rcache_region_put().
 *
 * @param [in] uct_md  Memory domain, a uct_gdr_copy_md_t.
 * @param [in] memh    Handle embedded in a uct_gdr_copy_rcache_region_t.
 *
 * @return Always UCS_OK.
 */
static ucs_status_t uct_gdr_copy_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh)
{
    uct_gdr_copy_md_t *md                = ucs_derived_of(uct_md, uct_gdr_copy_md_t);
    uct_gdr_copy_rcache_region_t *region = uct_gdr_copy_rache_region_from_memh(memh);

    ucs_rcache_region_put(md->rcache, &region->super);
    return UCS_OK;
}
/*
 * Memory domain operations used when the registration cache was created:
 * identical to md_ops except that mem_reg/mem_dereg go through the cache.
 */
static uct_md_ops_t md_rcache_ops = {
    .close              = uct_gdr_copy_md_close,
    .query              = uct_gdr_copy_md_query,
    .mkey_pack          = uct_gdr_copy_mkey_pack,
    .mem_reg            = uct_gdr_copy_mem_rcache_reg,
    .mem_dereg          = uct_gdr_copy_mem_rcache_dereg,
    .detect_memory_type = ucs_empty_function_return_unsupported,
};
/**
 * Registration cache callback: register the address range of a new region.
 *
 * @param [in] context               The owning uct_gdr_copy_md_t.
 * @param [in] rcache                Registration cache (unused).
 * @param [in] arg                   Pointer to the unsigned registration
 *                                   flags given to ucs_rcache_get().
 * @param [in] rregion               Region to register; its embedded memh
 *                                   is filled in.
 * @param [in] rcache_mem_reg_flags  Cache-level flags (unused).
 *
 * @return Status of uct_gdr_copy_mem_reg_internal().
 */
static ucs_status_t
uct_gdr_copy_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache,
                               void *arg, ucs_rcache_region_t *rregion,
                               uint16_t rcache_mem_reg_flags)
{
    uct_gdr_copy_md_t *md = context;
    /* The caller passes the address of an 'unsigned', so read it as such */
    unsigned *flags       = arg;
    uct_gdr_copy_rcache_region_t *region;

    region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t);
    return uct_gdr_copy_mem_reg_internal(&md->super, (void*)region->super.super.start,
                                         region->super.super.end -
                                         region->super.super.start,
                                         *flags, &region->memh);
}
/**
 * Registration cache callback: deregister the memory behind a region.
 *
 * The status of the deregistration is deliberately discarded, since the
 * callback has no way to report it.
 *
 * @param [in] context  The owning uct_gdr_copy_md_t.
 * @param [in] rcache   Registration cache (unused).
 * @param [in] rregion  Region whose embedded memh is deregistered.
 */
static void uct_gdr_copy_rcache_mem_dereg_cb(void *context, ucs_rcache_t *rcache,
                                             ucs_rcache_region_t *rregion)
{
    uct_gdr_copy_md_t *md = context;
    uct_gdr_copy_rcache_region_t *region;

    region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t);
    (void)uct_gdr_copy_mem_dereg_internal(&md->super, &region->memh);
}
/**
 * Registration cache callback: describe a region for diagnostic output.
 *
 * @param [in]  context  The owning memory domain (unused).
 * @param [in]  rcache   Registration cache (unused).
 * @param [in]  rregion  Region to describe.
 * @param [out] buf      Receives the text "bar ptr:<pointer>".
 * @param [in]  max      Size of @a buf in bytes.
 */
static void uct_gdr_copy_rcache_dump_region_cb(void *context, ucs_rcache_t *rcache,
                                               ucs_rcache_region_t *rregion, char *buf,
                                               size_t max)
{
    uct_gdr_copy_rcache_region_t *region = ucs_derived_of(rregion,
                                                          uct_gdr_copy_rcache_region_t);
    uct_gdr_copy_mem_t *memh             = &region->memh;

    snprintf(buf, max, "bar ptr:%p", memh->bar_ptr);
}
/* Callbacks handed to the registration cache created in uct_gdr_copy_md_open() */
static ucs_rcache_ops_t uct_gdr_copy_rcache_ops = {
    .mem_reg     = uct_gdr_copy_rcache_mem_reg_cb,
    .mem_dereg   = uct_gdr_copy_rcache_mem_dereg_cb,
    .dump_region = uct_gdr_copy_rcache_dump_region_cb
};
/**
 * Open a GDR-copy memory domain.
 *
 * Allocates the domain, opens a GDRCopy context and, unless the RCACHE
 * option is UCS_NO, tries to create a registration cache. When the cache is
 * created the domain switches to md_rcache_ops and its registration cost is
 * zeroed. When creation fails, the open fails only if RCACHE is UCS_YES;
 * otherwise the domain keeps using md_ops.
 *
 * @param [in]  component  Component opening the domain.
 * @param [in]  md_name    Memory domain name (unused).
 * @param [in]  config     Configuration, a uct_gdr_copy_md_config_t.
 * @param [out] md_p       Filled with the opened memory domain.
 *
 * @return UCS_OK, UCS_ERR_NO_MEMORY if the domain cannot be allocated, or
 *         UCS_ERR_IO_ERROR if GDRCopy cannot be opened or a required
 *         registration cache cannot be created.
 */
static ucs_status_t
uct_gdr_copy_md_open(uct_component_t *component, const char *md_name,
                     const uct_md_config_t *config, uct_md_h *md_p)
{
    const uct_gdr_copy_md_config_t *md_config =
                    ucs_derived_of(config, uct_gdr_copy_md_config_t);
    ucs_rcache_params_t rcache_params;
    uct_gdr_copy_md_t *md;
    ucs_status_t status;

    md = ucs_malloc(sizeof(uct_gdr_copy_md_t), "uct_gdr_copy_md_t");
    if (md == NULL) {
        ucs_error("failed to allocate memory for uct_gdr_copy_md_t");
        return UCS_ERR_NO_MEMORY;
    }

    /* Start out without a cache; upgraded below if one can be created */
    md->super.ops       = &md_ops;
    md->super.component = &uct_gdr_copy_component;
    md->rcache          = NULL;
    md->reg_cost        = md_config->uc_reg_cost;

    md->gdrcpy_ctx = gdr_open();
    if (md->gdrcpy_ctx == NULL) {
        ucs_error("failed to open gdr copy");
        status = UCS_ERR_IO_ERROR;
        goto err_free_md;
    }

    if (md_config->enable_rcache != UCS_NO) {
        rcache_params.ops                = &uct_gdr_copy_rcache_ops;
        rcache_params.context            = md;
        rcache_params.region_struct_size = sizeof(uct_gdr_copy_rcache_region_t);
        rcache_params.alignment          = md_config->rcache.alignment;
        rcache_params.max_alignment      = UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN;
        rcache_params.ucm_events         = UCM_EVENT_MEM_TYPE_FREE;
        rcache_params.ucm_event_priority = md_config->rcache.event_prio;

        status = ucs_rcache_create(&rcache_params, "gdr_copy", NULL, &md->rcache);
        if (status != UCS_OK) {
            ucs_assert(md->rcache == NULL);
            if (md_config->enable_rcache == UCS_YES) {
                /* The cache was explicitly requested: opening must fail */
                status = UCS_ERR_IO_ERROR;
                goto err_close_gdr;
            }

            ucs_debug("could not create registration cache for: %s",
                      ucs_status_string(status));
        } else {
            md->super.ops         = &md_rcache_ops;
            md->reg_cost.overhead = 0;
            md->reg_cost.growth   = 0;
        }
    }

    *md_p = (uct_md_h) md;
    return UCS_OK;

err_close_gdr:
    gdr_close(md->gdrcpy_ctx);
err_free_md:
    ucs_free(md);
    return status;
}
/*
 * Component descriptor for GDR-copy. cm_open and rkey_ptr are wired to
 * ucs_empty_function_return_unsupported; the remaining entry points are
 * implemented in this file.
 */
uct_component_t uct_gdr_copy_component = {
    .query_md_resources = uct_gdr_copy_query_md_resources,
    .md_open            = uct_gdr_copy_md_open,
    .cm_open            = ucs_empty_function_return_unsupported,
    .rkey_unpack        = uct_gdr_copy_rkey_unpack,
    .rkey_ptr           = ucs_empty_function_return_unsupported,
    .rkey_release       = uct_gdr_copy_rkey_release,
    .name               = "gdr_copy",
    .md_config          = {
        .name           = "GDR-copy memory domain",
        .prefix         = "GDR_COPY_",
        .table          = uct_gdr_copy_md_config_table,
        .size           = sizeof(uct_gdr_copy_md_config_t),
    },
    .cm_config          = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY,
    .tl_list            = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_gdr_copy_component),
    .flags              = 0
};

/* Register the component with UCT */
UCT_COMPONENT_REGISTER(&uct_gdr_copy_component);