/*
* Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <util/mmio.h>
#include "mlx5dv_dr.h"
#include "wqe.h"
#define QUEUE_SIZE 128
#define SIGNAL_PER_DIV_QUEUE 16
#define TH_NUMS_TO_DRAIN 2
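/*
 * Note on the sizing constants above: with QUEUE_SIZE = 128 send WRs and
 * SIGNAL_PER_DIV_QUEUE = 16, the signalling threshold computed in
 * dr_send_ring_alloc() is 128 / 16 = 8, i.e. one signalled (CQ-generating)
 * WQE per 8 posted WQEs. TH_NUMS_TO_DRAIN = 2 means the CQ is fully drained
 * once the number of pending WQEs reaches two such thresholds
 * (see dr_handle_pending_wc()).
 */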
enum {
CQ_OK = 0,
CQ_EMPTY = -1,
CQ_POLL_ERR = -2
};
struct dr_qp_init_attr {
uint32_t cqn;
uint32_t pdn;
struct mlx5dv_devx_uar *uar;
struct ibv_qp_cap cap;
};
static void *dr_cq_get_cqe(struct dr_cq *dr_cq, int n)
{
return dr_cq->buf + n * dr_cq->cqe_sz;
}
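/*
 * Return the next CQE owned by SW, or NULL if none is available. For
 * 128-byte CQEs the 64-byte reporting part sits in the second half of the
 * entry, hence the "cqe + 64" below. The ownership check relies on the HW
 * toggling the owner bit on every pass over the CQ ring: a CQE belongs to
 * SW when its owner bit matches the parity of (cons_index / ncqe).
 */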
static void *dr_cq_get_sw_cqe(struct dr_cq *dr_cq, int n)
{
void *cqe = dr_cq_get_cqe(dr_cq, n & (dr_cq->ncqe - 1));
struct mlx5_cqe64 *cqe64;
cqe64 = (dr_cq->cqe_sz == 64) ? cqe : cqe + 64;
if (likely(mlx5dv_get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
!((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^
!!(n & dr_cq->ncqe)))
return cqe64;
else
return NULL;
}
static int dr_get_next_cqe(struct dr_cq *dr_cq,
struct mlx5_cqe64 **pcqe64)
{
struct mlx5_cqe64 *cqe64;
cqe64 = dr_cq_get_sw_cqe(dr_cq, dr_cq->cons_index);
if (!cqe64)
return CQ_EMPTY;
++dr_cq->cons_index;
/*
* Make sure we read CQ entry contents after we've checked the
* ownership bit.
*/
udma_from_device_barrier();
*pcqe64 = cqe64;
return CQ_OK;
}
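/*
 * Parse a single CQE: on a good requester completion (and on a requester
 * error) the SQ tail is advanced past the WQE the completion refers to,
 * using the wqe_head[] bookkeeping filled in at post time. Only
 * MLX5_CQE_REQ_ERR / MLX5_CQE_RESP_ERR are treated as poll errors.
 */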
static int dr_parse_cqe(struct dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
{
uint16_t wqe_ctr;
uint8_t opcode;
int idx;
wqe_ctr = be16toh(cqe64->wqe_counter);
opcode = mlx5dv_get_cqe_opcode(cqe64);
if (opcode == MLX5_CQE_REQ_ERR) {
idx = wqe_ctr & (dr_cq->qp->sq.wqe_cnt - 1);
dr_cq->qp->sq.tail = dr_cq->qp->sq.wqe_head[idx] + 1;
} else if (opcode == MLX5_CQE_RESP_ERR) {
++dr_cq->qp->sq.tail;
} else {
idx = wqe_ctr & (dr_cq->qp->sq.wqe_cnt - 1);
dr_cq->qp->sq.tail = dr_cq->qp->sq.wqe_head[idx] + 1;
return CQ_OK;
}
return CQ_POLL_ERR;
}
static int dr_cq_poll_one(struct dr_cq *dr_cq)
{
struct mlx5_cqe64 *cqe64;
int err;
err = dr_get_next_cqe(dr_cq, &cqe64);
if (err == CQ_EMPTY)
return err;
return dr_parse_cqe(dr_cq, cqe64);
}
static int dr_poll_cq(struct dr_cq *dr_cq, int ne)
{
int npolled;
int err = 0;
for (npolled = 0; npolled < ne; ++npolled) {
err = dr_cq_poll_one(dr_cq);
if (err != CQ_OK)
break;
}
dr_cq->db[MLX5_CQ_SET_CI] = htobe32(dr_cq->cons_index &
0xffffff);
return err == CQ_POLL_ERR ? err : npolled;
}
/* Calculate the send WQE size for the specific RC QP functionality required here */
static int dr_qp_calc_rc_send_wqe(struct dr_qp_init_attr *attr)
{
int size;
int inl_size = 0;
int tot_size;
size = sizeof(struct mlx5_wqe_ctrl_seg) +
sizeof(struct mlx5_wqe_raddr_seg);
if (attr->cap.max_inline_data)
inl_size = size + align(sizeof(struct mlx5_wqe_inl_data_seg) +
attr->cap.max_inline_data, 16);
size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
tot_size = max_int(size, inl_size);
return align(tot_size, MLX5_SEND_WQE_BB);
}
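/*
 * SQ sizing: the WQE size returned by dr_qp_calc_rc_send_wqe() is already
 * aligned to MLX5_SEND_WQE_BB (64 bytes), so wqe_shift is simply
 * log2(MLX5_SEND_WQE_BB) = 6 and wqe_cnt counts 64-byte basic blocks.
 * max_post is the number of full WQEs that fit in the (power of two)
 * work queue.
 */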
static int dr_calc_sq_size(struct dr_qp *dr_qp,
struct dr_qp_init_attr *attr)
{
int wqe_size;
int wq_size;
wqe_size = dr_qp_calc_rc_send_wqe(attr);
dr_qp->max_inline_data = wqe_size -
(sizeof(struct mlx5_wqe_ctrl_seg) +
sizeof(struct mlx5_wqe_raddr_seg)) -
sizeof(struct mlx5_wqe_inl_data_seg);
wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
dr_qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
dr_qp->sq.wqe_shift = STATIC_ILOG_32(MLX5_SEND_WQE_BB) - 1;
dr_qp->sq.max_gs = attr->cap.max_send_sge;
dr_qp->sq.max_post = wq_size / wqe_size;
return wq_size;
}
static int dr_qp_calc_recv_wqe(struct dr_qp_init_attr *attr)
{
uint32_t size;
int num_scatter;
num_scatter = max_t(uint32_t, attr->cap.max_recv_sge, 1);
size = sizeof(struct mlx5_wqe_data_seg) * num_scatter;
size = roundup_pow_of_two(size);
return size;
}
static int dr_calc_rq_size(struct dr_qp *dr_qp,
struct dr_qp_init_attr *attr)
{
int wqe_size;
int wq_size;
wqe_size = dr_qp_calc_recv_wqe(attr);
wq_size = roundup_pow_of_two(attr->cap.max_recv_wr) * wqe_size;
wq_size = max(wq_size, MLX5_SEND_WQE_BB);
dr_qp->rq.wqe_cnt = wq_size / wqe_size;
dr_qp->rq.wqe_shift = ilog32(wqe_size - 1);
dr_qp->rq.max_post = 1 << ilog32(wq_size / wqe_size - 1);
dr_qp->rq.max_gs = wqe_size / sizeof(struct mlx5_wqe_data_seg);
return wq_size;
}
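/*
 * Overall WQ layout in the single buffer: the RQ starts at offset 0 and the
 * SQ follows it, so sq.offset is the RQ size computed above and the total
 * buffer size is the sum of both.
 */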
static int dr_calc_wq_size(struct dr_qp *dr_qp, struct dr_qp_init_attr *attr)
{
int result;
int ret;
result = dr_calc_sq_size(dr_qp, attr);
ret = dr_calc_rq_size(dr_qp, attr);
result += ret;
dr_qp->sq.offset = ret;
dr_qp->rq.offset = 0;
return result;
}
static int dr_qp_alloc_buf(struct dr_qp *dr_qp, int size)
{
int al_size;
int ret;
dr_qp->sq.wqe_head = malloc(dr_qp->sq.wqe_cnt *
sizeof(*dr_qp->sq.wqe_head));
if (!dr_qp->sq.wqe_head) {
errno = ENOMEM;
return errno;
}
al_size = align(size, sysconf(_SC_PAGESIZE));
ret = posix_memalign(&dr_qp->buf.buf, sysconf(_SC_PAGESIZE), al_size);
if (ret) {
errno = ret;
goto free_wqe_head;
}
dr_qp->buf.length = al_size;
dr_qp->buf.type = MLX5_ALLOC_TYPE_ANON;
memset(dr_qp->buf.buf, 0, dr_qp->buf.length);
return 0;
free_wqe_head:
free(dr_qp->sq.wqe_head);
return ret;
}
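/*
 * Create the RC QP directly through DEVX rather than ibv_create_qp(): the
 * WQ buffer and the doorbell record are allocated here in user memory,
 * registered as umems, and passed to the firmware via the QP create
 * attributes below.
 */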
static struct dr_qp *dr_create_rc_qp(struct ibv_context *ctx,
struct dr_qp_init_attr *attr)
{
struct dr_devx_qp_create_attr qp_create_attr;
struct mlx5dv_devx_obj *obj;
struct dr_qp *dr_qp;
int size;
int ret;
dr_qp = calloc(1, sizeof(*dr_qp));
if (!dr_qp) {
errno = ENOMEM;
return NULL;
}
size = dr_calc_wq_size(dr_qp, attr);
if (dr_qp_alloc_buf(dr_qp, size))
goto err_alloc_bufs;
dr_qp->sq_start = dr_qp->buf.buf + dr_qp->sq.offset;
dr_qp->sq.qend = dr_qp->buf.buf + dr_qp->sq.offset +
(dr_qp->sq.wqe_cnt << dr_qp->sq.wqe_shift);
dr_qp->rq.head = 0;
dr_qp->rq.tail = 0;
dr_qp->sq.cur_post = 0;
ret = posix_memalign((void **)&dr_qp->db, 8, 8);
if (ret) {
errno = ret;
goto err_db_alloc;
}
dr_qp->db[MLX5_RCV_DBR] = 0;
dr_qp->db[MLX5_SND_DBR] = 0;
dr_qp->db_umem = mlx5dv_devx_umem_reg(ctx, dr_qp->db, 8,
IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_READ);
if (!dr_qp->db_umem)
goto err_db_umem;
dr_qp->buf_umem = mlx5dv_devx_umem_reg(ctx, dr_qp->buf.buf,
dr_qp->buf.length,
IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_READ);
if (!dr_qp->buf_umem)
goto err_buf_umem;
qp_create_attr.page_id = attr->uar->page_id;
qp_create_attr.pdn = attr->pdn;
qp_create_attr.cqn = attr->cqn;
qp_create_attr.pm_state = MLX5_QPC_PM_STATE_MIGRATED;
qp_create_attr.service_type = MLX5_QPC_ST_RC;
qp_create_attr.buff_umem_id = dr_qp->buf_umem->umem_id;
qp_create_attr.db_umem_id = dr_qp->db_umem->umem_id;
qp_create_attr.sq_wqe_cnt = dr_qp->sq.wqe_cnt;
qp_create_attr.rq_wqe_cnt = dr_qp->rq.wqe_cnt;
qp_create_attr.rq_wqe_shift = dr_qp->rq.wqe_shift;
obj = dr_devx_create_qp(ctx, &qp_create_attr);
if (!obj)
goto err_qp_create;
dr_qp->uar = attr->uar;
dr_qp->obj = obj;
return dr_qp;
err_qp_create:
mlx5dv_devx_umem_dereg(dr_qp->buf_umem);
err_buf_umem:
mlx5dv_devx_umem_dereg(dr_qp->db_umem);
err_db_umem:
free(dr_qp->db);
err_db_alloc:
free(dr_qp->sq.wqe_head);
free(dr_qp->buf.buf);
err_alloc_bufs:
free(dr_qp);
return NULL;
}
static int dr_destroy_qp(struct dr_qp *dr_qp)
{
int ret;
ret = mlx5dv_devx_obj_destroy(dr_qp->obj);
if (ret)
return ret;
ret = mlx5dv_devx_umem_dereg(dr_qp->buf_umem);
if (ret)
return ret;
ret = mlx5dv_devx_umem_dereg(dr_qp->db_umem);
if (ret)
return ret;
free(dr_qp->db);
free(dr_qp->sq.wqe_head);
free(dr_qp->buf.buf);
free(dr_qp);
return 0;
}
static void dr_set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
uint64_t remote_addr, uint32_t rkey)
{
rseg->raddr = htobe64(remote_addr);
rseg->rkey = htobe32(rkey);
rseg->reserved = 0;
}
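/*
 * Ring the send doorbell: once the descriptors are visible (barrier), the
 * doorbell record is updated with the current producer index and the first
 * 8 bytes of the control segment are written to the UAR register, which is
 * presumably mapped as write-combining memory (hence the mmio_wc_start()/
 * mmio_flush_writes() pair around the write).
 */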
static void dr_post_send_db(struct dr_qp *dr_qp, int size, void *ctrl)
{
dr_qp->sq.head += 2; /* RDMA_WRITE + RDMA_READ */
/*
 * Make sure that descriptors are written before
 * updating the doorbell record and ringing the doorbell
 */
udma_to_device_barrier();
dr_qp->db[MLX5_SND_DBR] = htobe32(dr_qp->sq.cur_post & 0xffff);
/* Make sure that the doorbell write happens before the write to the
 * WC-mapped UAR register below
 */
mmio_wc_start();
mmio_write64_be((uint8_t *)dr_qp->uar->reg_addr, *(__be64 *)ctrl);
mmio_flush_writes();
}
static void dr_set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg,
struct dr_data_seg *data_seg)
{
dseg->byte_count = htobe32(data_seg->length);
dseg->lkey = htobe32(data_seg->lkey);
dseg->addr = htobe64(data_seg->addr);
}
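/*
 * Build an inline data segment. The payload is copied right after the
 * inline header; if the copy would run past the end of the SQ buffer it is
 * split and continues from sq_start (the work queue is a ring). The
 * resulting size is reported in 16-byte units, as expected by the control
 * segment's ds count.
 */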
static int dr_set_data_inl_seg(struct dr_qp *dr_qp,
struct dr_data_seg *data_seg,
void *wqe, uint32_t opcode, int *sz)
{
struct mlx5_wqe_inline_seg *seg;
void *qend = dr_qp->sq.qend;
int inl = 0;
void *addr;
int copy;
int len;
seg = wqe;
wqe += sizeof(*seg);
addr = (void *)(unsigned long)(data_seg->addr);
len = data_seg->length;
inl += len;
if (unlikely(wqe + len > qend)) {
copy = qend - wqe;
memcpy(wqe, addr, copy);
addr += copy;
len -= copy;
wqe = dr_qp->sq_start;
}
memcpy(wqe, addr, len);
wqe += len;
if (likely(inl)) {
seg->byte_count = htobe32(inl | MLX5_INLINE_SEG);
*sz = align(inl + sizeof(seg->byte_count), 16) / 16;
} else {
*sz = 0;
}
return 0;
}
static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *ctrl,
struct dr_data_seg *data_seg)
{
*(uint32_t *)((void *)ctrl + 8) = 0;
ctrl->imm = 0;
ctrl->fm_ce_se = data_seg->send_flags & IBV_SEND_SIGNALED ?
MLX5_WQE_CTRL_CQ_UPDATE : 0;
}
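/*
 * Build one RDMA WQE (ctrl + raddr + data/inline segments). 'size' is
 * accumulated in 16-byte units and ends up in qpn_ds, while
 * opmod_idx_opcode carries the low 16 bits of the producer index and the
 * opcode. cur_post then advances by the number of 64-byte basic blocks the
 * WQE occupies, e.g. a 48-byte WQE (size = 3) still consumes one
 * MLX5_SEND_WQE_BB.
 */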
static void dr_rdma_segments(struct dr_qp *dr_qp, uint64_t remote_addr,
uint32_t rkey, struct dr_data_seg *data_seg,
uint32_t opcode, int nreq)
{
struct mlx5_wqe_ctrl_seg *ctrl = NULL;
void *qend = dr_qp->sq.qend;
unsigned idx;
int size = 0;
void *seg;
idx = dr_qp->sq.cur_post & (dr_qp->sq.wqe_cnt - 1);
ctrl = dr_qp->sq_start + (idx << MLX5_SEND_WQE_SHIFT);
seg = ctrl;
dr_set_ctrl_seg(ctrl, data_seg);
seg += sizeof(*ctrl);
size = sizeof(*ctrl) / 16;
dr_set_raddr_seg(seg, remote_addr, rkey);
seg += sizeof(struct mlx5_wqe_raddr_seg);
size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
if (data_seg->send_flags & IBV_SEND_INLINE) {
int uninitialized_var(sz);
dr_set_data_inl_seg(dr_qp, data_seg, seg, opcode, &sz);
size += sz;
} else {
if (unlikely(seg == qend))
seg = dr_qp->sq_start;
dr_set_data_ptr_seg(seg, data_seg);
size += sizeof(struct mlx5_wqe_data_seg) / 16;
}
ctrl->opmod_idx_opcode =
htobe32(((dr_qp->sq.cur_post & 0xffff) << 8) | opcode);
ctrl->qpn_ds = htobe32(size | (dr_qp->obj->object_id << 8));
dr_qp->sq.wqe_head[idx] = dr_qp->sq.head + nreq;
dr_qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
if (nreq)
dr_post_send_db(dr_qp, size, ctrl);
}
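/*
 * Each post is an RDMA_WRITE immediately followed by an RDMA_READ of the
 * same length; the read-back appears to serve as a way to confirm that the
 * write has actually landed in ICM. Only the second call (nreq = 1) rings
 * the doorbell, so both WQEs are submitted together.
 */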
static void dr_post_send(struct dr_qp *dr_qp, struct postsend_info *send_info)
{
dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
&send_info->write, MLX5_OPCODE_RDMA_WRITE, 0);
dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
&send_info->read, MLX5_OPCODE_RDMA_READ, 1);
}
/*
 * dr_send_fill_and_append_ste_send_info: Add data to be sent with the send_list
 * parameters:
 * @ste - The STE that this data is attached to
 * @size - Size of the data to write
 * @offset - Offset of the data from the start of the hw_ste entry
 * @data - The data
 * @ste_info - The ste_info to be sent with the send_list
 * @send_list - The list to append to
 * @copy_data - If true, the data is copied into ste_info because it is not
 * backed up anywhere else (e.g. during re-hash).
 * If false, the data may still be updated after it was added to
 * the list.
 */
void dr_send_fill_and_append_ste_send_info(struct dr_ste *ste, uint16_t size,
uint16_t offset, uint8_t *data,
struct dr_ste_send_info *ste_info,
struct list_head *send_list,
bool copy_data)
{
ste_info->size = size;
ste_info->ste = ste;
ste_info->offset = offset;
if (copy_data) {
memcpy(ste_info->data_cont, data, size);
ste_info->data = ste_info->data_cont;
} else {
ste_info->data = data;
}
list_add_tail(send_list, &ste_info->send_list);
}
static bool dr_is_device_fatal(struct mlx5dv_dr_domain *dmn)
{
struct mlx5_context *mlx5_ctx = to_mctx(dmn->ctx);
if (mlx5_ctx->flags & MLX5_CTX_FLAGS_FATAL_STATE)
return true;
return false;
}
/*
 * The function consumes one WC at a time, unless the queue is full, which
 * means the HW is a full queue length behind the SW; in that case the
 * function drains the CQ until it is empty.
 */
static int dr_handle_pending_wc(struct mlx5dv_dr_domain *dmn,
struct dr_send_ring *send_ring)
{
bool is_drain = false;
int ne;
if (send_ring->pending_wqe >= send_ring->signal_th) {
/* Queue is full, start draining it */
if (send_ring->pending_wqe >= dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
is_drain = true;
do {
/*
 * On IBV_EVENT_DEVICE_FATAL success is returned to let
 * the application free its resources successfully
 */
if (dr_is_device_fatal(dmn))
return 0;
ne = dr_poll_cq(&send_ring->cq, 1);
if (ne < 0) {
dr_dbg(dmn, "poll CQ failed\n");
return ne;
} else if (ne == 1) {
send_ring->pending_wqe -= send_ring->signal_th;
}
} while (is_drain && send_ring->pending_wqe);
}
return 0;
}
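/*
 * Fill the write/read data segments for one post. pending_wqe is advanced
 * by two (one write + one read WQE) and a completion is requested whenever
 * it crosses a multiple of signal_th, which matches the
 * "pending_wqe -= signal_th" accounting done per polled CQE in
 * dr_handle_pending_wc(). A write whose lkey is 0 is sent inline.
 */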
static void dr_fill_data_segs(struct dr_send_ring *send_ring,
struct postsend_info *send_info)
{
unsigned int inline_flag;
send_ring->pending_wqe++;
if (!send_info->write.lkey)
inline_flag = IBV_SEND_INLINE;
else
inline_flag = 0;
send_info->write.send_flags = inline_flag;
if (send_ring->pending_wqe % send_ring->signal_th == 0)
send_info->write.send_flags |= IBV_SEND_SIGNALED;
send_ring->pending_wqe++;
send_info->read.length = send_info->write.length;
if (inline_flag) {
/* Read into the dedicated sync buffer */
send_info->read.addr = (uintptr_t)send_ring->sync_buff;
send_info->read.lkey = send_ring->sync_mr->lkey;
} else {
/* Read into the same write area */
send_info->read.addr = (uintptr_t)send_info->write.addr;
send_info->read.lkey = send_ring->mr->lkey;
}
if (send_ring->pending_wqe % send_ring->signal_th == 0)
send_info->read.send_flags = IBV_SEND_SIGNALED;
else
send_info->read.send_flags = 0;
}
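/*
 * Post one write+read pair. Payloads within the QP's inline limit are sent
 * inline straight from the caller's buffer; larger payloads are first
 * copied into a slot of the registered ring buffer, chosen by tx_head
 * modulo signal_th, which presumably gives completions enough time to be
 * consumed before a slot is overwritten again.
 */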
static int dr_postsend_icm_data(struct mlx5dv_dr_domain *dmn,
struct postsend_info *send_info)
{
struct dr_send_ring *send_ring = dmn->send_ring;
uint32_t buff_offset;
int ret;
ret = dr_handle_pending_wc(dmn, send_ring);
if (ret)
return ret;
if (send_info->write.length > dmn->info.max_inline_size) {
buff_offset = (send_ring->tx_head & (dmn->send_ring->signal_th - 1)) *
send_ring->max_post_send_size;
/* Copy to ring mr */
memcpy(send_ring->buf + buff_offset,
(void *) (uintptr_t)send_info->write.addr,
send_info->write.length);
send_info->write.addr = (uintptr_t)send_ring->buf + buff_offset;
send_info->write.lkey = send_ring->mr->lkey;
}
send_ring->tx_head++;
dr_fill_data_segs(send_ring, send_info);
dr_post_send(send_ring->qp, send_info);
return 0;
}
static int dr_get_tbl_copy_details(struct mlx5dv_dr_domain *dmn,
struct dr_ste_htbl *htbl,
uint8_t **data,
uint32_t *byte_size,
int *iterations,
int *num_stes)
{
int alloc_size;
if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
*iterations = htbl->chunk->byte_size / dmn->send_ring->max_post_send_size;
*byte_size = dmn->send_ring->max_post_send_size;
alloc_size = *byte_size;
*num_stes = *byte_size / DR_STE_SIZE;
} else {
*iterations = 1;
*num_stes = htbl->chunk->num_of_entries;
alloc_size = *num_stes * DR_STE_SIZE;
}
*data = calloc(1, alloc_size);
if (!*data) {
errno = ENOMEM;
return errno;
}
return 0;
}
/*
 * dr_send_postsend_ste: write 'size' bytes at 'offset' into the HW ICM.
 *
 * Input:
 * dmn - Domain
 * ste - The STE struct that contains the data (at least part of it)
 * data - The actual data to send
 * size - Size of the data to write
 * offset - The offset from the ICM-mapped data to start writing to,
 * used when writing only part of the buffer.
 *
 * Return: 0 on success.
 */
int dr_send_postsend_ste(struct mlx5dv_dr_domain *dmn, struct dr_ste *ste,
uint8_t *data, uint16_t size, uint16_t offset)
{
struct postsend_info send_info = {};
send_info.write.addr = (uintptr_t) data;
send_info.write.length = size;
send_info.write.lkey = 0;
send_info.remote_addr = dr_ste_get_mr_addr(ste) + offset;
send_info.rkey = ste->htbl->chunk->rkey;
return dr_postsend_icm_data(dmn, &send_info);
}
int dr_send_postsend_htbl(struct mlx5dv_dr_domain *dmn, struct dr_ste_htbl *htbl,
uint8_t *formated_ste, uint8_t *mask)
{
uint32_t byte_size = htbl->chunk->byte_size;
int i, j, num_stes_per_iter, iterations;
uint8_t *data;
int ret;
ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
&iterations, &num_stes_per_iter);
if (ret)
return ret;
/* Send the data 'iterations' times */
for (i = 0; i < iterations; i++) {
uint32_t ste_index = i * (byte_size / DR_STE_SIZE);
struct postsend_info send_info = {};
/* Copy all STEs into the data buffer; the bit_mask needs to be added */
for (j = 0; j < num_stes_per_iter; j++) {
if (dr_ste_is_not_valid_entry(htbl->ste_arr[ste_index + j].hw_ste)) {
memcpy(data + (j * DR_STE_SIZE),
formated_ste, DR_STE_SIZE);
} else {
/* Copy data */
memcpy(data + (j * DR_STE_SIZE), htbl->ste_arr[ste_index + j].hw_ste,
DR_STE_SIZE_REDUCED);
/* Copy bit_mask */
memcpy(data + (j * DR_STE_SIZE) + DR_STE_SIZE_REDUCED,
mask, DR_STE_SIZE_MASK);
}
}
send_info.write.addr = (uintptr_t) data;
send_info.write.length = byte_size;
send_info.write.lkey = 0;
send_info.remote_addr = dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
send_info.rkey = htbl->chunk->rkey;
ret = dr_postsend_icm_data(dmn, &send_info);
if (ret)
goto out_free;
}
out_free:
free(data);
return ret;
}
/* Initialize htbl with default STEs */
int dr_send_postsend_formated_htbl(struct mlx5dv_dr_domain *dmn,
struct dr_ste_htbl *htbl,
uint8_t *ste_init_data,
bool update_hw_ste)
{
uint32_t byte_size = htbl->chunk->byte_size;
int i, num_stes, iterations, ret;
uint8_t *data;
ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
&iterations, &num_stes);
if (ret)
return ret;
for (i = 0; i < num_stes; i++) {
uint8_t *copy_dst;
/* Copy the same STE into the data buffer */
copy_dst = data + i * DR_STE_SIZE;
memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
if (update_hw_ste) {
/* Copy the reduced ste to hash table ste_arr */
copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
}
}
/* Send the data 'iterations' times */
for (i = 0; i < iterations; i++) {
uint32_t ste_index = i * (byte_size / DR_STE_SIZE);
struct postsend_info send_info = {};
send_info.write.addr = (uintptr_t) data;
send_info.write.length = byte_size;
send_info.write.lkey = 0;
send_info.remote_addr = dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
send_info.rkey = htbl->chunk->rkey;
ret = dr_postsend_icm_data(dmn, &send_info);
if (ret)
goto out_free;
}
out_free:
free(data);
return ret;
}
int dr_send_postsend_action(struct mlx5dv_dr_domain *dmn,
struct mlx5dv_dr_action *action)
{
struct postsend_info send_info = {};
int ret;
send_info.write.addr = (uintptr_t) action->rewrite.data;
send_info.write.length = action->rewrite.num_of_actions *
DR_MODIFY_ACTION_SIZE;
send_info.write.lkey = 0;
send_info.remote_addr = action->rewrite.chunk->mr_addr;
send_info.rkey = action->rewrite.chunk->rkey;
pthread_mutex_lock(&dmn->mutex);
ret = dr_postsend_icm_data(dmn, &send_info);
pthread_mutex_unlock(&dmn->mutex);
return ret;
}
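/*
 * Bring the DEVX QP through RST -> INIT -> RTR -> RTS. The destination QP
 * number is the QP's own number and the locally queried GID is used as the
 * DGID, so this is effectively a loopback RC connection used only for
 * writing/reading the ICM.
 */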
static int dr_prepare_qp_to_rts(struct mlx5dv_dr_domain *dmn)
{
struct dr_devx_qp_rts_attr rts_attr = {};
struct dr_devx_qp_rtr_attr rtr_attr = {};
struct dr_qp *dr_qp = dmn->send_ring->qp;
enum ibv_mtu mtu = IBV_MTU_1024;
uint16_t gid_index = 0;
int port = 1;
int ret;
/* Init */
ret = dr_devx_modify_qp_rst2init(dmn->ctx, dr_qp->obj, port);
if (ret) {
dr_dbg(dmn, "Failed to modify QP to INIT, ret: %d\n", ret);
return ret;
}
/* RTR */
ret = dr_devx_query_gid(dmn->ctx, port, gid_index, &rtr_attr.dgid_attr);
if (ret) {
dr_dbg(dmn, "can't read sgid of index %d\n", gid_index);
return ret;
}
rtr_attr.mtu = mtu;
rtr_attr.qp_num = dr_qp->obj->object_id;
rtr_attr.min_rnr_timer = 12;
rtr_attr.port_num = port;
rtr_attr.sgid_index = gid_index;
ret = dr_devx_modify_qp_init2rtr(dmn->ctx, dr_qp->obj, &rtr_attr);
if (ret) {
dr_dbg(dmn, "Failed to modify QP to RTR, ret: %d\n", ret);
return ret;
}
/* RTS */
rts_attr.timeout = 14;
rts_attr.retry_cnt = 7;
rts_attr.rnr_retry = 7;
ret = dr_devx_modify_qp_rtr2rts(dmn->ctx, dr_qp->obj, &rts_attr);
if (ret) {
dr_dbg(dmn, "Failed to modify QP to RTS, ret: %d\n", ret);
return ret;
}
return 0;
}
/* Each domain has its own IB resources */
int dr_send_ring_alloc(struct mlx5dv_dr_domain *dmn)
{
struct dr_qp_init_attr init_attr = {};
struct mlx5dv_pd mlx5_pd = {};
struct mlx5dv_cq mlx5_cq = {};
int cq_size, page_size;
struct mlx5dv_obj obj;
int size;
int access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_READ;
int ret;
dmn->send_ring = calloc(1, sizeof(*dmn->send_ring));
if (!dmn->send_ring) {
dr_dbg(dmn, "Couldn't allocate send-ring\n");
errno = ENOMEM;
return errno;
}
cq_size = QUEUE_SIZE + 1;
dmn->send_ring->cq.ibv_cq = ibv_create_cq(dmn->ctx, cq_size, NULL, NULL, 0);
if (!dmn->send_ring->cq.ibv_cq) {
dr_dbg(dmn, "Failed to create CQ with %u entries\n", cq_size);
ret = ENODEV;
errno = ENODEV;
goto free_send_ring;
}
obj.cq.in = dmn->send_ring->cq.ibv_cq;
obj.cq.out = &mlx5_cq;
ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ);
if (ret)
goto clean_cq;
dmn->send_ring->cq.buf = mlx5_cq.buf;
dmn->send_ring->cq.db = mlx5_cq.dbrec;
dmn->send_ring->cq.ncqe = mlx5_cq.cqe_cnt;
dmn->send_ring->cq.cqe_sz = mlx5_cq.cqe_size;
obj.pd.in = dmn->pd;
obj.pd.out = &mlx5_pd;
ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_PD);
if (ret)
goto clean_cq;
init_attr.cqn = mlx5_cq.cqn;
init_attr.pdn = mlx5_pd.pdn;
init_attr.uar = dmn->uar;
init_attr.cap.max_send_wr = QUEUE_SIZE;
init_attr.cap.max_recv_wr = 1;
init_attr.cap.max_send_sge = 1;
init_attr.cap.max_recv_sge = 1;
init_attr.cap.max_inline_data = DR_STE_SIZE;
dmn->send_ring->qp = dr_create_rc_qp(dmn->ctx, &init_attr);
if (!dmn->send_ring->qp) {
dr_dbg(dmn, "Couldn't create QP\n");
ret = errno;
goto clean_cq;
}
dmn->send_ring->cq.qp = dmn->send_ring->qp;
dmn->info.max_send_wr = QUEUE_SIZE;
dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
DR_STE_SIZE);
dmn->send_ring->signal_th = dmn->info.max_send_wr / SIGNAL_PER_DIV_QUEUE;
/* Prepare the QP for use */
ret = dr_prepare_qp_to_rts(dmn);
if (ret) {
dr_dbg(dmn, "Couldn't prepare QP\n");
goto clean_qp;
}
dmn->send_ring->max_post_send_size =
dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K, DR_ICM_TYPE_STE);
/* Allocate the maximum size as the write buffer */
size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
page_size = sysconf(_SC_PAGESIZE);
ret = posix_memalign(&dmn->send_ring->buf, page_size, size);
if (ret) {
dr_dbg(dmn, "Couldn't allocate send-ring buf.\n");
errno = ret;
goto clean_qp;
}
memset(dmn->send_ring->buf, 0, size);
dmn->send_ring->buf_size = size;
dmn->send_ring->mr = ibv_reg_mr(dmn->pd, dmn->send_ring->buf, size,
access_flags);
if (!dmn->send_ring->mr) {
dr_dbg(dmn, "Couldn't register send-ring MR\n");
ret = errno;
goto free_mem;
}
dmn->send_ring->sync_mr = ibv_reg_mr(dmn->pd, dmn->send_ring->sync_buff,
MIN_READ_SYNC,
IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE);
if (!dmn->send_ring->sync_mr) {
dr_dbg(dmn, "Couldn't register sync mr\n");
ret = errno;
goto clean_mr;
}
return 0;
clean_mr:
ibv_dereg_mr(dmn->send_ring->mr);
free_mem:
free(dmn->send_ring->buf);
clean_qp:
dr_destroy_qp(dmn->send_ring->qp);
clean_cq:
ibv_destroy_cq(dmn->send_ring->cq.ibv_cq);
free_send_ring:
free(dmn->send_ring);
return ret;
}
void dr_send_ring_free(struct dr_send_ring *send_ring)
{
dr_destroy_qp(send_ring->qp);
ibv_destroy_cq(send_ring->cq.ibv_cq);
ibv_dereg_mr(send_ring->sync_mr);
ibv_dereg_mr(send_ring->mr);
free(send_ring->buf);
free(send_ring);
}
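/*
 * Force a full drain of the send ring: num_of_sends_req below is
 * signal_th * TH_NUMS_TO_DRAIN / 2 posts, and each post queues two WQEs,
 * so pending_wqe is pushed up to signal_th * TH_NUMS_TO_DRAIN, the drain
 * threshold at which dr_handle_pending_wc() keeps polling until the CQ
 * is empty.
 */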
int dr_send_ring_force_drain(struct mlx5dv_dr_domain *dmn)
{
struct dr_send_ring *send_ring = dmn->send_ring;
struct postsend_info send_info = {};
uint8_t data[DR_STE_SIZE];
int i, num_of_sends_req;
int ret;
/* Sending this number of requests guarantees that the queue is drained */
num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;
/* Send fake requests forcing the last to be signaled */
send_info.write.addr = (uintptr_t) data;
send_info.write.length = DR_STE_SIZE;
send_info.write.lkey = 0;
/* Use the sync_mr for the write/read */
send_info.remote_addr = (uintptr_t) send_ring->sync_mr->addr;
send_info.rkey = send_ring->sync_mr->rkey;
for (i = 0; i < num_of_sends_req; i++) {
ret = dr_postsend_icm_data(dmn, &send_info);
if (ret)
return ret;
}
ret = dr_handle_pending_wc(dmn, send_ring);
return ret;
}