/*
 * Copyright (c) 2015-2016  QLogic Corporation
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <config.h>

#include <assert.h>
#include <endian.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <errno.h>
#include <pthread.h>
#include <sys/mman.h>
#include <stdbool.h>

#include "qelr.h"
#include "qelr_chain.h"
#include "qelr_verbs.h"
#include <util/compiler.h>
#include <util/util.h>
#include <util/mmio.h>

#define QELR_SQE_ELEMENT_SIZE	(sizeof(struct rdma_sq_sge))
#define QELR_RQE_ELEMENT_SIZE	(sizeof(struct rdma_rq_sge))
#define QELR_CQE_SIZE		(sizeof(union rdma_cqe))

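/*
 * Software producer/consumer index helpers for the shadow SQ/RQ arrays.
 * Both indices wrap at max_wr; wqe_cons is a free-running counter that is
 * compared against the hardware consumer reported in requester CQEs.
 */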
static void qelr_inc_sw_cons_u16(struct qelr_qp_hwq_info *info)
{
	info->cons = (info->cons + 1) % info->max_wr;
	info->wqe_cons++;
}

static void qelr_inc_sw_prod_u16(struct qelr_qp_hwq_info *info)
{
	info->prod = (info->prod + 1) % info->max_wr;
}

static inline int qelr_wq_is_full(struct qelr_qp_hwq_info *info)
{
	return (((info->prod + 1) % info->max_wr) == info->cons);
}

int qelr_query_device(struct ibv_context *context,
		      struct ibv_device_attr *attr)
{
	struct ibv_query_device cmd;
	uint64_t fw_ver;
	unsigned int major, minor, revision, eng;
	int status;

	bzero(attr, sizeof(*attr));
	status = ibv_cmd_query_device(context, attr, &fw_ver, &cmd,
				      sizeof(cmd));

	major = (fw_ver >> 24) & 0xff;
	minor = (fw_ver >> 16) & 0xff;
	revision = (fw_ver >> 8) & 0xff;
	eng = fw_ver & 0xff;

	snprintf(attr->fw_ver, sizeof(attr->fw_ver),
		 "%d.%d.%d.%d", major, minor, revision, eng);

	return status;
}

int qelr_query_port(struct ibv_context *context, uint8_t port,
		    struct ibv_port_attr *attr)
{
	struct ibv_query_port cmd;
	int status;

	status = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
	return status;
}

struct ibv_pd *qelr_alloc_pd(struct ibv_context *context)
{
	struct qelr_alloc_pd cmd;
	struct qelr_alloc_pd_resp resp;
	struct qelr_pd *pd;
	struct qelr_devctx *cxt = get_qelr_ctx(context);

	pd = malloc(sizeof(*pd));
	if (!pd)
		return NULL;

	bzero(pd, sizeof(*pd));
	memset(&cmd, 0, sizeof(cmd));

	if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd.ibv_cmd, sizeof(cmd),
			     &resp.ibv_resp, sizeof(resp))) {
		free(pd);
		return NULL;
	}

	pd->pd_id = resp.pd_id;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_INIT, "Allocated pd: %d\n", pd->pd_id);

	return &pd->ibv_pd;
}

int qelr_dealloc_pd(struct ibv_pd *ibpd)
{
	int rc = 0;
	struct qelr_pd *pd = get_qelr_pd(ibpd);
	struct qelr_devctx *cxt = get_qelr_ctx(ibpd->context);

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_INIT, "Deallocated pd: %d\n",
		   pd->pd_id);

	rc = ibv_cmd_dealloc_pd(ibpd);

	if (rc)
		return rc;

	free(pd);

	return rc;
}

struct ibv_mr *qelr_reg_mr(struct ibv_pd *ibpd, void *addr, size_t len,
			   uint64_t hca_va, int access)
{
	struct qelr_mr *mr;
	struct ibv_reg_mr cmd;
	struct qelr_reg_mr_resp resp;
	struct qelr_pd *pd = get_qelr_pd(ibpd);
	struct qelr_devctx *cxt = get_qelr_ctx(ibpd->context);

	mr = malloc(sizeof(*mr));
	if (!mr)
		return NULL;

	bzero(mr, sizeof(*mr));

	if (ibv_cmd_reg_mr(ibpd, addr, len, hca_va, access, &mr->vmr, &cmd,
			   sizeof(cmd), &resp.ibv_resp, sizeof(resp))) {
		free(mr);
		return NULL;
	}

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_MR,
		   "MR Register %p completed successfully pd_id=%d addr=%p len=%zu access=%d lkey=%x rkey=%x\n",
		   mr, pd->pd_id, addr, len, access, mr->vmr.ibv_mr.lkey,
		   mr->vmr.ibv_mr.rkey);

	return &mr->vmr.ibv_mr;
}

int qelr_dereg_mr(struct verbs_mr *vmr)
{
	struct qelr_devctx *cxt = get_qelr_ctx(vmr->ibv_mr.context);
	int rc;

	rc = ibv_cmd_dereg_mr(vmr);
	if (rc)
		return rc;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_MR,
		   "MR DERegister %p completed successfully\n", vmr);

	free(vmr);

	return 0;
}

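/*
 * Advance the CQ to the next CQE. The toggle value flips each time the last
 * element of the chain is consumed, which lets is_valid_cqe() distinguish
 * freshly written CQEs from stale ones after the chain wraps.
 */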
static void consume_cqe(struct qelr_cq *cq)
{
	if (cq->latest_cqe == cq->toggle_cqe)
		cq->chain_toggle ^= RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK;

	cq->latest_cqe = qelr_chain_consume(&cq->chain);
}

static inline int qelr_cq_entries(int entries)
{
	/* FW requires an extra entry */
	return entries + 1;
}

struct ibv_cq *qelr_create_cq(struct ibv_context *context, int cqe,
			      struct ibv_comp_channel *channel,
			      int comp_vector)
{
	struct qelr_devctx *cxt = get_qelr_ctx(context);
	struct qelr_create_cq_resp resp = {};
	struct qelr_create_cq cmd;
	struct qelr_cq *cq;
	int chain_size;
	int rc;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ,
		   "create cq: context=%p, cqe=%d, channel=%p, comp_vector=%d\n",
		   context, cqe, channel, comp_vector);

	if (!cqe || cqe > cxt->max_cqes) {
		DP_ERR(cxt->dbg_fp,
		       "create cq: failed. attempted to allocate %d cqes but valid range is 1...%d\n",
		       cqe, cxt->max_cqes);
		return NULL;
	}

	/* allocate CQ structure */
	cq = calloc(1, sizeof(*cq));
	if (!cq)
		return NULL;

	/* allocate CQ buffer */
	chain_size = qelr_cq_entries(cqe) * QELR_CQE_SIZE;
	rc = qelr_chain_alloc(&cq->chain, chain_size, cxt->kernel_page_size,
			      QELR_CQE_SIZE);
	if (rc)
		goto err_0;

	cmd.addr = (uintptr_t) cq->chain.first_addr;
	cmd.len = cq->chain.size;
	rc = ibv_cmd_create_cq(context, cqe, channel, comp_vector,
			       &cq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd),
			       &resp.ibv_resp, sizeof(resp));
	if (rc) {
		DP_ERR(cxt->dbg_fp, "create cq: failed with rc = %d\n", rc);
		goto err_1;
	}

	/* map the doorbell and prepare its data */
	cq->db.data.icid = htole16(resp.icid);
	cq->db.data.params = DB_AGG_CMD_SET <<
		RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT;
	cq->db_addr = cxt->db_addr + resp.db_offset;

	if (resp.db_rec_addr) {
		cq->db_rec_map = mmap(NULL, cxt->kernel_page_size, PROT_WRITE,
				      MAP_SHARED, context->cmd_fd,
				      resp.db_rec_addr);
		if (cq->db_rec_map == MAP_FAILED) {
			int errsv = errno;

			DP_ERR(cxt->dbg_fp,
			       "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n",
			       resp.db_rec_addr, cxt->kernel_page_size,
			       context->cmd_fd, errsv);
			goto err_1;
		}
		cq->db_rec_addr = cq->db_rec_map;
	} else {
		/* Kernel doesn't support doorbell recovery. Point to dummy
		 * location instead
		 */
		cq->db_rec_addr = &cxt->db_rec_addr_dummy;
	}

	/* point to the very last element, passing this we will toggle */
	cq->toggle_cqe = qelr_chain_get_last_elem(&cq->chain);
	cq->chain_toggle = RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK;
	cq->latest_cqe = NULL; /* must be different from chain_toggle */
	consume_cqe(cq);

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ,
		   "create cq: successfully created %p\n", cq);

	return &cq->ibv_cq;

err_1:
	qelr_chain_free(&cq->chain);
err_0:
	free(cq);

	return NULL;
}

int qelr_destroy_cq(struct ibv_cq *ibv_cq)
{
	struct qelr_devctx *cxt = get_qelr_ctx(ibv_cq->context);
	struct qelr_cq *cq = get_qelr_cq(ibv_cq);
	int rc;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "destroy cq: %p\n", cq);

	rc = ibv_cmd_destroy_cq(ibv_cq);
	if (rc) {
		DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ,
		           "destroy cq: failed to destroy %p, got %d.\n", cq,
			   rc);
		return rc;
	}

	qelr_chain_free(&cq->chain);
	if (cq->db_rec_map)
		munmap(cq->db_rec_map, cxt->kernel_page_size);

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ,
		   "destroy cq: successfully destroyed %p\n", cq);

	free(cq);

	return 0;
}

int qelr_query_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr)
{
	struct ibv_query_srq cmd;

	return ibv_cmd_query_srq(ibv_srq, attr, &cmd, sizeof(cmd));
}

int qelr_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr,
		    int attr_mask)
{
	struct ibv_modify_srq cmd;

	return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof(cmd));
}

static void qelr_destroy_srq_buffers(struct ibv_srq *ibv_srq)
{
	struct qelr_srq *srq = get_qelr_srq(ibv_srq);
	uint32_t *virt_prod_pair_addr;
	uint32_t prod_size;

	qelr_chain_free(&srq->hw_srq.chain);

	virt_prod_pair_addr = srq->hw_srq.virt_prod_pair_addr;
	prod_size = sizeof(struct rdma_srq_producers);

	ibv_dofork_range(virt_prod_pair_addr, prod_size);
	munmap(virt_prod_pair_addr, prod_size);
}

int qelr_destroy_srq(struct ibv_srq *ibv_srq)
{
	struct qelr_srq *srq = get_qelr_srq(ibv_srq);
	int ret;

	ret = ibv_cmd_destroy_srq(ibv_srq);
	if (ret)
		return ret;

	qelr_destroy_srq_buffers(ibv_srq);
	free(srq);

	return 0;
}

static void qelr_create_srq_configure_req(struct qelr_srq *srq,
					  struct qelr_create_srq *req)
{
	req->srq_addr = (uintptr_t)srq->hw_srq.chain.first_addr;
	req->srq_len = srq->hw_srq.chain.size;
	req->prod_pair_addr = (uintptr_t)srq->hw_srq.virt_prod_pair_addr;
}

static int qelr_create_srq_buffers(struct qelr_devctx *cxt,
					  struct qelr_srq *srq,
					  struct ibv_srq_init_attr *attrs)
{
	uint32_t max_wr, max_sges;
	int chain_size, prod_size;
	void *addr;
	int rc;

	max_wr = attrs->attr.max_wr;
	if (!max_wr)
		return -EINVAL;

	max_wr = min_t(uint32_t, max_wr, cxt->max_srq_wr);
	max_sges = max_wr * (cxt->sges_per_srq_wr + 1); /* +1 for header */
	chain_size = max_sges * QELR_RQE_ELEMENT_SIZE;

	rc = qelr_chain_alloc(&srq->hw_srq.chain, chain_size,
			      cxt->kernel_page_size, QELR_RQE_ELEMENT_SIZE);
	if (rc) {
		DP_ERR(cxt->dbg_fp,
		       "create srq: failed to map srq, got %d", rc);
		return rc;
	}

	prod_size = sizeof(struct rdma_srq_producers);
	addr = mmap(NULL, prod_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1,
		    0);
	if (addr == MAP_FAILED) {
		DP_ERR(cxt->dbg_fp,
		       "create srq: failed to map producer, got %d", errno);
		qelr_chain_free(&srq->hw_srq.chain);
		return errno;
	}

	rc = ibv_dontfork_range(addr, prod_size);
	if (rc) {
		munmap(addr, prod_size);
		qelr_chain_free(&srq->hw_srq.chain);
		return rc;
	}

	srq->hw_srq.virt_prod_pair_addr = addr;
	srq->hw_srq.max_sges = cxt->sges_per_srq_wr;
	srq->hw_srq.max_wr = max_wr;

	return 0;
}

struct ibv_srq *qelr_create_srq(struct ibv_pd *pd,
				struct ibv_srq_init_attr *init_attr)
{
	struct qelr_devctx *cxt = get_qelr_ctx(pd->context);
	struct qelr_create_srq req;
	struct qelr_create_srq_resp resp;
	struct qelr_srq *srq;
	int ret;

	srq = calloc(1, sizeof(*srq));
	if (!srq)
		return NULL;

	ret = qelr_create_srq_buffers(cxt, srq, init_attr);
	if (ret) {
		free(srq);
		return NULL;
	}

	pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE);
	qelr_create_srq_configure_req(srq, &req);
	ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, init_attr, &req.ibv_cmd,
				    sizeof(req), &resp.ibv_resp, sizeof(resp));
	if (ret) {
		qelr_destroy_srq_buffers(&srq->ibv_srq);
		free(srq);
		return NULL;
	}

	return &srq->ibv_srq;
}

static void qelr_free_rq(struct qelr_qp *qp)
{
	free(qp->rqe_wr_id);
}

static void qelr_free_sq(struct qelr_qp *qp)
{
	free(qp->wqe_wr_id);
}

static void qelr_chain_free_sq(struct qelr_qp *qp)
{
	qelr_chain_free(&qp->sq.chain);
}

static void qelr_chain_free_rq(struct qelr_qp *qp)
{
	qelr_chain_free(&qp->rq.chain);
}

static inline int qelr_create_qp_buffers_sq(struct qelr_devctx *cxt,
					    struct qelr_qp *qp,
					    struct ibv_qp_init_attr *attrs)
{
	uint32_t max_send_wr, max_send_sges, max_send_buf;
	int chain_size;
	int rc;

	/* SQ */
	max_send_wr = attrs->cap.max_send_wr;
	max_send_wr = max_t(uint32_t, max_send_wr, 1);
	max_send_wr = min_t(uint32_t, max_send_wr, cxt->max_send_wr);
	max_send_sges = max_send_wr * cxt->sges_per_send_wr;
	max_send_buf = max_send_sges * QELR_SQE_ELEMENT_SIZE;

	chain_size = max_send_buf;
	rc = qelr_chain_alloc(&qp->sq.chain, chain_size, cxt->kernel_page_size,
			      QELR_SQE_ELEMENT_SIZE);
	if (rc)
		DP_ERR(cxt->dbg_fp, "create qp: failed to map SQ chain, got %d", rc);

	qp->sq.max_wr = max_send_wr;
	qp->sq.max_sges = cxt->sges_per_send_wr;

	return rc;
}

static inline int qelr_create_qp_buffers_rq(struct qelr_devctx *cxt,
					    struct qelr_qp *qp,
					    struct ibv_qp_init_attr *attrs)
{
	uint32_t max_recv_wr, max_recv_sges, max_recv_buf;
	int chain_size;
	int rc;

	/* RQ */
	max_recv_wr = attrs->cap.max_recv_wr;
	max_recv_wr = max_t(uint32_t, max_recv_wr, 1);
	max_recv_wr = min_t(uint32_t, max_recv_wr, cxt->max_recv_wr);
	max_recv_sges = max_recv_wr * cxt->sges_per_recv_wr;
	max_recv_buf = max_recv_sges * QELR_RQE_ELEMENT_SIZE;

	chain_size = max_recv_buf;
	rc = qelr_chain_alloc(&qp->rq.chain, chain_size, cxt->kernel_page_size,
			      QELR_RQE_ELEMENT_SIZE);
	if (rc)
		DP_ERR(cxt->dbg_fp, "create qp: failed to map RQ chain, got %d", rc);

	qp->rq.max_wr = max_recv_wr;
	qp->rq.max_sges = cxt->sges_per_recv_wr;

	return rc;
}

static inline int qelr_create_qp_buffers(struct qelr_devctx *cxt,
					 struct qelr_qp *qp,
					 struct ibv_qp_init_attr *attrs)
{
	int rc;

	rc = qelr_create_qp_buffers_sq(cxt, qp, attrs);
	if (rc)
		return rc;

	rc = qelr_create_qp_buffers_rq(cxt, qp, attrs);
	if (rc) {
		qelr_chain_free_sq(qp);
		if (qp->sq.db_rec_map)
			munmap(qp->sq.db_rec_map, cxt->kernel_page_size);
		return rc;
	}

	return 0;
}

static inline int qelr_configure_qp_sq(struct qelr_devctx *cxt,
				       struct qelr_qp *qp,
				       struct ibv_qp_init_attr *attrs,
				       struct qelr_create_qp_resp *resp)
{
	qp->sq.icid = resp->sq_icid;
	qp->sq.db_data.data.icid = htole16(resp->sq_icid);
	qp->sq.prod = 0;
	qp->sq.db = cxt->db_addr + resp->sq_db_offset;
	qp->sq.edpm_db = cxt->db_addr;
	if (resp->sq_db_rec_addr) {
		qp->sq.db_rec_map = mmap(NULL, cxt->kernel_page_size,
					 PROT_WRITE, MAP_SHARED,
					 cxt->ibv_ctx.context.cmd_fd,
					 resp->sq_db_rec_addr);

		if (qp->sq.db_rec_map == MAP_FAILED) {
			int errsv = errno;

			DP_ERR(cxt->dbg_fp,
			       "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n",
			       resp->sq_db_rec_addr, cxt->kernel_page_size,
			       cxt->ibv_ctx.context.cmd_fd, errsv);
			return -ENOMEM;
		}
		qp->sq.db_rec_addr = qp->sq.db_rec_map;
	} else {
		/* Kernel doesn't support doorbell recovery. Point to dummy
		 * location instead
		 */
		qp->sq.db_rec_addr = &cxt->db_rec_addr_dummy;
	}

	/* shadow SQ */
	qp->sq.max_wr++;	/* prod/cons method requires N+1 elements */
	qp->wqe_wr_id = calloc(qp->sq.max_wr, sizeof(*qp->wqe_wr_id));
	if (!qp->wqe_wr_id) {
		DP_ERR(cxt->dbg_fp,
		       "create qp: failed shadow SQ memory allocation\n");
		return -ENOMEM;
	}
	return 0;
}

static inline int qelr_configure_qp_rq(struct qelr_devctx *cxt,
				       struct qelr_qp *qp,
				       struct ibv_qp_init_attr *attrs,
				       struct qelr_create_qp_resp *resp)
{
	/* RQ */
	qp->rq.icid = resp->rq_icid;
	qp->rq.db_data.data.icid = htole16(resp->rq_icid);
	qp->rq.db = cxt->db_addr + resp->rq_db_offset;
	qp->rq.iwarp_db2 = cxt->db_addr + resp->rq_db2_offset;
	qp->rq.iwarp_db2_data.data.icid = htole16(qp->rq.icid);
	qp->rq.iwarp_db2_data.data.value = htole16(DQ_TCM_IWARP_POST_RQ_CF_CMD);
	qp->rq.prod = 0;

	if (resp->rq_db_rec_addr) {
		qp->rq.db_rec_map = mmap(NULL, cxt->kernel_page_size,
					 PROT_WRITE, MAP_SHARED,
					 cxt->ibv_ctx.context.cmd_fd,
					 resp->rq_db_rec_addr);
		if (qp->rq.db_rec_map == MAP_FAILED) {
			int errsv = errno;

			DP_ERR(cxt->dbg_fp,
			       "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n",
			       resp->rq_db_rec_addr, cxt->kernel_page_size,
			       cxt->ibv_ctx.context.cmd_fd, errsv);
			return -ENOMEM;
		}
		qp->rq.db_rec_addr = qp->rq.db_rec_map;
	} else {
		/* Kernel doesn't support doorbell recovery. Point to dummy
		 * location instead
		 */
		qp->rq.db_rec_addr = &cxt->db_rec_addr_dummy;
	}

	/* shadow RQ */
	qp->rq.max_wr++;	/* prod/cons method requires N+1 elements */
	qp->rqe_wr_id = calloc(qp->rq.max_wr, sizeof(*qp->rqe_wr_id));
	if (!qp->rqe_wr_id) {
		DP_ERR(cxt->dbg_fp,
		       "create qp: failed shadow RQ memory allocation\n");
		return -ENOMEM;
	}

	return 0;
}

static inline int qelr_configure_qp(struct qelr_devctx *cxt, struct qelr_qp *qp,
				    struct ibv_qp_init_attr *attrs,
				    struct qelr_create_qp_resp *resp)
{
	int rc;

	/* general */
	pthread_spin_init(&qp->q_lock, PTHREAD_PROCESS_PRIVATE);
	qp->qp_id = resp->qp_id;
	qp->state = QELR_QPS_RST;
	qp->sq_sig_all = attrs->sq_sig_all;
	qp->atomic_supported = resp->atomic_supported;

	rc = qelr_configure_qp_sq(cxt, qp, attrs, resp);
	if (rc)
		return rc;
	rc = qelr_configure_qp_rq(cxt, qp, attrs, resp);
	if (rc)
		qelr_free_sq(qp);

	return rc;
}

static inline void qelr_print_qp_init_attr(
		struct qelr_devctx *cxt,
		struct ibv_qp_init_attr *attr)
{
	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP,
		   "create qp: send_cq=%p, recv_cq=%p, srq=%p, max_inline_data=%d, max_recv_sge=%d, max_recv_wr=%d, max_send_sge=%d, max_send_wr=%d, qp_type=%d, sq_sig_all=%d\n",
		   attr->send_cq, attr->recv_cq, attr->srq,
		   attr->cap.max_inline_data, attr->cap.max_recv_sge,
		   attr->cap.max_recv_wr, attr->cap.max_send_sge,
		   attr->cap.max_send_wr, attr->qp_type, attr->sq_sig_all);
}

static inline void
qelr_create_qp_configure_sq_req(struct qelr_qp *qp,
				struct qelr_create_qp *req)
{
	req->sq_addr = (uintptr_t)qp->sq.chain.first_addr;
	req->sq_len = qp->sq.chain.size;
}

static inline void
qelr_create_qp_configure_rq_req(struct qelr_qp *qp,
				struct qelr_create_qp *req)
{
	req->rq_addr = (uintptr_t)qp->rq.chain.first_addr;
	req->rq_len = qp->rq.chain.size;
}

static inline void
qelr_create_qp_configure_req(struct qelr_qp *qp,
			     struct qelr_create_qp *req)
{
	memset(req, 0, sizeof(*req));
	req->qp_handle_hi = U64_HI(qp);
	req->qp_handle_lo = U64_LO(qp);
	qelr_create_qp_configure_sq_req(qp, req);
	qelr_create_qp_configure_rq_req(qp, req);
}

struct ibv_qp *qelr_create_qp(struct ibv_pd *pd,
			      struct ibv_qp_init_attr *attrs)
{
	struct qelr_devctx *cxt = get_qelr_ctx(pd->context);
	struct qelr_create_qp_resp resp = {};
	struct qelr_create_qp req;
	struct qelr_qp *qp;
	int rc;

	qelr_print_qp_init_attr(cxt, attrs);

	qp = calloc(1, sizeof(*qp));
	if (!qp)
		return NULL;

	if (attrs->srq)
		qp->srq = get_qelr_srq(attrs->srq);

	rc = qelr_create_qp_buffers(cxt, qp, attrs);
	if (rc)
		goto err0;

	qelr_create_qp_configure_req(qp, &req);

	rc = ibv_cmd_create_qp(pd, &qp->ibv_qp, attrs, &req.ibv_cmd,
			       sizeof(req), &resp.ibv_resp, sizeof(resp));
	if (rc) {
		DP_ERR(cxt->dbg_fp,
		       "create qp: failed on ibv_cmd_create_qp with %d\n", rc);
		goto err1;
	}

	rc = qelr_configure_qp(cxt, qp, attrs, &resp);
	if (rc)
		goto err2;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP,
		   "create qp: successfully created %p. handle_hi=%x handle_lo=%x\n",
		   qp, req.qp_handle_hi, req.qp_handle_lo);

	return &qp->ibv_qp;

err2:
	rc = ibv_cmd_destroy_qp(&qp->ibv_qp);
	if (rc)
		DP_ERR(cxt->dbg_fp, "create qp: fatal fault. rc=%d\n", rc);
err1:
	qelr_chain_free_sq(qp);
	qelr_chain_free_rq(qp);
err0:
	free(qp);

	return NULL;
}

static void qelr_print_ah_attr(struct qelr_devctx *cxt, struct ibv_ah_attr *attr)
{
	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP,
		   "grh.dgid=[%#" PRIx64 ":%#" PRIx64 "], grh.flow_label=%d, grh.sgid_index=%d, grh.hop_limit=%d, grh.traffic_class=%d, dlid=%d, sl=%d, src_path_bits=%d, static_rate = %d, port_num=%d\n",
		   be64toh(attr->grh.dgid.global.interface_id),
		   be64toh(attr->grh.dgid.global.subnet_prefix),
		   attr->grh.flow_label, attr->grh.sgid_index,
		   attr->grh.hop_limit, attr->grh.traffic_class, attr->dlid,
		   attr->sl, attr->src_path_bits,
		   attr->static_rate, attr->port_num);
}

static void qelr_print_qp_attr(struct qelr_devctx *cxt, struct ibv_qp_attr *attr)
{
	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP,
		   "\tqp_state=%d\tcur_qp_state=%d\tpath_mtu=%d\tpath_mig_state=%d\tqkey=%d\trq_psn=%d\tsq_psn=%d\tdest_qp_num=%d\tqp_access_flags=%d\tmax_inline_data=%d\tmax_recv_sge=%d\tmax_recv_wr=%d\tmax_send_sge=%d\tmax_send_wr=%d\tpkey_index=%d\talt_pkey_index=%d\ten_sqd_async_notify=%d\tsq_draining=%d\tmax_rd_atomic=%d\tmax_dest_rd_atomic=%d\tmin_rnr_timer=%d\tport_num=%d\ttimeout=%d\tretry_cnt=%d\trnr_retry=%d\talt_port_num=%d\talt_timeout=%d\n",
		   attr->qp_state, attr->cur_qp_state, attr->path_mtu,
		   attr->path_mig_state, attr->qkey, attr->rq_psn, attr->sq_psn,
		   attr->dest_qp_num, attr->qp_access_flags,
		   attr->cap.max_inline_data, attr->cap.max_recv_sge,
		   attr->cap.max_recv_wr, attr->cap.max_send_sge,
		   attr->cap.max_send_wr, attr->pkey_index,
		   attr->alt_pkey_index, attr->en_sqd_async_notify,
		   attr->sq_draining, attr->max_rd_atomic,
		   attr->max_dest_rd_atomic, attr->min_rnr_timer,
		   attr->port_num, attr->timeout, attr->retry_cnt,
		   attr->rnr_retry, attr->alt_port_num, attr->alt_timeout);

	qelr_print_ah_attr(cxt, &attr->ah_attr);
	qelr_print_ah_attr(cxt, &attr->alt_ah_attr);
}

int qelr_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
		    int attr_mask, struct ibv_qp_init_attr *init_attr)
{
	struct ibv_query_qp cmd;
	struct qelr_devctx *cxt = get_qelr_ctx(qp->context);
	int rc;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "QP Query %p, attr_mask=0x%x\n",
		   get_qelr_qp(qp), attr_mask);

	rc = ibv_cmd_query_qp(qp, attr, attr_mask,
			      init_attr, &cmd, sizeof(cmd));

	qelr_print_qp_attr(cxt, attr);

	return rc;
}

static enum qelr_qp_state get_qelr_qp_state(enum ibv_qp_state qps)
{
	switch (qps) {
	case IBV_QPS_RESET:
		return QELR_QPS_RST;
	case IBV_QPS_INIT:
		return QELR_QPS_INIT;
	case IBV_QPS_RTR:
		return QELR_QPS_RTR;
	case IBV_QPS_RTS:
		return QELR_QPS_RTS;
	case IBV_QPS_SQD:
		return QELR_QPS_SQD;
	case IBV_QPS_SQE:
		return QELR_QPS_SQE;
	case IBV_QPS_ERR:
	default:
		return QELR_QPS_ERR;
	};
}

static void qelr_reset_qp_hwq_info(struct qelr_qp_hwq_info *q)
{
	qelr_chain_reset(&q->chain);
	q->prod = 0;
	q->cons = 0;
	q->wqe_cons = 0;
	q->db_data.data.value = 0;
}

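/*
 * Mirror the RoCE QP state in user space so post_send()/post_recv() can
 * reject work posted to a QP that is not ready; only legal IB state
 * transitions are accepted. iWARP QP state is owned by the kernel driver
 * and is not tracked here.
 */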
static int qelr_update_qp_state(struct qelr_qp *qp,
				enum ibv_qp_state new_ib_state)
{
	int status = 0;
	enum qelr_qp_state new_state;

	/* iWARP states are updated implicitly by the driver and have no
	 * real purpose in the user library.
	 */
	if (IS_IWARP(qp->ibv_qp.context->device))
		return 0;

	new_state = get_qelr_qp_state(new_ib_state);

	pthread_spin_lock(&qp->q_lock);

	if (new_state == qp->state) {
		pthread_spin_unlock(&qp->q_lock);
		return 0;
	}

	switch (qp->state) {
	case QELR_QPS_RST:
		switch (new_state) {
		case QELR_QPS_INIT:
			qp->prev_wqe_size = 0;
			qelr_reset_qp_hwq_info(&qp->sq);
			qelr_reset_qp_hwq_info(&qp->rq);
			break;
		default:
			status = -EINVAL;
			break;
		};
		break;
	case QELR_QPS_INIT:
		/* INIT->XXX */
		switch (new_state) {
		case QELR_QPS_RTR:
			/* Update doorbell (in case post_recv was done before
			 * move to RTR)
			 */
			if (IS_ROCE(qp->ibv_qp.context->device)) {
				mmio_wc_start();
				writel(qp->rq.db_data.raw, qp->rq.db);
				mmio_flush_writes();
			}
			break;
		case QELR_QPS_ERR:
			break;
		default:
			/* invalid state change. */
			status = -EINVAL;
			break;
		};
		break;
	case QELR_QPS_RTR:
		/* RTR->XXX */
		switch (new_state) {
		case QELR_QPS_RTS:
			break;
		case QELR_QPS_ERR:
			break;
		default:
			/* invalid state change. */
			status = -EINVAL;
			break;
		};
		break;
	case QELR_QPS_RTS:
		/* RTS->XXX */
		switch (new_state) {
		case QELR_QPS_SQD:
		case QELR_QPS_SQE:
			break;
		case QELR_QPS_ERR:
			break;
		default:
			/* invalid state change. */
			status = -EINVAL;
			break;
		};
		break;
	case QELR_QPS_SQD:
		/* SQD->XXX */
		switch (new_state) {
		case QELR_QPS_RTS:
		case QELR_QPS_SQE:
		case QELR_QPS_ERR:
			break;
		default:
			/* invalid state change. */
			status = -EINVAL;
			break;
		};
		break;
	case QELR_QPS_SQE:
		switch (new_state) {
		case QELR_QPS_RTS:
		case QELR_QPS_ERR:
			break;
		default:
			/* invalid state change. */
			status = -EINVAL;
			break;
		};
		break;
	case QELR_QPS_ERR:
		/* ERR->XXX */
		switch (new_state) {
		case QELR_QPS_RST:
			break;
		default:
			status = -EINVAL;
			break;
		};
		break;
	default:
		status = -EINVAL;
		break;
	};
	if (!status)
		qp->state = new_state;

	pthread_spin_unlock(&qp->q_lock);

	return status;
}

int qelr_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
		     int attr_mask)
{
	struct ibv_modify_qp cmd = {};
	struct qelr_qp *qp = get_qelr_qp(ibqp);
	struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context);
	union ibv_gid sgid, *p_dgid;
	int rc;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "QP Modify %p, attr_mask=0x%x\n",
		   qp, attr_mask);

	qelr_print_qp_attr(cxt, attr);

	rc = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd));
	if (rc) {
		DP_ERR(cxt->dbg_fp, "QP Modify: Failed command. rc=%d\n", rc);
		return rc;
	}

	if (attr_mask & IBV_QP_STATE) {
		rc = qelr_update_qp_state(qp, attr->qp_state);
		DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP,
			   "QP Modify state %d->%d, rc=%d\n", qp->state,
			   attr->qp_state, rc);
		if (rc) {
			DP_ERR(cxt->dbg_fp,
			       "QP Modify: Failed to update state. rc=%d\n",
			       rc);

			return rc;
		}
	}

	/* EDPM must be disabled if GIDs match */
	if (attr_mask & IBV_QP_AV) {
		rc = ibv_query_gid(ibqp->context, attr->ah_attr.port_num,
				   attr->ah_attr.grh.sgid_index, &sgid);

		if (!rc) {
			p_dgid = &attr->ah_attr.grh.dgid;
			qp->edpm_disabled = !memcmp(&sgid, p_dgid,
						    sizeof(sgid));
			DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP,
				   "QP Modify: %p, edpm_disabled=%d\n", qp,
				   qp->edpm_disabled);
		} else  {
			DP_ERR(cxt->dbg_fp,
			       "QP Modify: Failed querying GID. rc=%d\n",
			       rc);
		}
	}

	return 0;
}

int qelr_destroy_qp(struct ibv_qp *ibqp)
{
	struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context);
	struct qelr_qp *qp = get_qelr_qp(ibqp);
	int rc = 0;

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "destroy qp: %p\n", qp);

	rc = ibv_cmd_destroy_qp(ibqp);
	if (rc) {
		DP_ERR(cxt->dbg_fp,
		       "destroy qp: failed to destroy %p, got %d.\n", qp, rc);
		return rc;
	}

	qelr_free_sq(qp);
	qelr_free_rq(qp);
	qelr_chain_free_sq(qp);
	qelr_chain_free_rq(qp);
	if (qp->sq.db_rec_map)
		munmap(qp->sq.db_rec_map, cxt->kernel_page_size);
	if (qp->rq.db_rec_map)
		munmap(qp->rq.db_rec_map, cxt->kernel_page_size);

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP,
		   "destroy cq: successfully destroyed %p\n", qp);

	free(qp);

	return 0;
}

static int sge_data_len(struct ibv_sge *sg_list, int num_sge)
{
	int i, len = 0;

	for (i = 0; i < num_sge; i++)
		len += sg_list[i].length;
	return len;
}

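/*
 * Inline WQE payload is laid out as big-endian 64-bit words; convert one
 * fully written WQE element in place.
 */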
static void swap_wqe_data64(uint64_t *p)
{
	__be64 *bep = (__be64 *)p;
	int i;

	for (i = 0; i < ROCE_WQE_ELEM_SIZE / sizeof(uint64_t); i++, p++, bep++)
		*bep = htobe64(*p);
}

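/*
 * Decide whether this WR can be pushed to the device through the doorbell
 * window (DPM): enhanced DPM for inline sends when the context and QP allow
 * it, legacy DPM for non-inline WRs whose payload fits in ldpm_limit_size.
 * Either mode is attempted only while the SQ has no outstanding WQEs.
 */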
static inline void qelr_init_dpm_info(struct qelr_devctx *cxt,
				      struct qelr_qp *qp,
				      struct ibv_send_wr *wr,
				      struct qelr_dpm *dpm,
				      int data_size)
{
	dpm->is_edpm = 0;
	dpm->is_ldpm = 0;

	/* DPM only succeeds when transmit queues are empty */
	if (!qelr_chain_is_full(&qp->sq.chain))
		return;

	/* Check if edpm can be used */
	if (wr->send_flags & IBV_SEND_INLINE && !qp->edpm_disabled &&
	    cxt->dpm_flags & QELR_DPM_FLAGS_ENHANCED) {
		memset(dpm, 0, sizeof(*dpm));
		dpm->rdma_ext = (struct qelr_rdma_ext *)&dpm->payload;
		dpm->is_edpm = 1;
		return;
	}

	/* Check if LDPM can be used - not inline and limited to ldpm_limit_size */
	if (cxt->dpm_flags & QELR_DPM_FLAGS_LEGACY &&
	    !(wr->send_flags & IBV_SEND_INLINE) &&
	    data_size <= cxt->ldpm_limit_size) {
		memset(dpm, 0, sizeof(*dpm));
		dpm->is_ldpm = 1;
	}
}

#define QELR_IB_OPCODE_SEND_ONLY                         0x04
#define QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE          0x05
#define QELR_IB_OPCODE_RDMA_WRITE_ONLY                   0x0a
#define QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE    0x0b
#define QELR_IB_OPCODE_SEND_WITH_INV			 0x17
#define QELR_IS_IMM_OR_INV(opcode) \
	(((opcode) == QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE) || \
	 ((opcode) == QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE) || \
	 ((opcode) == QELR_IB_OPCODE_SEND_WITH_INV))

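/*
 * Build the EDPM doorbell message parameters: DPM type, RoCE opcode, WQE
 * size in bytes (plus four bytes of immediate/invalidate data when present)
 * and the total doorbell message size in 8-byte units, along with the
 * solicited and completion flags.
 */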
static inline void qelr_edpm_set_msg_data(struct qelr_qp *qp,
					  struct qelr_dpm *dpm,
					  uint8_t opcode,
					  uint16_t length,
					  uint8_t se,
					  uint8_t comp)
{
	uint32_t wqe_size, dpm_size, params;

	params = 0;
	wqe_size = length + (QELR_IS_IMM_OR_INV(opcode) ? sizeof(uint32_t) : 0);
	dpm_size = wqe_size + sizeof(struct db_roce_dpm_data);

	SET_FIELD(params, DB_ROCE_DPM_PARAMS_DPM_TYPE, DPM_ROCE);
	SET_FIELD(params, DB_ROCE_DPM_PARAMS_OPCODE, opcode);
	SET_FIELD(params, DB_ROCE_DPM_PARAMS_WQE_SIZE, wqe_size);
	SET_FIELD(params, DB_ROCE_DPM_PARAMS_COMPLETION_FLG, comp ? 1 : 0);
	SET_FIELD(params, DB_ROCE_DPM_PARAMS_S_FLG, se ? 1 : 0);
	SET_FIELD(params, DB_ROCE_DPM_PARAMS_SIZE,
		  (dpm_size + sizeof(uint64_t) - 1) / sizeof(uint64_t));

	dpm->msg.data.params.params = htole32(params);
}

static inline void qelr_edpm_set_inv_imm(struct qelr_qp *qp,
					 struct qelr_dpm *dpm,
					 __be32 data)
{
	memcpy(&dpm->payload[dpm->payload_offset], &data, sizeof(data));

	dpm->payload_offset += sizeof(data);
	dpm->payload_size += sizeof(data);
}

static inline void qelr_edpm_set_rdma_ext(struct qelr_qp *qp,
					  struct qelr_dpm *dpm,
					  uint64_t remote_addr,
					  uint32_t rkey)
{
	dpm->rdma_ext->remote_va = htobe64(remote_addr);
	dpm->rdma_ext->remote_key = htobe32(rkey);
	dpm->payload_offset += sizeof(*dpm->rdma_ext);
	dpm->payload_size += sizeof(*dpm->rdma_ext);
}

static inline void qelr_edpm_set_payload(struct qelr_qp *qp,
					 struct qelr_dpm *dpm, char *buf,
					 uint32_t length)
{
	memcpy(&dpm->payload[dpm->payload_offset], buf, length);

	dpm->payload_offset += length;
}

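/*
 * Copy the WR payload inline into consecutive SQ chain elements (and, for
 * EDPM, into the doorbell payload buffer), byte-swapping each element once
 * it is fully written and bumping *wqe_size for every element used.
 */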
static void qelr_prepare_sq_inline_data(struct qelr_qp *qp,
					    struct qelr_dpm *dpm,
					    int data_size,
					    uint8_t *wqe_size,
					    struct ibv_send_wr *wr,
					    uint8_t *bits, uint8_t bit)
{
	int i;
	uint32_t seg_siz;
	char *seg_prt, *wqe;

	if (!data_size)
		return;

	/* set the bit */
	*bits |= bit;

	seg_prt = NULL;
	wqe = NULL;
	seg_siz = 0;

	/* copy data inline */
	for (i = 0; i < wr->num_sge; i++) {
		uint32_t len = wr->sg_list[i].length;
		void *src = (void *)(uintptr_t)wr->sg_list[i].addr;

		if (dpm->is_edpm)
			qelr_edpm_set_payload(qp, dpm, src, len);

		while (len > 0) {
			uint32_t cur;

			/* new segment required */
			if (!seg_siz) {
				wqe = (char *)qelr_chain_produce(&qp->sq.chain);
				seg_prt = wqe;
				seg_siz = sizeof(struct rdma_sq_common_wqe);
				(*wqe_size)++;
			}

			/* calculate currently allowed length */
			cur = min(len, seg_siz);

			memcpy(seg_prt, src, cur);

			/* update segment variables */
			seg_prt += cur;
			seg_siz -= cur;
			/* update sge variables */
			src += cur;
			len -= cur;

			/* swap fully-completed segments */
			if (!seg_siz)
				swap_wqe_data64((uint64_t *)wqe);
		}
	}

	/* swap last not completed segment */
	if (seg_siz)
		swap_wqe_data64((uint64_t *)wqe);

	if (dpm->is_edpm) {
		dpm->payload_size += data_size;

		if (wr->opcode == IBV_WR_RDMA_WRITE ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			dpm->rdma_ext->dma_length = htobe32(data_size);
	}
}

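/*
 * Write the WR's scatter/gather entries into the SQ chain and, for legacy
 * DPM, mirror them into the doorbell payload buffer.
 */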
static void qelr_prepare_sq_sges(struct qelr_qp *qp,
				 struct qelr_dpm *dpm,
				 uint8_t *wqe_size,
				 struct ibv_send_wr *wr)
{
	int i;

	for (i = 0; i < wr->num_sge; i++) {
		struct rdma_sq_sge *sge = qelr_chain_produce(&qp->sq.chain);

		TYPEPTR_ADDR_SET(sge, addr, wr->sg_list[i].addr);
		sge->l_key = htole32(wr->sg_list[i].lkey);
		sge->length = htole32(wr->sg_list[i].length);

		if (dpm->is_ldpm) {
			memcpy(&dpm->payload[dpm->payload_size], sge,
			       sizeof(*sge));
			dpm->payload_size += sizeof(*sge);
		}
	}

	if (wqe_size)
		*wqe_size += wr->num_sge;
}

static uint32_t qelr_prepare_sq_rdma_data(struct qelr_qp *qp,
					  struct qelr_dpm *dpm,
					  int data_size,
					  uint8_t *p_wqe_size,
					  struct rdma_sq_rdma_wqe_1st *rwqe,
					  struct rdma_sq_rdma_wqe_2nd *rwqe2,
					  struct ibv_send_wr *wr,
					  bool is_imm)
{
	memset(rwqe2, 0, sizeof(*rwqe2));
	rwqe2->r_key = htole32(wr->wr.rdma.rkey);
	TYPEPTR_ADDR_SET(rwqe2, remote_va, wr->wr.rdma.remote_addr);
	rwqe->length = htole32(data_size);

	if (is_imm)
		rwqe->imm_data = htole32(be32toh(wr->imm_data));

	if (wr->send_flags & IBV_SEND_INLINE &&
	    (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM ||
	     wr->opcode == IBV_WR_RDMA_WRITE)) {
		uint8_t flags = 0;

		SET_FIELD2(flags, RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG, 1);
		qelr_prepare_sq_inline_data(qp, dpm, data_size,
					    p_wqe_size, wr,
					    &rwqe->flags, flags);
		rwqe->wqe_size = *p_wqe_size;
	} else {
		if (dpm->is_ldpm)
			dpm->payload_size = sizeof(*rwqe) + sizeof(*rwqe2);
		qelr_prepare_sq_sges(qp, dpm, p_wqe_size, wr);
		rwqe->wqe_size = *p_wqe_size;

		if (dpm->is_ldpm) {
			memcpy(dpm->payload, rwqe, sizeof(*rwqe));
			memcpy(&dpm->payload[sizeof(*rwqe)], rwqe2,
			       sizeof(*rwqe2));
		}
	}

	return data_size;
}

static uint32_t qelr_prepare_sq_send_data(struct qelr_qp *qp,
					  struct qelr_dpm *dpm,
					  int data_size,
					  uint8_t *p_wqe_size,
					  struct rdma_sq_send_wqe_1st *swqe,
					  struct rdma_sq_send_wqe_2st *swqe2,
					  struct ibv_send_wr *wr,
					  bool is_imm)
{
	memset(swqe2, 0, sizeof(*swqe2));
	swqe->length = htole32(data_size);

	if (is_imm)
		swqe->inv_key_or_imm_data = htole32(be32toh(wr->imm_data));

	if (wr->send_flags & IBV_SEND_INLINE) {
		uint8_t flags = 0;

		SET_FIELD2(flags, RDMA_SQ_SEND_WQE_INLINE_FLG, 1);
		qelr_prepare_sq_inline_data(qp, dpm, data_size,
					    p_wqe_size, wr,
					    &swqe->flags, flags);
		swqe->wqe_size = *p_wqe_size;
	} else {
		if (dpm->is_ldpm)
			dpm->payload_size = sizeof(*swqe) + sizeof(*swqe2);

		qelr_prepare_sq_sges(qp, dpm, p_wqe_size, wr);
		swqe->wqe_size = *p_wqe_size;
		if (dpm->is_ldpm) {
			memcpy(dpm->payload, swqe, sizeof(*swqe));
			memcpy(&dpm->payload[sizeof(*swqe)], swqe2,
			       sizeof(*swqe2));
		}
	}

	return data_size;
}

static void qelr_prepare_sq_atom_data(struct qelr_qp *qp,
				      struct qelr_dpm *dpm,
				      struct rdma_sq_atomic_wqe_1st *awqe1,
				      struct rdma_sq_atomic_wqe_2nd *awqe2,
				      struct rdma_sq_atomic_wqe_3rd *awqe3,
				      struct ibv_send_wr *wr)
{
	if (dpm->is_ldpm) {
		memcpy(&dpm->payload[dpm->payload_size], awqe1, sizeof(*awqe1));
		dpm->payload_size += sizeof(*awqe1);
		memcpy(&dpm->payload[dpm->payload_size], awqe2, sizeof(*awqe2));
		dpm->payload_size += sizeof(*awqe2);
		memcpy(&dpm->payload[dpm->payload_size], awqe3, sizeof(*awqe3));
		dpm->payload_size += sizeof(*awqe3);
	}

	qelr_prepare_sq_sges(qp, dpm, NULL, wr);
}

static inline void qelr_ldpm_prepare_data(struct qelr_qp *qp,
					  struct qelr_dpm *dpm)
{
	uint32_t val, params;

	/* DPM size is given in 8 bytes so we round up */
	val = dpm->payload_size + sizeof(struct db_roce_dpm_data);
	val = DIV_ROUND_UP(val, sizeof(uint64_t));

	params = 0;
	SET_FIELD(params, DB_ROCE_DPM_PARAMS_SIZE, val);
	SET_FIELD(params, DB_ROCE_DPM_PARAMS_DPM_TYPE, DPM_LEGACY);

	dpm->msg.data.params.params = htole32(params);
}

static enum ibv_wc_opcode qelr_ibv_to_wc_opcode(enum ibv_wr_opcode opcode)
{
	switch (opcode) {
	case IBV_WR_RDMA_WRITE:
	case IBV_WR_RDMA_WRITE_WITH_IMM:
		return IBV_WC_RDMA_WRITE;
	case IBV_WR_SEND_WITH_IMM:
	case IBV_WR_SEND:
	case IBV_WR_SEND_WITH_INV:
		return IBV_WC_SEND;
	case IBV_WR_RDMA_READ:
		return IBV_WC_RDMA_READ;
	case IBV_WR_ATOMIC_CMP_AND_SWP:
		return IBV_WC_COMP_SWAP;
	case IBV_WR_ATOMIC_FETCH_AND_ADD:
		return IBV_WC_FETCH_ADD;
	default:
		return IBV_WC_SEND;
	}
}

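/* Ring the regular SQ doorbell and mirror the value for doorbell recovery. */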
static inline void doorbell_qp(struct qelr_qp *qp)
{
	mmio_wc_start();
	writel(qp->sq.db_data.raw, qp->sq.db);
	/* copy value to doorbell recovery mechanism */
	qp->sq.db_rec_addr->db_data = qp->sq.db_data.raw;
	mmio_flush_writes();
}

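/*
 * Ring a DPM doorbell: write the 8-byte message header followed by the
 * payload into the write-combined doorbell window, flushing every
 * edpm_trans_size bytes so the stores are not merged away. Legacy DPM
 * payloads are written big-endian because the FW and the DORQ HW block use
 * different byte orders.
 */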
static inline void doorbell_dpm_qp(struct qelr_devctx *cxt, struct qelr_qp *qp,
				   struct qelr_dpm *dpm)
{
	uint32_t offset = 0;
	uint64_t *payload = (uint64_t *)dpm->payload;
	uint32_t num_dwords;
	int bytes = 0;
	void *db_addr;

	mmio_wc_start();

	/* Write message header */
	dpm->msg.data.icid = qp->sq.db_data.data.icid;
	dpm->msg.data.prod_val = qp->sq.db_data.data.value;
	db_addr = qp->sq.edpm_db;
	writeq(dpm->msg.raw, db_addr);

	/* Write message body */
	bytes += sizeof(uint64_t);
	num_dwords = DIV_ROUND_UP(dpm->payload_size, sizeof(uint64_t));

	db_addr += sizeof(dpm->msg.data);

	if (bytes == cxt->edpm_trans_size) {
		mmio_flush_writes();
		bytes = 0;
	}

	while (offset < num_dwords) {
		/* endianness differs between the FW and the DORQ HW block */
		if (dpm->is_ldpm)
			mmio_write64_be(db_addr, htobe64(payload[offset]));
		else /* EDPM */
			mmio_write64(db_addr, payload[offset]);

		bytes += sizeof(uint64_t);
		db_addr += sizeof(uint64_t);

		/* Writing to a wc bar. We need to flush the writes every
		 * edpm transaction size otherwise the CPU could optimize away
		 * the duplicate stores.
		 */
		if (bytes == cxt->edpm_trans_size) {
			mmio_flush_writes();
			bytes = 0;
		}
		offset++;
	}

	mmio_flush_writes();
}

static inline int qelr_can_post_send(struct qelr_devctx *cxt,
				      struct qelr_qp *qp,
				      struct ibv_send_wr *wr,
				      int data_size)
{
	/* Invalid WR */
	if (wr->num_sge > qp->sq.max_sges) {
		DP_ERR(cxt->dbg_fp,
		       "error: WR is bad. Post send on QP %p failed\n",
		       qp);
		return -EINVAL;
	}

	/* WR overflow */
	if (qelr_wq_is_full(&qp->sq)) {
		DP_ERR(cxt->dbg_fp,
		       "error: WQ is full. Post send on QP %p failed (this error appears only once)\n",
		       qp);
		return -ENOMEM;
	}

	/* WQE overflow */
	if (qelr_chain_get_elem_left_u32(&qp->sq.chain) <
			QELR_MAX_SQ_WQE_SIZE) {
		DP_ERR(cxt->dbg_fp,
		       "error: WQ PBL is full. Post send on QP %p failed (this error appears only once)\n",
		       qp);
		return -ENOMEM;
	}

	if ((wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP ||
	     wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD) &&
	    !qp->atomic_supported) {
		DP_ERR(cxt->dbg_fp, "Atomic not supported on this machine\n");
		return -EINVAL;
	}

	if ((wr->send_flags & IBV_SEND_INLINE) &&
	    (data_size > ROCE_REQ_MAX_INLINE_DATA_SIZE)) {
		DP_ERR(cxt->dbg_fp, "Too much inline data in WR: %d\n", data_size);
		return -EINVAL;
	}

	return 0;
}

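/*
 * Build a single SQ WQE for the given WR. On success the shadow SQ entry
 * and the doorbell value are updated; if the WR was pushed via DPM the
 * doorbell is rung here, otherwise *normal_db_required is set so the caller
 * rings the regular doorbell once after processing the whole WR list.
 */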
static int __qelr_post_send(struct qelr_devctx *cxt, struct qelr_qp *qp,
			    struct ibv_send_wr *wr, int data_size,
			    int *normal_db_required)
{
	uint8_t se, comp, fence;
	struct rdma_sq_common_wqe *wqe;
	struct rdma_sq_send_wqe_1st *swqe;
	struct rdma_sq_send_wqe_2st *swqe2;
	struct rdma_sq_rdma_wqe_1st *rwqe;
	struct rdma_sq_rdma_wqe_2nd *rwqe2;
	struct rdma_sq_atomic_wqe_1st *awqe1;
	struct rdma_sq_atomic_wqe_2nd *awqe2;
	struct rdma_sq_atomic_wqe_3rd *awqe3;
	struct qelr_dpm dpm;
	uint32_t wqe_length;
	uint8_t wqe_size;
	uint16_t db_val;
	int rc = 0;

	qelr_init_dpm_info(cxt, qp, wr, &dpm, data_size);

	wqe = qelr_chain_produce(&qp->sq.chain);

	comp = (!!(wr->send_flags & IBV_SEND_SIGNALED)) ||
			(!!qp->sq_sig_all);
	qp->wqe_wr_id[qp->sq.prod].signaled = comp;

	/* common fields */
	wqe->flags = 0;
	se = !!(wr->send_flags & IBV_SEND_SOLICITED);
	fence = !!(wr->send_flags & IBV_SEND_FENCE);
	SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_SE_FLG, se);
	SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_COMP_FLG, comp);
	SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_RD_FENCE_FLG, fence);
	wqe->prev_wqe_size = qp->prev_wqe_size;

	qp->wqe_wr_id[qp->sq.prod].opcode = qelr_ibv_to_wc_opcode(wr->opcode);

	switch (wr->opcode) {
	case IBV_WR_SEND_WITH_IMM:
		wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_IMM;
		swqe = (struct rdma_sq_send_wqe_1st *)wqe;

		wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES;
		swqe2 = (struct rdma_sq_send_wqe_2st *)qelr_chain_produce(&qp->sq.chain);

		if (dpm.is_edpm)
			qelr_edpm_set_inv_imm(qp, &dpm, wr->imm_data);

		wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size,
						       &wqe_size, swqe, swqe2,
						       wr, 1 /* Imm */);

		if (dpm.is_edpm)
			qelr_edpm_set_msg_data(qp, &dpm,
					       QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE,
					       wqe_length, se, comp);
		else if (dpm.is_ldpm)
			qelr_ldpm_prepare_data(qp, &dpm);

		qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size;
		qp->prev_wqe_size = wqe_size;
		qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length;
		break;

	case IBV_WR_SEND:
		wqe->req_type = RDMA_SQ_REQ_TYPE_SEND;
		swqe = (struct rdma_sq_send_wqe_1st *)wqe;

		wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES;
		swqe2 = (struct rdma_sq_send_wqe_2st *)qelr_chain_produce(&qp->sq.chain);
		wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size,
						       &wqe_size, swqe, swqe2,
						       wr, 0);
		if (dpm.is_edpm)
			qelr_edpm_set_msg_data(qp, &dpm,
					       QELR_IB_OPCODE_SEND_ONLY,
					       wqe_length, se, comp);
		else if (dpm.is_ldpm)
			qelr_ldpm_prepare_data(qp, &dpm);

		qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size;
		qp->prev_wqe_size = wqe_size;
		qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length;
		break;

	case IBV_WR_SEND_WITH_INV:
		wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_INVALIDATE;
		swqe = (struct rdma_sq_send_wqe_1st *)wqe;

		wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES;
		swqe2 = qelr_chain_produce(&qp->sq.chain);

		if (dpm.is_edpm)
			qelr_edpm_set_inv_imm(qp, &dpm,
					      htobe32(wr->invalidate_rkey));

		swqe->inv_key_or_imm_data = htole32(wr->invalidate_rkey);

		wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size,
						       &wqe_size, swqe, swqe2,
						       wr, 0);

		if (dpm.is_edpm)
			qelr_edpm_set_msg_data(qp, &dpm,
					       QELR_IB_OPCODE_SEND_WITH_INV,
					       wqe_length, se, comp);
		else if (dpm.is_ldpm)
			qelr_ldpm_prepare_data(qp, &dpm);

		qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size;
		qp->prev_wqe_size = wqe_size;
		qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length;

		break;

	case IBV_WR_RDMA_WRITE_WITH_IMM:
		wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR_WITH_IMM;
		rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe;

		wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES;
		rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain);
		if (dpm.is_edpm) {
			qelr_edpm_set_rdma_ext(qp, &dpm, wr->wr.rdma.remote_addr,
					       wr->wr.rdma.rkey);
			qelr_edpm_set_inv_imm(qp, &dpm, wr->imm_data);
		}

		wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size,
						       rwqe, rwqe2, wr, 1 /* Imm */);
		if (dpm.is_edpm)
			qelr_edpm_set_msg_data(qp, &dpm,
					       QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE,
					       wqe_length + sizeof(*dpm.rdma_ext),
					       se, comp);
		else if (dpm.is_ldpm)
			qelr_ldpm_prepare_data(qp, &dpm);

		qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size;
		qp->prev_wqe_size = wqe_size;
		qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length;
		break;

	case IBV_WR_RDMA_WRITE:
		wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR;
		rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe;

		wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES;
		rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain);
		if (dpm.is_edpm)
			qelr_edpm_set_rdma_ext(qp, &dpm,
					       wr->wr.rdma.remote_addr,
					       wr->wr.rdma.rkey);

		wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size,
						       rwqe, rwqe2, wr, 0);
		if (dpm.is_edpm)
			qelr_edpm_set_msg_data(qp, &dpm,
					       QELR_IB_OPCODE_RDMA_WRITE_ONLY,
					       wqe_length +
					       sizeof(*dpm.rdma_ext),
					       se, comp);
		else if (dpm.is_ldpm)
			qelr_ldpm_prepare_data(qp, &dpm);

		qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size;
		qp->prev_wqe_size = wqe_size;
		qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length;
		break;

	case IBV_WR_RDMA_READ:
		wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_RD;
		rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe;

		wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES;
		rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain);
		wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size,
						       rwqe, rwqe2, wr, 0);
		if (dpm.is_ldpm)
			qelr_ldpm_prepare_data(qp, &dpm);

		qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size;
		qp->prev_wqe_size = wqe_size;
		qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length;
		break;

	case IBV_WR_ATOMIC_CMP_AND_SWP:
	case IBV_WR_ATOMIC_FETCH_AND_ADD:
		awqe1 = (struct rdma_sq_atomic_wqe_1st *)wqe;
		awqe1->wqe_size = 4;

		awqe2 = (struct rdma_sq_atomic_wqe_2nd *)qelr_chain_produce(&qp->sq.chain);
		TYPEPTR_ADDR_SET(awqe2, remote_va, wr->wr.atomic.remote_addr);
		awqe2->r_key = htole32(wr->wr.atomic.rkey);

		awqe3 = (struct rdma_sq_atomic_wqe_3rd *)qelr_chain_produce(&qp->sq.chain);

		if (wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD) {
			wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_ADD;
			TYPEPTR_ADDR_SET(awqe3, swap_data, wr->wr.atomic.compare_add);
		} else {
			wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_CMP_AND_SWAP;
			TYPEPTR_ADDR_SET(awqe3, swap_data, wr->wr.atomic.swap);
			TYPEPTR_ADDR_SET(awqe3, cmp_data, wr->wr.atomic.compare_add);
		}

		qelr_prepare_sq_atom_data(qp, &dpm, awqe1, awqe2, awqe3, wr);
		if (dpm.is_ldpm)
			qelr_ldpm_prepare_data(qp, &dpm);
		qp->wqe_wr_id[qp->sq.prod].wqe_size = awqe1->wqe_size;
		qp->prev_wqe_size = awqe1->wqe_size;

		break;

	default:
		/* restore prod to its position before this WR was processed */
		qelr_chain_set_prod(&qp->sq.chain,
				    le16toh(qp->sq.db_data.data.value),
				    wqe);

		/* restore prev_wqe_size */
		qp->prev_wqe_size = wqe->prev_wqe_size;

		rc = -EINVAL;
		DP_ERR(cxt->dbg_fp,
		       "Invalid opcode %d in work request on QP %p\n",
		       wr->opcode, qp);
		break;
	}

	if (rc)
		return rc;

	qp->wqe_wr_id[qp->sq.prod].wr_id = wr->wr_id;
	qelr_inc_sw_prod_u16(&qp->sq);
	db_val = le16toh(qp->sq.db_data.data.value) + 1;
	qp->sq.db_data.data.value = htole16(db_val);

	if (dpm.is_edpm || dpm.is_ldpm) {
		doorbell_dpm_qp(cxt, qp, &dpm);
		*normal_db_required = 0;
	} else {
		*normal_db_required = 1;
	}

	return 0;
}

int qelr_post_send(struct ibv_qp *ib_qp, struct ibv_send_wr *wr,
		   struct ibv_send_wr **bad_wr)
{
	struct qelr_devctx *cxt = get_qelr_ctx(ib_qp->context);
	struct qelr_qp *qp = get_qelr_qp(ib_qp);
	int doorbell_required = 0;
	int rc = 0;

	*bad_wr = NULL;

	pthread_spin_lock(&qp->q_lock);

	if (IS_ROCE(ib_qp->context->device) &&
	    (qp->state != QELR_QPS_RTS && qp->state != QELR_QPS_ERR &&
	     qp->state != QELR_QPS_SQD)) {
		pthread_spin_unlock(&qp->q_lock);
		*bad_wr = wr;
		return -EINVAL;
	}

	while (wr) {
		int data_size = sge_data_len(wr->sg_list, wr->num_sge);

		rc = qelr_can_post_send(cxt, qp, wr, data_size);
		if (rc) {
			*bad_wr = wr;
			break;
		}

		rc = __qelr_post_send(cxt, qp, wr, data_size, &doorbell_required);
		if (rc) {
			*bad_wr = wr;
			break;
		}

		wr = wr->next;
	}

	if (doorbell_required)
		doorbell_qp(qp);

	pthread_spin_unlock(&qp->q_lock);

	return rc;
}

static uint32_t qelr_srq_elem_left(struct qelr_srq_hwq_info *hw_srq)
{
	uint32_t used;

	/* Calculate number of elements used based on producer
	 * count and consumer count and subtract it from max
	 * work request supported so that we get elements left.
	 */
	used = (uint32_t)(((uint64_t)((uint64_t)~0U) + 1 +
			  (uint64_t)(hw_srq->wr_prod_cnt)) -
			  (uint64_t)hw_srq->wr_cons_cnt);

	return hw_srq->max_wr - used;
}

int qelr_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr,
		       struct ibv_recv_wr **bad_wr)
{
	struct qelr_devctx *cxt = get_qelr_ctx(ibsrq->context);
	struct qelr_srq *srq = get_qelr_srq(ibsrq);
	struct qelr_srq_hwq_info *hw_srq = &srq->hw_srq;
	struct qelr_chain *chain;
	int status = 0;

	pthread_spin_lock(&srq->lock);

	chain = &srq->hw_srq.chain;
	while (wr) {
		struct rdma_srq_wqe_header *hdr;
		int i;

		if (!qelr_srq_elem_left(hw_srq) ||
		    wr->num_sge > srq->hw_srq.max_sges) {
			DP_ERR(cxt->dbg_fp,
			       "Can't post WR  (%d,%d) || (%d > %d)\n",
			       hw_srq->wr_prod_cnt, hw_srq->wr_cons_cnt,
			       wr->num_sge,
			       srq->hw_srq.max_sges);
			status = -ENOMEM;
			*bad_wr = wr;
			break;
		}

		hdr = qelr_chain_produce(chain);

		SRQ_HDR_SET(hdr, wr->wr_id, wr->num_sge);

		hw_srq->wr_prod_cnt++;
		hw_srq->wqe_prod++;
		hw_srq->sge_prod++;

		DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ,
			   "SRQ WR: SGEs: %d with wr_id[%d] = %" PRIx64 "\n",
			    wr->num_sge, hw_srq->wqe_prod, wr->wr_id);

		for (i = 0; i < wr->num_sge; i++) {
			struct rdma_srq_sge *srq_sge;

			srq_sge = qelr_chain_produce(chain);
			SRQ_SGE_SET(srq_sge, wr->sg_list[i].addr,
				    wr->sg_list[i].length, wr->sg_list[i].lkey);

			DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ,
				   "[%d]: len %d key %x addr %x:%x\n",
				   i, srq_sge->length, srq_sge->l_key,
				   srq_sge->addr.hi, srq_sge->addr.lo);
			hw_srq->sge_prod++;
		}

		/* Make sure that descriptors are written before we update
		 * producers.
		 */
		udma_ordering_write_barrier();

		struct rdma_srq_producers *virt_prod;

		virt_prod = srq->hw_srq.virt_prod_pair_addr;
		virt_prod->sge_prod = htole32(hw_srq->sge_prod);
		virt_prod->wqe_prod = htole32(hw_srq->wqe_prod);

		wr = wr->next;
	}

	DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ,
		   "POST: Elements in SRQ: %d\n",
		   qelr_chain_get_elem_left_u32(chain));
	pthread_spin_unlock(&srq->lock);

	return status;
}

int qelr_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	int status = 0;
	struct qelr_qp *qp =  get_qelr_qp(ibqp);
	struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context);
	uint16_t db_val;
	uint8_t iwarp = IS_IWARP(ibqp->context->device);

	if (unlikely(qp->srq)) {
		DP_ERR(cxt->dbg_fp,
		       "QP is associated with SRQ, cannot post RQ buffers\n");
		*bad_wr = wr;
		return -EINVAL;
	}

	pthread_spin_lock(&qp->q_lock);

	if (!iwarp && qp->state == QELR_QPS_RST) {
		pthread_spin_unlock(&qp->q_lock);
		*bad_wr = wr;
		return -EINVAL;
	}

	while (wr) {
		int i;

		if (qelr_chain_get_elem_left_u32(&qp->rq.chain) <
		    QELR_MAX_RQ_WQE_SIZE || wr->num_sge > qp->rq.max_sges) {
			DP_ERR(cxt->dbg_fp,
			       "Can't post WR  (%d < %d) || (%d > %d)\n",
			       qelr_chain_get_elem_left_u32(&qp->rq.chain),
			       QELR_MAX_RQ_WQE_SIZE, wr->num_sge,
			       qp->rq.max_sges);
			status = -ENOMEM;
			*bad_wr = wr;
			break;
		}
		for (i = 0; i < wr->num_sge; i++) {
			uint32_t flags = 0;
			struct rdma_rq_sge *rqe;

			/* first one must include the number of SGE in the
			 * list
			 */
			if (!i)
				SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES,
					  wr->num_sge);

			SET_FIELD(flags, RDMA_RQ_SGE_L_KEY,
				  wr->sg_list[i].lkey);
			rqe = qelr_chain_produce(&qp->rq.chain);
			RQ_SGE_SET(rqe, wr->sg_list[i].addr,
				   wr->sg_list[i].length, flags);
		}
		/* Special case of no sges. FW requires between 1-4 sges...
		 * in this case we need to post 1 sge with length zero. this is
		 * because rdma write with immediate consumes an RQ.
		 */
		if (!wr->num_sge) {
			uint32_t flags = 0;
			struct rdma_rq_sge *rqe;

			/* first one must include the number of SGE in the
			 * list
			 */
			SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, 0);
			SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, 1);

			rqe = qelr_chain_produce(&qp->rq.chain);
			RQ_SGE_SET(rqe, 0, 0, flags);
			i = 1;
		}

		qp->rqe_wr_id[qp->rq.prod].wr_id = wr->wr_id;
		qp->rqe_wr_id[qp->rq.prod].wqe_size = i;

		qelr_inc_sw_prod_u16(&qp->rq);

		mmio_wc_start();

		db_val = le16toh(qp->rq.db_data.data.value) + 1;
		qp->rq.db_data.data.value = htole16(db_val);

		writel(qp->rq.db_data.raw, qp->rq.db);
		/* copy value to doorbell recovery mechanism */
		qp->rq.db_rec_addr->db_data = qp->rq.db_data.raw;
		mmio_flush_writes();

		if (iwarp) {
			writel(qp->rq.iwarp_db2_data.raw, qp->rq.iwarp_db2);
			mmio_flush_writes();
		}
		wr = wr->next;
	}

	pthread_spin_unlock(&qp->q_lock);

	return status;
}

static int is_valid_cqe(struct qelr_cq *cq, union rdma_cqe *cqe)
{
	struct rdma_cqe_requester *resp_cqe = &cqe->req;

	return (resp_cqe->flags & RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK) ==
		cq->chain_toggle;
}

static enum rdma_cqe_type cqe_get_type(union rdma_cqe *cqe)
{
	struct rdma_cqe_requester *resp_cqe = &cqe->req;

	return GET_FIELD(resp_cqe->flags, RDMA_CQE_REQUESTER_TYPE);
}

static struct qelr_qp *cqe_get_qp(union rdma_cqe *cqe)
{
	struct regpair *qph = &cqe->req.qp_handle;

	return (struct qelr_qp *)HILO_U64(le32toh(qph->hi), le32toh(qph->lo));
}

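/*
 * Complete SQ entries up to the hardware consumer hw_cons, filling at most
 * num_entries work completions. Unsignaled entries are skipped unless force
 * is set (used when flushing a QP or reporting an error CQE).
 */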
static int process_req(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries,
		       struct ibv_wc *wc, uint16_t hw_cons,
		       enum ibv_wc_status status, int force)
{
	struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context);
	uint16_t cnt = 0;

	while (num_entries && qp->sq.wqe_cons != hw_cons) {
		if (!qp->wqe_wr_id[qp->sq.cons].signaled && !force) {
			/* skip WC */
			goto next_cqe;
		}

		/* fill WC */
		wc->status = status;
		wc->wc_flags = 0;
		wc->qp_num = qp->qp_id;

		/* common section */
		wc->wr_id = qp->wqe_wr_id[qp->sq.cons].wr_id;
		wc->opcode = qp->wqe_wr_id[qp->sq.cons].opcode;

		switch (wc->opcode) {
		case IBV_WC_RDMA_WRITE:
			wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len;
			DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ,
				   "POLL REQ CQ: IBV_WC_RDMA_WRITE byte_len=%d\n",
				   qp->wqe_wr_id[qp->sq.cons].bytes_len);
			break;
		case IBV_WC_COMP_SWAP:
		case IBV_WC_FETCH_ADD:
			wc->byte_len = 8;
			break;
		case IBV_WC_RDMA_READ:
		case IBV_WC_SEND:
		case IBV_WC_BIND_MW:
			wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len;
			DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ,
				   "POLL REQ CQ: IBV_WC_RDMA_READ / IBV_WC_SEND\n");
			break;
		default:
			break;
		}

		num_entries--;
		wc++;
		cnt++;
next_cqe:
		while (qp->wqe_wr_id[qp->sq.cons].wqe_size--)
			qelr_chain_consume(&qp->sq.chain);
		qelr_inc_sw_cons_u16(&qp->sq);
	}

	return cnt;
}

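/*
 * Handle a requester CQE. Success and flush statuses complete WQEs up to
 * sq_cons; any other error status first completes the preceding WQEs as
 * successful and then emits a single WC carrying the translated error.
 */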
static int qelr_poll_cq_req(struct qelr_qp *qp, struct qelr_cq *cq,
			    int num_entries, struct ibv_wc *wc,
			    struct rdma_cqe_requester *req)
{
	struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context);
	uint16_t sq_cons = le16toh(req->sq_cons);
	int cnt = 0;

	switch (req->status) {
	case RDMA_CQE_REQ_STS_OK:
		cnt = process_req(qp, cq, num_entries, wc, sq_cons,
				  IBV_WC_SUCCESS, 0);
		break;
	case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR:
		DP_ERR(cxt->dbg_fp,
		       "Error: POLL CQ with ROCE_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. QP icid=0x%x\n",
		       qp->sq.icid);
		cnt = process_req(qp, cq, num_entries, wc, sq_cons,
				  IBV_WC_WR_FLUSH_ERR, 1);
		break;
	default: /* other errors case */
		/* process all WQE before the consumer */
		qp->state = QELR_QPS_ERR;
		cnt = process_req(qp, cq, num_entries, wc, sq_cons - 1,
				  IBV_WC_SUCCESS, 0);
		wc += cnt;
		/* if we have extra WC fill it with actual error info */
		if (cnt < num_entries) {
			enum ibv_wc_status wc_status;

			switch (req->status) {
			case    RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_BAD_RESP_ERR;
				break;
			case    RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_LOC_LEN_ERR;
				break;
			case    RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_LOC_QP_OP_ERR;
				break;
			case    RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_LOC_PROT_ERR;
				break;
			case    RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_MW_BIND_ERR;
				break;
			case    RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_REM_INV_REQ_ERR;
				break;
			case    RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_REM_ACCESS_ERR;
				break;
			case    RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_REM_OP_ERR;
				break;
			case    RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR:
				DP_ERR(cxt->dbg_fp,
				       "Error: POLL CQ with RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_RNR_RETRY_EXC_ERR;
				break;
			case    RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR:
				DP_ERR(cxt->dbg_fp,
				       "RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_RETRY_EXC_ERR;
				break;
			default:
				DP_ERR(cxt->dbg_fp,
				       "IBV_WC_GENERAL_ERR. QP icid=0x%x\n",
				       qp->sq.icid);
				wc_status = IBV_WC_GENERAL_ERR;
			}

			cnt += process_req(qp, cq, 1, wc, sq_cons, wc_status,
					   1 /* force use of WC */);
		}
	}

	return cnt;
}

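/* Fill a single work completion from a responder CQE; shared by the
 * RQ and SRQ receive paths.
 */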
static void __process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq,
			       struct ibv_wc *wc,
			       struct rdma_cqe_responder *resp, uint64_t wr_id)
{
	struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context);
	enum ibv_wc_status wc_status = IBV_WC_SUCCESS;
	uint8_t flags;

	wc->opcode = IBV_WC_RECV;
	wc->wr_id = wr_id;
	wc->wc_flags = 0;
	switch (resp->status) {
	case RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR:
		wc_status = IBV_WC_LOC_ACCESS_ERR;
		break;
	case RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR:
		wc_status = IBV_WC_LOC_LEN_ERR;
		break;
	case RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR:
		wc_status = IBV_WC_LOC_QP_OP_ERR;
		break;
	case RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR:
		wc_status = IBV_WC_LOC_PROT_ERR;
		break;
	case RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR:
		wc_status = IBV_WC_MW_BIND_ERR;
		break;
	case RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR:
		wc_status = IBV_WC_REM_INV_RD_REQ_ERR;
		break;
	case RDMA_CQE_RESP_STS_OK:
		wc_status = IBV_WC_SUCCESS;
		wc->byte_len = le32toh(resp->length);

		flags = resp->flags & QELR_RESP_RDMA_IMM;

		switch (flags) {
		case QELR_RESP_RDMA_IMM:
			/* update opcode */
			wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
			SWITCH_FALLTHROUGH;
		case QELR_RESP_IMM:
			wc->imm_data = htobe32(le32toh(resp->imm_data_or_inv_r_Key));
			wc->wc_flags |= IBV_WC_WITH_IMM;
			break;
		case QELR_RESP_INV:
			wc->invalidated_rkey = le32toh(resp->imm_data_or_inv_r_Key);
			wc->wc_flags |= IBV_WC_WITH_INV;
			break;
		case QELR_RESP_RDMA:
			DP_ERR(cxt->dbg_fp, "Invalid flags detected\n");
			break;
		default:
			/* valid configuration, but nothing to do here */
			break;
		}

		break;
	default:
		/* record the error in wc_status so the common fill below
		 * does not overwrite it with IBV_WC_SUCCESS
		 */
		wc_status = IBV_WC_GENERAL_ERR;
		DP_ERR(cxt->dbg_fp, "Invalid CQE status detected\n");
	}

	/* fill WC */
	wc->status = wc_status;
	wc->qp_num = qp->qp_id;
}

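/* Complete a single SRQ receive: the WR id is carried in the CQE itself
 * (srq_wr_id) and the SRQ consumer count is advanced.
 */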
static int process_resp_one_srq(struct qelr_qp *qp, struct qelr_cq *cq,
				struct ibv_wc *wc,
				struct rdma_cqe_responder *resp)
{
	struct qelr_srq_hwq_info *hw_srq = &qp->srq->hw_srq;
	uint64_t wr_id;

	wr_id = (((uint64_t)(le32toh(resp->srq_wr_id.hi))) << 32) +
		le32toh(resp->srq_wr_id.lo);

	if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) {
		wc->byte_len = 0;
		wc->status = IBV_WC_WR_FLUSH_ERR;
		wc->qp_num = qp->qp_id;
		wc->wr_id = wr_id;
	} else {
		__process_resp_one(qp, cq, wc, resp, wr_id);
	}

	hw_srq->wr_cons_cnt++;

	return 1;
}

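/* Complete a single RQ receive, then release its WQEs from the RQ chain
 * and advance the SW consumer.
 */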
static int process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq,
			    struct ibv_wc *wc, struct rdma_cqe_responder *resp)
{
	uint64_t wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id;

	__process_resp_one(qp, cq, wc, resp, wr_id);

	while (qp->rqe_wr_id[qp->rq.cons].wqe_size--)
		qelr_chain_consume(&qp->rq.chain);

	qelr_inc_sw_cons_u16(&qp->rq);

	return 1;
}

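/* Flush outstanding RQ WQEs up to hw_cons, reporting IBV_WC_WR_FLUSH_ERR
 * for each of them.
 */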
static int process_resp_flush(struct qelr_qp *qp, struct qelr_cq *cq,
			      int num_entries, struct ibv_wc *wc,
			      uint16_t hw_cons)
{
	uint16_t cnt = 0;

	while (num_entries && qp->rq.wqe_cons != hw_cons) {
		/* fill WC */
		wc->status = IBV_WC_WR_FLUSH_ERR;
		wc->qp_num = qp->qp_id;
		wc->byte_len = 0;
		wc->wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id;
		num_entries--;
		wc++;
		cnt++;
		while (qp->rqe_wr_id[qp->rq.cons].wqe_size--)
			qelr_chain_consume(&qp->rq.chain);
		qelr_inc_sw_cons_u16(&qp->rq);
	}

	return cnt;
}

/* return latest CQE (needs processing) */
static union rdma_cqe *get_cqe(struct qelr_cq *cq)
{
	return cq->latest_cqe;
}

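/* Consume the requester CQE once the SQ SW consumer has caught up with
 * the consumer index reported in the CQE.
 */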
static void try_consume_req_cqe(struct qelr_cq *cq, struct qelr_qp *qp,
				struct rdma_cqe_requester *req, int *update)
{
	uint16_t sq_cons = le16toh(req->sq_cons);

	if (sq_cons == qp->sq.wqe_cons) {
		consume_cqe(cq);
		*update |= 1;
	}
}

/* used with flush only, when resp->rq_cons is valid */
static void try_consume_resp_cqe(struct qelr_cq *cq, struct qelr_qp *qp,
				 uint16_t rq_cons, int *update)
{
	if (rq_cons == qp->rq.wqe_cons) {
		consume_cqe(cq);
		*update |= 1;
	}
}

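/* Responder CQE on an SRQ: always produces exactly one WC and consumes
 * the CQE.
 */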
static int qelr_poll_cq_resp_srq(struct qelr_qp *qp, struct qelr_cq *cq,
				 int num_entries, struct ibv_wc *wc,
				 struct rdma_cqe_responder *resp, int *update)
{
	int cnt;

	cnt = process_resp_one_srq(qp, cq, wc, resp);
	consume_cqe(cq);
	*update |= 1;

	return cnt;
}

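/* Responder CQE on a regular RQ: flush errors complete all RQ WQEs up to
 * rq_cons, any other status completes a single receive.
 */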
static int qelr_poll_cq_resp(struct qelr_qp *qp, struct qelr_cq *cq,
			     int num_entries, struct ibv_wc *wc,
			     struct rdma_cqe_responder *resp, int *update)
{
	uint16_t rq_cons = le16toh(resp->rq_cons);
	int cnt;

	if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) {
		cnt = process_resp_flush(qp, cq, num_entries, wc, rq_cons);
		try_consume_resp_cqe(cq, qp, rq_cons, update);
	} else {
		cnt = process_resp_one(qp, cq, wc, resp);
		consume_cqe(cq);
		*update |= 1;
	}

	return cnt;
}

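/* Write the consumer index and arm flags to the CQ doorbell and mirror
 * the value into the doorbell recovery area.
 */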
static void doorbell_cq(struct qelr_cq *cq, uint32_t cons, uint8_t flags)
{
	mmio_wc_start();
	cq->db.data.agg_flags = flags;
	cq->db.data.value = htole32(cons);

	writeq(cq->db.raw, cq->db_addr);
	/* copy value to doorbell recovery mechanism */
	cq->db_rec_addr->db_data = cq->db.raw;
	mmio_flush_writes();
}

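/* Poll up to num_entries work completions from the CQ; the doorbell is
 * rung once at the end if any CQE was consumed.
 */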
int qelr_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc)
{
	struct qelr_cq *cq = get_qelr_cq(ibcq);
	int done = 0;
	union rdma_cqe *cqe = get_cqe(cq);
	int update = 0;
	uint32_t db_cons;

	while (num_entries && is_valid_cqe(cq, cqe)) {
		int cnt = 0;
		struct qelr_qp *qp;

		/* prevent speculative reads of any field of CQE */
		udma_from_device_barrier();

		qp = cqe_get_qp(cqe);
		if (!qp) {
			DP_ERR(stderr,
			       "Error: CQE QP pointer is NULL. CQE=%p\n", cqe);
			break;
		}

		switch (cqe_get_type(cqe)) {
		case RDMA_CQE_TYPE_REQUESTER:
			cnt = qelr_poll_cq_req(qp, cq, num_entries, wc,
					       &cqe->req);
			try_consume_req_cqe(cq, qp, &cqe->req, &update);
			break;
		case RDMA_CQE_TYPE_RESPONDER_RQ:
			cnt = qelr_poll_cq_resp(qp, cq, num_entries, wc,
						&cqe->resp, &update);
			break;
		case RDMA_CQE_TYPE_RESPONDER_SRQ:
			cnt = qelr_poll_cq_resp_srq(qp, cq, num_entries, wc,
						    &cqe->resp, &update);
			break;
		case RDMA_CQE_TYPE_INVALID:
		default:
			printf("Error: invalid CQE type = %d\n",
			       cqe_get_type(cqe));
		}
		num_entries -= cnt;
		wc += cnt;
		done += cnt;

		cqe = get_cqe(cq);
	}

	db_cons = qelr_chain_get_cons_idx_u32(&cq->chain) - 1;
	if (update) {
		/* the doorbell notifies HW about the latest VALID entry,
		 * but the chain already points to the next INVALID one
		 */
		doorbell_cq(cq, db_cons, cq->arm_flags);
	}

	return done;
}

void qelr_cq_event(struct ibv_cq *ibcq)
{
	/* Trigger received, can reset arm flags */
	struct qelr_cq *cq = get_qelr_cq(ibcq);

	cq->arm_flags = 0;
}

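/* Arm the CQ for a completion notification; 'solicited' selects the
 * solicited-event variant of the arm command.
 */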
int qelr_arm_cq(struct ibv_cq *ibcq, int solicited)
{
	struct qelr_cq *cq = get_qelr_cq(ibcq);
	uint32_t db_cons;

	db_cons = qelr_chain_get_cons_idx_u32(&cq->chain) - 1;
	cq->arm_flags = solicited ? DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD :
				    DQ_UCM_ROCE_CQ_ARM_CF_CMD;

	doorbell_cq(cq, db_cons, cq->arm_flags);

	return 0;
}

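/* Look up the CQ or QP an async event refers to; per-event handling is
 * not implemented yet.
 */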
void qelr_async_event(struct ibv_context *context,
		      struct ibv_async_event *event)
{
	struct qelr_cq *cq = NULL;
	struct qelr_qp *qp = NULL;

	switch (event->event_type) {
	case IBV_EVENT_CQ_ERR:
		cq = get_qelr_cq(event->element.cq);
		break;
	case IBV_EVENT_QP_FATAL:
	case IBV_EVENT_QP_REQ_ERR:
	case IBV_EVENT_QP_ACCESS_ERR:
	case IBV_EVENT_PATH_MIG_ERR:
		qp = get_qelr_qp(event->element.qp);
		break;
	case IBV_EVENT_SQ_DRAINED:
	case IBV_EVENT_PATH_MIG:
	case IBV_EVENT_COMM_EST:
	case IBV_EVENT_QP_LAST_WQE_REACHED:
		break;
	case IBV_EVENT_SRQ_LIMIT_REACHED:
	case IBV_EVENT_SRQ_ERR:
		return;
	case IBV_EVENT_PORT_ACTIVE:
	case IBV_EVENT_PORT_ERR:
		break;
	default:
		break;
	}

	fprintf(stderr, "qelr_async_event not implemented yet cq=%p qp=%p\n",
		cq, qp);
}