/**
 * Copyright (C) Mellanox Technologies Ltd. 2016-2018. ALL RIGHTS RESERVED.
 * See file LICENSE for terms.
 */

#ifndef UCT_DC_IFACE_H
#define UCT_DC_IFACE_H

#include
#include
#include
#include
#include
#include
#include

/*
 * HW tag matching
 */
#if IBV_HW_TM
#  if HAVE_INFINIBAND_TM_TYPES_H
/* upstream tm_types.h doesn't provide the RAVH header */
struct ibv_ravh {
    uint32_t    sl_dct;
    uint32_t    reserved;    /* must be zero */
    uint64_t    dc_access_key;
};
#  else
#    define ibv_ravh ibv_exp_tmh_ravh
#  endif
#  define UCT_DC_RNDV_HDR_LEN   (sizeof(struct ibv_rvh) + \
                                 sizeof(struct ibv_ravh))
#else
#  define UCT_DC_RNDV_HDR_LEN   0
#endif

#define UCT_DC_MLX5_IFACE_MAX_DCIS   16

#define UCT_DC_MLX5_IFACE_ADDR_TM_ENABLED(_addr) \
    (!!((_addr)->flags & UCT_DC_MLX5_IFACE_ADDR_HW_TM))


typedef struct uct_dc_mlx5_ep     uct_dc_mlx5_ep_t;
typedef struct uct_dc_mlx5_iface  uct_dc_mlx5_iface_t;


typedef enum {
    UCT_DC_MLX5_IFACE_ADDR_HW_TM   = UCS_BIT(0),
    UCT_DC_MLX5_IFACE_ADDR_DC_V1   = UCS_BIT(1),
    UCT_DC_MLX5_IFACE_ADDR_DC_V2   = UCS_BIT(2),
    UCT_DC_MLX5_IFACE_ADDR_DC_VERS = UCT_DC_MLX5_IFACE_ADDR_DC_V1 |
                                     UCT_DC_MLX5_IFACE_ADDR_DC_V2
} uct_dc_mlx5_iface_addr_flags_t;


typedef struct uct_dc_mlx5_iface_addr {
    uct_ib_uint24_t   qp_num;
    uint8_t           atomic_mr_id;
    uint8_t           flags;
} UCS_S_PACKED uct_dc_mlx5_iface_addr_t;


/**
 * dci policies:
 * - fixed: all eps always use the same dci, no matter what
 * - dcs:
 *    - an ep uses its already assigned dci, or
 *    - a free dci is assigned in LIFO (stack) order, or
 *    - the ep has no resources to transmit
 *    - on FULL completion (once there are no outstanding ops)
 *      the dci is pushed back to the stack of free dcis
 *      (see the LIFO sketch after the iface struct below)
 *    It is possible that an ep never releases its dci:
 *    the ep sends, gets some completion, sends more, and so on.
 * - dcs + quota:
 *    - same as dcs, with the following addition:
 *    - if the dci can not tx, and there are eps waiting for dci
 *      allocation, the ep goes into the tx_wait state
 *    - in the tx_wait state:
 *      - the ep can not transmit while there are eps waiting for
 *        dci allocation. This prevents starvation.
 *      - if there are no eps waiting for dci allocation, the ep
 *        goes back to the normal state
 * - random
 *    - a dci is chosen by random() % ndci
 *      (see the sketch after the enum below)
 *    - the ep keeps using the dci as long as it has outstanding sends
 *
 * Not implemented policies:
 *
 * - hash:
 *    - a dci is allocated to the ep by some hash function,
 *      for example dlid % ndci
 */
typedef enum {
    UCT_DC_TX_POLICY_DCS,
    UCT_DC_TX_POLICY_DCS_QUOTA,
    UCT_DC_TX_POLICY_RAND,
    UCT_DC_TX_POLICY_LAST
} uct_dc_tx_policy_t;
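
/*
 * Illustrative sketch only (an assumption, not part of the UCX API): with
 * the "rand" policy, a dci index can be derived from a per-iface seed with
 * rand_r() (declared in <stdlib.h>). The real selection logic lives in the
 * ep/iface sources; the function name below is hypothetical.
 */
static UCS_F_MAYBE_UNUSED uint8_t
uct_dc_mlx5_example_rand_dci(unsigned *rand_seed, uint8_t ndci)
{
    /* pick a dci uniformly at random; the ep then keeps using it for as
     * long as it has outstanding sends */
    return (uint8_t)(rand_r(rand_seed) % ndci);
}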

typedef struct uct_dc_mlx5_iface_config {
    uct_rc_iface_common_config_t       super;
    uct_rc_mlx5_iface_common_config_t  rc_mlx5_common;
    uct_ud_iface_common_config_t       ud_common;
    int                                ndci;
    int                                tx_policy;
    unsigned                           quota;
    unsigned                           rand_seed;
    uct_ud_mlx5_iface_common_config_t  mlx5_ud;
} uct_dc_mlx5_iface_config_t;


typedef struct uct_dc_dci {
    uct_rc_txqp_t                 txqp; /* DCI qp */
    uct_ib_mlx5_txwq_t            txwq; /* DCI mlx5 wq */
    union {
        uct_dc_mlx5_ep_t          *ep;  /* points to the endpoint that currently
                                           owns the dci. Relevant only for the
                                           dcs and dcs quota policies. */
        ucs_arbiter_group_t       arb_group; /* pending group, relevant for the
                                                rand policy. With rand, groups
                                                are not descheduled until all
                                                elements are processed. It is
                                                better to have as many scheduled
                                                groups as dcis, rather than as
                                                many as eps. */
    };
#if UCS_ENABLE_ASSERT
    uint8_t                       flags; /* debug state, @ref uct_dc_dci_state_t */
#endif
} uct_dc_dci_t;


typedef struct uct_dc_fc_sender_data {
    uint64_t                      ep;
    struct {
        int                       is_global;
        union ibv_gid             gid;
    } UCS_S_PACKED global;
} UCS_S_PACKED uct_dc_fc_sender_data_t;

typedef struct uct_dc_fc_request {
    uct_rc_fc_request_t           super;
    uct_dc_fc_sender_data_t       sender;
    uint32_t                      dct_num;

    /* The lid can be stored in either BE or LE order. The endianness
     * depends on the transport (BE for mlx5 and LE for dc verbs). */
    uint16_t                      lid;
} uct_dc_fc_request_t;


struct uct_dc_mlx5_iface {
    uct_rc_mlx5_iface_common_t    super;
    struct {
        /* Array of dcis */
        uct_dc_dci_t              dcis[UCT_DC_MLX5_IFACE_MAX_DCIS];

        uint8_t                   ndci;            /* Number of DCIs */
        uct_dc_tx_policy_t        policy;          /* dci selection algorithm */
        int16_t                   available_quota; /* if available tx is lower,
                                                      let another endpoint use
                                                      the dci */

        /* LIFO is only relevant for the dcs allocation policy */
        uint8_t                   stack_top;       /* dci stack top */
        uint8_t                   dcis_stack[UCT_DC_MLX5_IFACE_MAX_DCIS];
                                                   /* LIFO of indexes of
                                                      available dcis */

        ucs_arbiter_t             dci_arbiter;

        /* Used to send grant messages for all peers */
        uct_dc_mlx5_ep_t          *fc_ep;

        /* List of destroyed endpoints waiting for a credit grant */
        ucs_list_link_t           gc_list;

        /* Number of expected FC grants */
        unsigned                  fc_grants;

        /* Seed used for random dci allocation */
        unsigned                  rand_seed;

        ucs_arbiter_callback_t    pend_cb;
    } tx;

    struct {
        uct_ib_mlx5_qp_t          dct;
    } rx;

    uint8_t                       version_flag;

    uct_ud_mlx5_iface_common_t    ud_common;
};
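
/*
 * Illustrative sketch only (an assumption, not the functions UCX uses): the
 * "dcs" policy treats tx.dcis_stack as a LIFO of free dci indexes, with
 * tx.stack_top counting the dcis currently in use. The hypothetical helpers
 * below show the push/pop discipline; the real alloc/release logic lives in
 * the ep sources.
 */
static UCS_F_MAYBE_UNUSED uint8_t
uct_dc_mlx5_example_dci_pop(uct_dc_mlx5_iface_t *iface)
{
    /* take the most recently released dci from the top of the stack */
    return iface->tx.dcis_stack[iface->tx.stack_top++];
}

static UCS_F_MAYBE_UNUSED void
uct_dc_mlx5_example_dci_push(uct_dc_mlx5_iface_t *iface, uint8_t dci)
{
    /* once the dci has no outstanding ops, return it to the stack */
    iface->tx.dcis_stack[--iface->tx.stack_top] = dci;
}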
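
/*
 * Illustrative sketch only (an assumption): under "dcs + quota" the owner of
 * a dci should stop transmitting (tx_wait) once the dci's remaining tx depth
 * drops to tx.available_quota while other eps are queued for a dci, so the
 * dci can drain and be handed over. The function name is hypothetical.
 */
static UCS_F_MAYBE_UNUSED int
uct_dc_mlx5_example_quota_exceeded(uct_dc_mlx5_iface_t *iface, uint8_t dci)
{
    /* true when the remaining tx slots on this dci are within the quota
     * reserved for handing the dci over to a waiting ep */
    return uct_rc_txqp_available(&iface->tx.dcis[dci].txqp) <=
           iface->tx.available_quota;
}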

extern ucs_config_field_t uct_dc_mlx5_iface_config_table[];

ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface);

int uct_dc_mlx5_iface_is_reachable(const uct_iface_h tl_iface,
                                   const uct_device_addr_t *dev_addr,
                                   const uct_iface_addr_t *iface_addr);

ucs_status_t uct_dc_mlx5_iface_get_address(uct_iface_h tl_iface,
                                           uct_iface_addr_t *iface_addr);

ucs_status_t uct_dc_mlx5_iface_flush(uct_iface_h tl_iface, unsigned flags,
                                     uct_completion_t *comp);

void uct_dc_mlx5_iface_set_quota(uct_dc_mlx5_iface_t *iface,
                                 uct_dc_mlx5_iface_config_t *config);

ucs_status_t uct_dc_mlx5_iface_init_fc_ep(uct_dc_mlx5_iface_t *iface);

void uct_dc_mlx5_iface_cleanup_fc_ep(uct_dc_mlx5_iface_t *iface);

ucs_status_t uct_dc_mlx5_iface_fc_grant(uct_pending_req_t *self);

ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface,
                                          unsigned qp_num, uct_rc_hdr_t *hdr,
                                          unsigned length, uint32_t imm_data,
                                          uint16_t lid, unsigned flags);

void uct_dc_mlx5_iface_set_av_sport(uct_dc_mlx5_iface_t *iface,
                                    uct_ib_mlx5_base_av_t *av,
                                    uint32_t remote_dctn);

void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface);

void uct_dc_mlx5_iface_init_version(uct_dc_mlx5_iface_t *iface, uct_md_h md);

ucs_status_t uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface,
                                         uct_dc_dci_t *dci);

ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface,
                                           uct_dc_dci_t *dci);

void uct_dc_mlx5_iface_dcis_destroy(uct_dc_mlx5_iface_t *iface, int max);

#if HAVE_DEVX

ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface);

ucs_status_t uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface);

ucs_status_t uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface,
                                                uct_ib_mlx5_qp_t *qp);

#else

static UCS_F_MAYBE_UNUSED ucs_status_t
uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface)
{
    return UCS_ERR_UNSUPPORTED;
}

static UCS_F_MAYBE_UNUSED ucs_status_t
uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface)
{
    return UCS_ERR_UNSUPPORTED;
}

static UCS_F_MAYBE_UNUSED ucs_status_t
uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface,
                                   uct_ib_mlx5_qp_t *qp)
{
    return UCS_ERR_UNSUPPORTED;
}

#endif

#if IBV_HW_TM
static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_iface_fill_ravh(struct ibv_ravh *ravh, uint32_t dct_num)
{
    ravh->sl_dct        = htobe32(dct_num);
    ravh->dc_access_key = htobe64(UCT_IB_KEY);
    ravh->reserved      = 0;
}
#endif

/* TODO:
 * use a better search algorithm (perfect hash, bsearch, hash) ???
 *
 * linear search is most probably the best way to go,
 * because the number of dcis is usually small
 */
static inline uint8_t uct_dc_mlx5_iface_dci_find(uct_dc_mlx5_iface_t *iface,
                                                 uint32_t qp_num)
{
    uct_dc_dci_t *dcis = iface->tx.dcis;
    int i, ndci = iface->tx.ndci;

    for (i = 0; i < ndci; i++) {
        if (dcis[i].txwq.super.qp_num == qp_num) {
            return i;
        }
    }
    ucs_fatal("DCI (qpnum=%d) does not exist", qp_num);
}

static UCS_F_ALWAYS_INLINE int
uct_dc_mlx5_iface_has_tx_resources(uct_dc_mlx5_iface_t *iface)
{
    return !ucs_mpool_is_empty(&iface->super.super.tx.mp);
}

static inline int
uct_dc_mlx5_iface_dci_has_tx_resources(uct_dc_mlx5_iface_t *iface, uint8_t dci)
{
    return uct_rc_txqp_available(&iface->tx.dcis[dci].txqp) > 0;
}

/* returns the pending queue of eps waiting for tx resources */
static inline ucs_arbiter_t *uct_dc_mlx5_iface_tx_waitq(uct_dc_mlx5_iface_t *iface)
{
    return &iface->tx.dci_arbiter;
}

/* returns the pending queue of eps waiting for dci allocation */
static inline ucs_arbiter_t *uct_dc_mlx5_iface_dci_waitq(uct_dc_mlx5_iface_t *iface)
{
    return &iface->super.super.tx.arbiter;
}

static inline int
uct_dc_mlx5_iface_dci_has_outstanding(uct_dc_mlx5_iface_t *iface, int dci)
{
    uct_rc_txqp_t *txqp;

    txqp = &iface->tx.dcis[dci].txqp;
    return uct_rc_txqp_available(txqp) <
           (int16_t)iface->super.super.config.tx_qp_len;
}

static inline ucs_status_t
uct_dc_mlx5_iface_flush_dci(uct_dc_mlx5_iface_t *iface, int dci)
{
    if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) {
        return UCS_OK;
    }

    ucs_trace_poll("dci %d is not flushed %d/%d", dci,
                   iface->tx.dcis[dci].txqp.available,
                   iface->super.super.config.tx_qp_len);
    ucs_assertv(uct_rc_txqp_unsignaled(&iface->tx.dcis[dci].txqp) == 0,
                "unsignaled send is not supported!!!");
    return UCS_INPROGRESS;
}

#endif