/*
* Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "dst_entry_tcp.h"
#include <netinet/tcp.h>
#define MODULE_NAME "dst_tcp"
#define dst_tcp_logpanic __log_panic
#define dst_tcp_logerr __log_err
#define dst_tcp_logwarn __log_warn
#define dst_tcp_loginfo __log_info
#define dst_tcp_logdbg __log_info_dbg
#define dst_tcp_logfunc __log_info_fine
#define dst_tcp_logfine __log_info_fine
#define dst_tcp_logfuncall __log_info_finer
dst_entry_tcp::dst_entry_tcp(in_addr_t dst_ip, uint16_t dst_port, uint16_t src_port,
socket_data &sock_data , resource_allocation_key &ring_alloc_logic):
dst_entry(dst_ip, dst_port, src_port, sock_data, ring_alloc_logic),
m_n_sysvar_tx_bufs_batch_tcp(safe_mce_sys().tx_bufs_batch_tcp)
{
}
dst_entry_tcp::~dst_entry_tcp()
{
}
transport_t dst_entry_tcp::get_transport(sockaddr_in to)
{
NOT_IN_USE(to);
return TRANS_VMA;
}
#ifdef DEFINED_TSO
ssize_t dst_entry_tcp::fast_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr)
{
int ret = 0;
tx_packet_template_t* p_pkt;
tcp_iovec* p_tcp_iov = NULL;
size_t hdr_alignment_diff = 0;
/* The header is aligned for fast copy but we need to maintain this diff
* in order to get the real header pointer easily
*/
hdr_alignment_diff = m_header.m_aligned_l2_l3_len - m_header.m_total_hdr_len;
p_tcp_iov = (tcp_iovec*)p_iov;
attr.flags = (vma_wr_tx_packet_attr)(attr.flags | VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM);
/* Supported scenarios:
* 1. Standard:
* Use lwip memory buffer (zero copy) in case iov consists of single buffer with single TCP packet.
* 2. Large send offload:
* Use lwip sequence of memory buffers (zero copy) in case attribute is set as TSO and no retransmission.
* Size of iov can be one or more.
* 3. Simple:
* Use intermediate buffers for data send
*/
if (likely(m_p_ring->is_active_member(p_tcp_iov->p_desc->p_desc_owner, m_id) &&
(is_set(attr.flags, (vma_wr_tx_packet_attr)(VMA_TX_PACKET_TSO)) ||
(sz_iov == 1 && !is_set(attr.flags, (vma_wr_tx_packet_attr)(VMA_TX_PACKET_REXMIT)))))) {
size_t total_packet_len = 0;
vma_ibv_send_wr send_wqe;
wqe_send_handler send_wqe_h;
/* iov_base is a pointer to TCP header and data
* so p_pkt should point to L2
*/
p_pkt = (tx_packet_template_t*)((uint8_t*)p_tcp_iov[0].iovec.iov_base - m_header.m_aligned_l2_l3_len);
/* iov_len is a size of TCP header and data
* m_total_hdr_len is a size of L2/L3 header
*/
total_packet_len = p_tcp_iov[0].iovec.iov_len + m_header.m_total_hdr_len;
/* copy just L2/L3 headers to p_pkt */
m_header.copy_l2_ip_hdr(p_pkt);
/* L3(Total Length) field means nothing in case TSO usage and can be set as zero but
* setting this field to actual value allows to do valid call for scenario
* when payload size less or equal to mss
*/
p_pkt->hdr.m_ip_hdr.tot_len = (htons)(p_tcp_iov[0].iovec.iov_len + m_header.m_ip_header_len);
if ((total_packet_len < m_max_inline) && (1 == sz_iov)) {
m_p_send_wqe = &m_inline_send_wqe;
m_sge[0].addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff);
m_sge[0].length = total_packet_len;
} else if (is_set(attr.flags, (vma_wr_tx_packet_attr)(VMA_TX_PACKET_TSO))) {
/* update send work request. do not expect noninlined scenario */
send_wqe_h.init_not_inline_wqe(send_wqe, m_sge, sz_iov);
send_wqe_h.enable_tso(send_wqe,
(void *)((uint8_t*)p_pkt + hdr_alignment_diff),
m_header.m_total_hdr_len + p_pkt->hdr.m_tcp_hdr.doff * 4,
attr.mss);
m_p_send_wqe = &send_wqe;
m_sge[0].addr = (uintptr_t)((uint8_t *)&p_pkt->hdr.m_tcp_hdr + p_pkt->hdr.m_tcp_hdr.doff * 4);
m_sge[0].length = p_tcp_iov[0].iovec.iov_len - p_pkt->hdr.m_tcp_hdr.doff * 4;
} else {
m_p_send_wqe = &m_not_inline_send_wqe;
m_sge[0].addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff);
m_sge[0].length = total_packet_len;
}
/* save pointers to ip and tcp headers for software checksum calculation */
p_tcp_iov[0].p_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr;
p_tcp_iov[0].p_desc->tx.p_tcp_h =(struct tcphdr*)((uint8_t*)(&(p_pkt->hdr.m_ip_hdr)) + sizeof(p_pkt->hdr.m_ip_hdr));
p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.ref++;
/* set wr_id as a pointer to memory descriptor */
m_p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc;
/* Update scatter gather element list
* ref counter is incremented for the first memory descriptor only because it is needed
* for processing send wr completion (tx batching mode)
*/
m_sge[0].lkey = m_p_ring->get_tx_lkey(m_id);
for (int i = 1; i < sz_iov; ++i) {
m_sge[i].addr = (uintptr_t)p_tcp_iov[i].iovec.iov_base;
m_sge[i].length = p_tcp_iov[i].iovec.iov_len;
m_sge[i].lkey = m_sge[0].lkey;
}
send_lwip_buffer(m_id, m_p_send_wqe, attr.flags);
} else { // We don'nt support inline in this case, since we believe that this a very rare case
mem_buf_desc_t *p_mem_buf_desc;
size_t total_packet_len = 0;
p_mem_buf_desc = get_buffer(is_set(attr.flags, VMA_TX_PACKET_BLOCK));
if (p_mem_buf_desc == NULL) {
ret = -1;
goto out;
}
m_header.copy_l2_ip_hdr((tx_packet_template_t*)p_mem_buf_desc->p_buffer);
// Actually this is not the real packet len we will subtract the alignment diff at the end of the copy
total_packet_len = m_header.m_aligned_l2_l3_len;
for (int i = 0; i < sz_iov; ++i) {
memcpy(p_mem_buf_desc->p_buffer + total_packet_len, p_tcp_iov[i].iovec.iov_base, p_tcp_iov[i].iovec.iov_len);
total_packet_len += p_tcp_iov[i].iovec.iov_len;
}
m_sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + hdr_alignment_diff);
m_sge[0].length = total_packet_len - hdr_alignment_diff;
m_sge[0].lkey = m_p_ring->get_tx_lkey(m_id);
p_pkt = (tx_packet_template_t*)((uint8_t*)p_mem_buf_desc->p_buffer);
p_pkt->hdr.m_ip_hdr.tot_len = (htons)(m_sge[0].length - m_header.m_transport_header_len);
p_mem_buf_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr;
p_mem_buf_desc->tx.p_tcp_h = (struct tcphdr*)((uint8_t*)(&(p_pkt->hdr.m_ip_hdr))+sizeof(p_pkt->hdr.m_ip_hdr));
m_p_send_wqe = &m_not_inline_send_wqe;
m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc;
send_ring_buffer(m_id, m_p_send_wqe, attr.flags);
}
if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) {
m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id,
is_set(attr.flags, VMA_TX_PACKET_BLOCK), m_n_sysvar_tx_bufs_batch_tcp);
}
out:
if (unlikely(is_set(attr.flags, VMA_TX_PACKET_REXMIT))) {
m_p_ring->inc_tx_retransmissions_stats(m_id);
}
return ret;
}
ssize_t dst_entry_tcp::slow_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr,
struct vma_rate_limit_t &rate_limit, int flags /*= 0*/,
socket_fd_api* sock /*= 0*/, tx_call_t call_type /*= 0*/)
{
ssize_t ret_val = -1;
NOT_IN_USE(sock);
NOT_IN_USE(call_type);
NOT_IN_USE(flags);
m_slow_path_lock.lock();
prepare_to_send(rate_limit, true);
if (m_b_is_offloaded) {
if (!is_valid()) { // That means that the neigh is not resolved yet
//there is a copy inside so we should not update any ref-counts
ret_val = pass_buff_to_neigh(p_iov, sz_iov);
}
else {
ret_val = fast_send(p_iov, sz_iov, attr);
}
}
else {
dst_tcp_logdbg("Dst_entry is not offloaded, bug?");
}
m_slow_path_lock.unlock();
return ret_val;
}
#else
ssize_t dst_entry_tcp::fast_send(const iovec* p_iov, const ssize_t sz_iov, bool is_dummy, bool b_blocked /*= true*/, bool is_rexmit /*= false*/)
{
int ret = 0;
tx_packet_template_t* p_pkt;
mem_buf_desc_t *p_mem_buf_desc;
size_t total_packet_len = 0;
// The header is aligned for fast copy but we need to maintain this diff in order to get the real header pointer easily
size_t hdr_alignment_diff = m_header.m_aligned_l2_l3_len - m_header.m_total_hdr_len;
tcp_iovec* p_tcp_iov = NULL;
bool no_copy = true;
if (likely(sz_iov == 1 && !is_rexmit)) {
p_tcp_iov = (tcp_iovec*)p_iov;
if (unlikely(!m_p_ring->is_active_member(p_tcp_iov->p_desc->p_desc_owner, m_id))) {
no_copy = false;
dst_tcp_logdbg("p_desc=%p wrong desc_owner=%p, this ring=%p. did migration occurred?", p_tcp_iov->p_desc, p_tcp_iov->p_desc->p_desc_owner, m_p_ring);
//todo can we handle this in migration (by going over all buffers lwip hold) instead for every send?
}
} else {
no_copy = false;
}
vma_wr_tx_packet_attr attr = (vma_wr_tx_packet_attr)((VMA_TX_PACKET_BLOCK * b_blocked) | (VMA_TX_PACKET_DUMMY * is_dummy) | VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM);
if (likely(no_copy)) {
p_pkt = (tx_packet_template_t*)((uint8_t*)p_tcp_iov[0].iovec.iov_base - m_header.m_aligned_l2_l3_len);
total_packet_len = p_tcp_iov[0].iovec.iov_len + m_header.m_total_hdr_len;
m_header.copy_l2_ip_hdr(p_pkt);
// We've copied to aligned address, and now we must update p_pkt to point to real
// L2 header
//p_pkt = (tx_packet_template_t*)((uint8_t*)p_pkt + hdr_alignment_diff);
p_pkt->hdr.m_ip_hdr.tot_len = (htons)(p_tcp_iov[0].iovec.iov_len + m_header.m_ip_header_len);
m_sge[0].addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff);
m_sge[0].length = total_packet_len;
if (total_packet_len < m_max_inline) { // inline send
m_p_send_wqe = &m_inline_send_wqe;
} else {
m_p_send_wqe = &m_not_inline_send_wqe;
}
m_p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc;
p_tcp_iov[0].p_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr;
p_tcp_iov[0].p_desc->tx.p_tcp_h =(struct tcphdr*)((uint8_t*)(&(p_pkt->hdr.m_ip_hdr))+sizeof(p_pkt->hdr.m_ip_hdr));
send_lwip_buffer(m_id, m_p_send_wqe, attr);
/* for DEBUG */
if ((uint8_t*)m_sge[0].addr < p_tcp_iov[0].p_desc->p_buffer || (uint8_t*)p_pkt < p_tcp_iov[0].p_desc->p_buffer) {
dst_tcp_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n",
(int)(p_tcp_iov[0].p_desc->p_buffer - (uint8_t*)m_sge[0].addr), m_header.m_total_hdr_len,
p_tcp_iov[0].p_desc->p_buffer, p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.type,
p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.len, p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.tot_len,
p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff);
}
}
else { // We don'nt support inline in this case, since we believe that this a very rare case
p_mem_buf_desc = get_buffer(b_blocked);
if (p_mem_buf_desc == NULL) {
ret = -1;
goto out;
}
m_header.copy_l2_ip_hdr((tx_packet_template_t*)p_mem_buf_desc->p_buffer);
// Actually this is not the real packet len we will subtract the alignment diff at the end of the copy
total_packet_len = m_header.m_aligned_l2_l3_len;
for (int i = 0; i < sz_iov; ++i) {
memcpy(p_mem_buf_desc->p_buffer + total_packet_len, p_iov[i].iov_base, p_iov[i].iov_len);
total_packet_len += p_iov[i].iov_len;
}
m_sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + hdr_alignment_diff);
m_sge[0].length = total_packet_len - hdr_alignment_diff;
// LKey will be updated in ring->send() // m_sge[0].lkey = p_mem_buf_desc->lkey;
p_pkt = (tx_packet_template_t*)((uint8_t*)p_mem_buf_desc->p_buffer);
p_pkt->hdr.m_ip_hdr.tot_len = (htons)(m_sge[0].length - m_header.m_transport_header_len);
p_mem_buf_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr;
p_mem_buf_desc->tx.p_tcp_h = (struct tcphdr*)((uint8_t*)(&(p_pkt->hdr.m_ip_hdr))+sizeof(p_pkt->hdr.m_ip_hdr));
m_p_send_wqe = &m_not_inline_send_wqe;
m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc;
send_ring_buffer(m_id, m_p_send_wqe, attr);
/* for DEBUG */
if ((uint8_t*)m_sge[0].addr < p_mem_buf_desc->p_buffer) {
dst_tcp_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n",
(int)(p_mem_buf_desc->p_buffer - (uint8_t*)m_sge[0].addr), m_header.m_total_hdr_len,
p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.pbuf.type,
p_mem_buf_desc->lwip_pbuf.pbuf.len, p_mem_buf_desc->lwip_pbuf.pbuf.tot_len,
p_mem_buf_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff);
}
}
if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) {
m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, m_n_sysvar_tx_bufs_batch_tcp);
}
out:
if (unlikely(is_rexmit)) {
m_p_ring->inc_tx_retransmissions_stats(m_id);
}
return ret;
}
ssize_t dst_entry_tcp::slow_send(const iovec* p_iov, size_t sz_iov, bool is_dummy, struct vma_rate_limit_t &rate_limit, bool b_blocked /*= true*/, bool is_rexmit /*= false*/, int flags /*= 0*/, socket_fd_api* sock /*= 0*/, tx_call_t call_type /*= 0*/)
{
ssize_t ret_val = -1;
NOT_IN_USE(sock);
NOT_IN_USE(call_type);
NOT_IN_USE(flags);
m_slow_path_lock.lock();
prepare_to_send(rate_limit, true);
if (m_b_is_offloaded) {
if (!is_valid()) { // That means that the neigh is not resolved yet
//there is a copy inside so we should not update any ref-counts
ret_val = pass_buff_to_neigh(p_iov, sz_iov);
}
else {
ret_val = fast_send(p_iov, sz_iov, is_dummy, b_blocked, is_rexmit);
}
}
else {
dst_tcp_logdbg("Dst_entry is not offloaded, bug?");
}
m_slow_path_lock.unlock();
return ret_val;
}
#endif /* DEFINED_TSO */
ssize_t dst_entry_tcp::slow_send_neigh( const iovec* p_iov, size_t sz_iov, struct vma_rate_limit_t &rate_limit)
{
ssize_t ret_val = -1;
m_slow_path_lock.lock();
prepare_to_send(rate_limit, true);
if (m_b_is_offloaded) {
ret_val = pass_buff_to_neigh(p_iov, sz_iov);
}
else {
dst_tcp_logdbg("Dst_entry is not offloaded, bug?");
}
m_slow_path_lock.unlock();
return ret_val;
}
//The following function supposed to be called under m_lock
void dst_entry_tcp::configure_headers()
{
m_header.init();
dst_entry::configure_headers();
}
ssize_t dst_entry_tcp::pass_buff_to_neigh(const iovec * p_iov, size_t sz_iov, uint16_t packet_id)
{
NOT_IN_USE(packet_id);
m_header_neigh.init();
m_header_neigh.configure_tcp_ports(m_dst_port, m_src_port);
return(dst_entry::pass_buff_to_neigh(p_iov, sz_iov));
}
mem_buf_desc_t* dst_entry_tcp::get_buffer(bool b_blocked /*=false*/)
{
set_tx_buff_list_pending(false);
// Get a bunch of tx buf descriptor and data buffers
if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) {
m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, m_n_sysvar_tx_bufs_batch_tcp);
}
mem_buf_desc_t* p_mem_buf_desc = m_p_tx_mem_buf_desc_list;
if (unlikely(p_mem_buf_desc == NULL)) {
dst_tcp_logfunc("silent packet drop, no buffers!");
}
else {
m_p_tx_mem_buf_desc_list = m_p_tx_mem_buf_desc_list->p_next_desc;
p_mem_buf_desc->p_next_desc = NULL;
// for TX, set lwip payload to the data segment.
// lwip will send it with payload pointing to the tcp header.
p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + m_header.m_aligned_l2_l3_len + sizeof(struct tcphdr);
}
return p_mem_buf_desc;
}
//called from lwip under sockinfo_tcp lock
//handle un-chained pbuf
// only single p_desc
void dst_entry_tcp::put_buffer(mem_buf_desc_t * p_desc)
{
//todo accumulate buffers?
if (unlikely(p_desc == NULL))
return;
if (likely(m_p_ring->is_member(p_desc->p_desc_owner))) {
m_p_ring->mem_buf_desc_return_single_to_owner_tx(p_desc);
} else {
//potential race, ref is protected here by tcp lock, and in ring by ring_tx lock
if (likely(p_desc->lwip_pbuf.pbuf.ref))
p_desc->lwip_pbuf.pbuf.ref--;
else
dst_tcp_logerr("ref count of %p is already zero, double free??", p_desc);
if (p_desc->lwip_pbuf.pbuf.ref == 0) {
p_desc->p_next_desc = NULL;
g_buffer_pool_tx->put_buffers_thread_safe(p_desc);
}
}
}