Blob Blame History Raw
/*
 * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include "select_call.h"

#include "utils/bullseye.h"
#include "vlogger/vlogger.h"
#include <vma/util/vtypes.h>
#include <vma/sock/sock-redirect.h>
#include <vma/sock/fd_collection.h>
#include <vma/dev/net_device_table_mgr.h>

#define MODULE_NAME "select_call:"


#define FD_COPY(__fddst, __fdsrc, __nfds) \
		memcpy(__FDS_BITS(__fddst), __FDS_BITS(__fdsrc), ((__nfds) + 7) >> 3)

#undef  FD_ZERO // Remove select.h origianl FD_ZERO and define our own with limit size
#define FD_ZERO(__fddst, __nfds) \
		memset(__FDS_BITS(__fddst), 0, ((__nfds) + 7) >> 3)
iomux_func_stats_t g_select_stats;

select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer,
                         int nfds, fd_set *readfds, fd_set *writefds,
                         fd_set *exceptfds, timeval *timeout, const sigset_t *__sigmask /* = NULL */) :
	io_mux_call(off_fds_buffer, off_modes_buffer, nfds, __sigmask),
	m_nfds(nfds), m_readfds(readfds), m_writefds(writefds),
	m_exceptfds(exceptfds), m_timeout(timeout), m_nfds_with_cq(0), m_b_run_prepare_to_poll(false)
{
	int fd;
	//socket_fd_api* temp_sock_fd_api = NULL; 

	if (m_nfds > FD_SETSIZE) {
		errno = ENOMEM;
		vma_throw_object(io_mux_call::io_error);
	}

	// create stats
	m_p_stats = &g_select_stats;
        vma_stats_instance_get_select_block(m_p_stats);

	bool offloaded_read  = !!m_readfds;
	bool offloaded_write = !!m_writefds;

	if (offloaded_read || offloaded_write) {
		FD_ZERO(&m_os_rfds, m_nfds);
		FD_ZERO(&m_os_wfds, m_nfds);

		//covers the case of select(readfds = NULL)
		if(!m_readfds) {
			FD_ZERO(&m_cq_rfds, m_nfds);
			m_readfds = &m_cq_rfds;
		}

		// get offloaded fds in read set
		for (fd = 0; fd < m_nfds; ++fd) {

			bool check_read = offloaded_read && FD_ISSET(fd, m_readfds);
			bool check_write = offloaded_write && FD_ISSET(fd, m_writefds);

			socket_fd_api* psock = fd_collection_get_sockfd(fd);

			if (psock && psock->get_type() == FD_TYPE_SOCKET) {

				offloaded_mode_t off_mode  = OFF_NONE;
				if (check_read)  off_mode = (offloaded_mode_t)(off_mode | OFF_READ);
				if (check_write) off_mode = (offloaded_mode_t)(off_mode | OFF_WRITE);

				if (off_mode) {
					__log_func("---> fd=%d IS SET for read or write!", fd);

					m_p_all_offloaded_fds[m_num_all_offloaded_fds] = fd;
					m_p_offloaded_modes[m_num_all_offloaded_fds] = off_mode;
					m_num_all_offloaded_fds++;
					if (! psock->skip_os_select()) {
						if (check_read) {
							FD_SET(fd, &m_os_rfds);
							if (psock->is_readable(NULL)) {
								io_mux_call::update_fd_array(&m_fd_ready_array, fd);
								m_n_ready_rfds++;
								m_n_all_ready_fds++;
							} else {
								// Instructing the socket to sample the OS immediately to prevent hitting EAGAIN on recvfrom(),
								// after iomux returned a shadow fd as ready (only for non-blocking sockets)
								psock->set_immediate_os_sample();
							}
						}
						if (check_write) {
							FD_SET(fd, &m_os_wfds);
						}
					}
					else
						__log_func("fd=%d must be skipped from os r select()", fd);

				}
			}
                        else {
                                if (check_read)  {
                                        FD_SET(fd, &m_os_rfds);
                                }
                                if (check_write) {
                                        FD_SET(fd, &m_os_wfds);
                                }
                        }

		}
	}
	__log_func("num all offloaded_fds=%d", m_num_all_offloaded_fds);
}


void select_call::prepare_to_poll()
{
	/* 
	 * Create copies of all sets and zero out the originals.
	 * This is needed because polling might be successful.
	 * 
	 * If the read set is zero, use the local copy every time.
	 * This is OK because it will hold only the CQ, and wait()
	 * clears the CQ from the set after orig_select() call.
	 * 
	 * m_readfds is non-NULL here because there are offloaded sockets.
	 */

	// copy sets, and zero out the originals
	if (m_readfds) {
		FD_COPY(&m_orig_readfds, m_readfds, m_nfds);
		FD_ZERO(m_readfds, m_nfds);
	}

	if (m_writefds) {
		FD_COPY(&m_orig_writefds, m_writefds, m_nfds);
		FD_ZERO(m_writefds, m_nfds);
	}
	if (m_exceptfds) {
		FD_COPY(&m_orig_exceptfds, m_exceptfds, m_nfds);
		FD_ZERO(m_exceptfds, m_nfds);
	}
	m_b_run_prepare_to_poll = true;
}

void select_call::prepare_to_block()
{
	m_cqepfd = g_p_net_device_table_mgr->global_ring_epfd_get();
	m_nfds_with_cq = max(m_cqepfd + 1, m_nfds);
}

bool select_call::wait_os(bool zero_timeout)
{
	timeval to, *pto = NULL;
	timespec to_pselect, *pto_pselect = NULL;
	
/* Avner: I put it in comment, because this logic is wrong

	// optimization: do not call os select if ALL fds are excluded
	// extend check to write/except fds
	if (m_rfd_count == m_n_exclude_fds)
		return;
*/
	
	if (zero_timeout) {
		to.tv_sec = to.tv_usec = 0;
		pto = &to;
	}
	else {
		pto = m_timeout;
	}

	// Restore original sets
	if (m_b_run_prepare_to_poll) {
		if (m_readfds)	FD_COPY(m_readfds, &m_os_rfds, m_nfds);
		if (m_writefds)	FD_COPY(m_writefds, &m_os_wfds, m_nfds);
		if (m_exceptfds)FD_COPY(m_exceptfds, &m_orig_exceptfds, m_nfds);
	}
	__log_func("calling os select: %d", m_nfds);
	if (m_sigmask) {
		if (pto) {
			to_pselect.tv_sec = pto->tv_sec;
			to_pselect.tv_nsec = pto->tv_usec * 1000;
			pto_pselect = &to_pselect;
		}
		m_n_all_ready_fds = orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask);
	} else {
		m_n_all_ready_fds = orig_os_api.select(m_nfds, m_readfds, m_writefds, m_exceptfds, pto);
	}
	if (m_n_all_ready_fds < 0) {
		vma_throw_object(io_mux_call::io_error);
	}
	if (m_n_all_ready_fds > 0) {
		__log_func("wait_os() returned with %d", m_n_all_ready_fds);
	}
	return false; // No cq_fd in select() event
}

bool select_call::wait(const timeval &elapsed)
{
	timeval timeout, *pto = NULL;
	timespec to_pselect, *pto_pselect = NULL;
	
	BULLSEYE_EXCLUDE_BLOCK_START
	if (m_n_all_ready_fds > 0) {
		__log_panic("wait() called when there are ready fd's!!!");
		// YossiE TODO make this and some more checks as debug assertions
		// In all functions
	}
	BULLSEYE_EXCLUDE_BLOCK_END

	// Restore original sets
	if (m_b_run_prepare_to_poll) {
		if (m_readfds)	FD_COPY(m_readfds, &m_os_rfds, m_nfds);
		if (m_writefds)	FD_COPY(m_writefds, &m_os_wfds, m_nfds);
		if (m_exceptfds)FD_COPY(m_exceptfds, &m_orig_exceptfds, m_nfds);
	}

	// Call OS select() on original sets + CQ epfd in read set
	if (m_readfds)
		FD_SET(m_cqepfd, m_readfds);
	if (m_timeout) {
		tv_sub(m_timeout, &elapsed, &timeout);
		if (timeout.tv_sec < 0 || timeout.tv_usec < 0) {
			// Already reached timeout
			return false;
		}
		pto = &timeout;
	}

	__log_func("going to wait on select CQ+OS nfds=%d cqfd=%d pto=%p!!!", m_nfds_with_cq, m_cqepfd, pto);

	// ACTUAL CALL TO SELECT
	if (m_sigmask) {
		if (pto) {
			to_pselect.tv_sec = pto->tv_sec;
			to_pselect.tv_nsec = pto->tv_usec * 1000;
			pto_pselect = &to_pselect;
		}
		m_n_all_ready_fds = orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask);
	} else {
		m_n_all_ready_fds = orig_os_api.select(m_nfds_with_cq, m_readfds, m_writefds, m_exceptfds, pto);
	}
	__log_func("done select CQ+OS nfds=%d cqfd=%d pto=%p ready=%d!!!", m_nfds_with_cq, m_cqepfd, pto, m_n_all_ready_fds);
	if (m_n_all_ready_fds < 0) {
		vma_throw_object(io_mux_call::io_error);
	}

	// Clear CQ from the set and don't count it
	if (m_readfds)
	{
		if (FD_ISSET(m_cqepfd, m_readfds)) {
			FD_CLR(m_cqepfd, m_readfds); // Not needed if m_readfds is NULL
			--m_n_all_ready_fds;
			return true;
		}
	}
	return false;
}

bool select_call::is_timeout(const timeval &elapsed)
{
	return m_timeout && tv_cmp(m_timeout, &elapsed, <=);
}

void select_call::set_offloaded_rfd_ready(int fd_index)
{
	if (m_p_offloaded_modes[fd_index] & OFF_READ) { //TODO: consider removing
		int fd = m_p_all_offloaded_fds[fd_index];
		if (!FD_ISSET(fd, m_readfds)) {
			FD_SET(fd, m_readfds);
			++m_n_ready_rfds;
			++m_n_all_ready_fds;
			__log_func("ready offloaded fd: %d", fd);
		}
	}
}

void select_call::set_rfd_ready(int fd)
{
	// This function also checks that fd was in the original read set
	if (!FD_ISSET(fd, m_readfds) && FD_ISSET(fd, &m_orig_readfds)) {
		FD_SET(fd, m_readfds);
		++m_n_ready_rfds;
//		if (!FD_ISSET(fd, m_writefds))
		++m_n_all_ready_fds;
	}
}

void select_call::set_offloaded_wfd_ready(int fd_index)
{
	if (m_p_offloaded_modes[fd_index] & OFF_WRITE) { //TODO: consider removing
		int fd = m_p_all_offloaded_fds[fd_index];
		if (!FD_ISSET(fd, m_writefds)) {
			FD_SET(fd, m_writefds);
			++m_n_ready_wfds;
			++m_n_all_ready_fds;
			__log_func("ready offloaded w fd: %d", fd);
		}
	}
}

void select_call::set_wfd_ready(int fd)
{
	// This function also checks that fd was in the original read set
	if (!FD_ISSET(fd, m_writefds) && FD_ISSET(fd, &m_orig_writefds)) { //TODO: why do we need the last 'if'??
		FD_SET(fd, m_writefds);
		++m_n_ready_wfds;
//		if (!FD_ISSET(fd, m_readfds))
		++m_n_all_ready_fds;
		__log_func("ready w fd: %d", fd);
	}
}

void select_call::set_efd_ready(int fd, int errors)
{
	/* TODO currently consider errors as ready to write OR read */
	NOT_IN_USE(errors);
	NOT_IN_USE(fd);
}