Blob Blame History Raw
/*
 * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */



#include <string.h>
#include <ifaddrs.h>
#include <sys/epoll.h>
#include <linux/if_infiniband.h>
#include <linux/if_ether.h>
#include <linux/rtnetlink.h>
#include <linux/netlink.h>
#include <linux/if_tun.h>
#include <sys/epoll.h>

#include "utils/bullseye.h"
#include "vma/util/if.h"
#include "vma/dev/net_device_val.h"
#include "vma/util/vtypes.h"
#include "vma/util/utils.h"
#include "vma/util/valgrind.h"
#include "vma/event/event_handler_manager.h"
#include "vma/proto/L2_address.h"
#include "vma/dev/ib_ctx_handler_collection.h"
#include "vma/dev/ring_tap.h"
#include "vma/dev/ring_simple.h"
#include "vma/dev/ring_eth_cb.h"
#include "vma/dev/ring_eth_direct.h"
#include "vma/dev/ring_slave.h"
#include "vma/dev/ring_bond.h"
#include "vma/sock/sock-redirect.h"
#include "vma/dev/net_device_table_mgr.h"
#include "vma/proto/neighbour_table_mgr.h"
#include "ring_profile.h"

#ifdef HAVE_LIBNL3
#include <netlink/route/link/vlan.h>
#endif

#define MODULE_NAME             "ndv"

#define nd_logpanic           __log_panic
#define nd_logerr             __log_err
#define nd_logwarn            __log_warn
#define nd_loginfo            __log_info
#define nd_logdbg             __log_info_dbg
#define nd_logfunc            __log_info_func
#define nd_logfuncall         __log_info_funcall

ring_alloc_logic_attr::ring_alloc_logic_attr():
				m_ring_alloc_logic(RING_LOGIC_PER_INTERFACE),
				m_ring_profile_key(0),
				m_user_id_key(0)
{
	m_mem_desc.iov_base = NULL;
	m_mem_desc.iov_len = 0;
	init();
}

ring_alloc_logic_attr::ring_alloc_logic_attr(ring_logic_t ring_logic):
				m_ring_alloc_logic(ring_logic),
				m_ring_profile_key(0),
				m_user_id_key(0)
{
	m_mem_desc.iov_base = NULL;
	m_mem_desc.iov_len = 0;
	init();
}

ring_alloc_logic_attr::ring_alloc_logic_attr(const ring_alloc_logic_attr &other):
	m_hash(other.m_hash),
	m_ring_alloc_logic(other.m_ring_alloc_logic),
	m_ring_profile_key(other.m_ring_profile_key),
	m_user_id_key(other.m_user_id_key),
	m_mem_desc(other.m_mem_desc)
{
	snprintf(m_str, RING_ALLOC_STR_SIZE, "%s", other.m_str);
}

void ring_alloc_logic_attr::init()
{
	size_t h = 5381;
	int c;
	char buff[RING_ALLOC_STR_SIZE];

	snprintf(m_str, RING_ALLOC_STR_SIZE,
		 "allocation logic %d profile %d key %ld user address %p "
		 "user length %zd", m_ring_alloc_logic, m_ring_profile_key,
		 m_user_id_key, m_mem_desc.iov_base, m_mem_desc.iov_len);
	snprintf(buff, RING_ALLOC_STR_SIZE, "%d%d%ld%p%zd", m_ring_alloc_logic,
		 m_ring_profile_key, m_user_id_key, m_mem_desc.iov_base,
		 m_mem_desc.iov_len);
	const char* chr = buff;
	while ((c = *chr++))
		h = ((h << 5) + h) + c; /* m_hash * 33 + c */
	m_hash = h;
}

void ring_alloc_logic_attr::set_ring_alloc_logic(ring_logic_t logic)
{
	if (m_ring_alloc_logic != logic) {
		m_ring_alloc_logic = logic;
		init();
	}
}

void ring_alloc_logic_attr::set_ring_profile_key(vma_ring_profile_key profile)
{
	if (m_ring_profile_key != profile) {
		m_ring_profile_key = profile;
		init();
	}
}

void ring_alloc_logic_attr::set_memory_descriptor(iovec &mem_desc)
{
	if (m_mem_desc.iov_base != mem_desc.iov_base ||
	    m_mem_desc.iov_len != mem_desc.iov_len) {
		m_mem_desc = mem_desc;
		init();
	}
}

void ring_alloc_logic_attr::set_user_id_key(uint64_t user_id_key)
{
	if (m_user_id_key != user_id_key) {
		m_user_id_key = user_id_key;
		init();
	}
}

net_device_val::net_device_val(struct net_device_val_desc *desc) : m_lock("net_device_val lock")
{
	bool valid = false;
	ib_ctx_handler* ib_ctx;
	struct nlmsghdr *nl_msg = NULL;
	struct ifinfomsg *nl_msgdata = NULL;
	int nl_attrlen;
	struct rtattr *nl_attr;

	m_if_idx = 0;
	m_if_link = 0;
	m_type = 0;
	m_flags = 0;
	m_mtu = 0;
	m_state = INVALID;
	m_p_L2_addr = NULL;
	m_p_br_addr = NULL;
	m_bond = NO_BOND;
	m_if_active = 0;
	m_bond_xmit_hash_policy = XHP_LAYER_2;
	m_bond_fail_over_mac = 0;
	m_transport_type = VMA_TRANSPORT_UNKNOWN;

	if (NULL == desc) {
		nd_logerr("Invalid net_device_val name=%s", "NA");
		m_state = INVALID;
		return;
	}

	nl_msg = desc->nl_msg;
	nl_msgdata = (struct ifinfomsg *)NLMSG_DATA(nl_msg);

	nl_attr = (struct rtattr *)IFLA_RTA(nl_msgdata);
	nl_attrlen = IFLA_PAYLOAD(nl_msg);

	set_type(nl_msgdata->ifi_type);
	set_if_idx(nl_msgdata->ifi_index);
	set_flags(nl_msgdata->ifi_flags);
	while (RTA_OK(nl_attr, nl_attrlen)) {
		char *nl_attrdata = (char *)RTA_DATA(nl_attr);
		size_t nl_attrpayload = RTA_PAYLOAD(nl_attr);

		switch (nl_attr->rta_type) {
		case IFLA_MTU:
			set_mtu(*(int32_t *)nl_attrdata);
			break;
		case IFLA_LINK:
			set_if_link(*(int32_t *)nl_attrdata);
			break;
		case IFLA_IFNAME:
			set_ifname(nl_attrdata);
			break;
		case IFLA_ADDRESS:
			set_l2_if_addr((uint8_t *)nl_attrdata, nl_attrpayload);
			break;
		case IFLA_BROADCAST:
			set_l2_bc_addr((uint8_t *)nl_attrdata, nl_attrpayload);
			break;
		default:
			break;
		}
		nl_attr = RTA_NEXT(nl_attr, nl_attrlen);
	}

	/* Valid interface should have at least one IP address */
	set_ip_array();
	if (m_ip.empty()) {
		return;
	}

	/* Identify device type */
	if ((get_flags() & IFF_MASTER) || check_device_exist(get_ifname_link(), BOND_DEVICE_FILE)) {
		verify_bonding_mode();
	} else if (check_netvsc_device_exist(get_ifname_link())) {
		m_bond = NETVSC;
	} else {
		m_bond = NO_BOND;
	}

	set_str();

	nd_logdbg("Check interface '%s' (index=%d addr=%d.%d.%d.%d flags=%X)",
			get_ifname(), get_if_idx(), NIPQUAD(get_local_addr()), get_flags());

	valid = false;
	ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link());
	switch (m_bond) {
	case NETVSC:
		if (get_type() == ARPHRD_ETHER) {
			char slave_ifname[IFNAMSIZ] = {0};
			unsigned int slave_flags = 0;
			/* valid = true; uncomment it is valid flow to operate w/o SRIOV */
			if (get_netvsc_slave(get_ifname_link(), slave_ifname, slave_flags)) {
				valid = verify_qp_creation(slave_ifname, IBV_QPT_RAW_PACKET);
			}
		}
		break;
	case LAG_8023ad:
	case ACTIVE_BACKUP:
		// this is a bond interface (or a vlan/alias over bond), find the slaves
		valid = verify_bond_ipoib_or_eth_qp_creation();
		break;
	default:
		valid = (bool)(ib_ctx && verify_ipoib_or_eth_qp_creation(get_ifname_link()));
		break;
	}

	if (!valid) {
		nd_logdbg("Skip interface '%s'", get_ifname());
		return;
	}

	if (safe_mce_sys().mtu != 0 && (int)safe_mce_sys().mtu != get_mtu()) {
		nd_logwarn("Mismatch between interface %s MTU=%d and VMA_MTU=%d."
				"Make sure VMA_MTU and all offloaded interfaces MTUs match.",
				get_ifname(), get_mtu(), safe_mce_sys().mtu);
	}

	/* Set interface state after all verifications */
	if (m_flags & IFF_RUNNING) {
		m_state = RUNNING;
	}
	else {
		if (m_flags & IFF_UP) {
			m_state = UP;
		}
		else {
			m_state = DOWN;
		}
	}

	nd_logdbg("Use interface '%s'", get_ifname());
	if (ib_ctx) {
		nd_logdbg("%s ==> %s port %d (%s)",
				get_ifname(),
				ib_ctx->get_ibname(), get_port_from_ifname(get_ifname_link()),
				(ib_ctx->is_active(get_port_from_ifname(get_ifname_link())) ? "Up" : "Down"));
	} else {
		nd_logdbg("%s ==> none",
				get_ifname());
	}
}

net_device_val::~net_device_val()
{
	auto_unlocker lock(m_lock);

	rings_hash_map_t::iterator ring_iter;
	while ((ring_iter = m_h_ring_map.begin()) != m_h_ring_map.end()) {
		delete THE_RING;
		resource_allocation_key *tmp = ring_iter->first;
		m_h_ring_map.erase(ring_iter);
		delete tmp;
	}

	rings_key_redirection_hash_map_t::iterator redirect_iter;
	while ((redirect_iter = m_h_ring_key_redirection_map.begin()) !=
		m_h_ring_key_redirection_map.end()) {
		delete redirect_iter->second.first;
		m_h_ring_key_redirection_map.erase(redirect_iter);
	}
	if (m_p_br_addr) {
		delete m_p_br_addr;
		m_p_br_addr = NULL;
	}

	if (m_p_L2_addr) {
		delete m_p_L2_addr;
		m_p_L2_addr = NULL;
	}

	slave_data_vector_t::iterator slave = m_slaves.begin();
	for (; slave != m_slaves.end(); ++slave) {
		delete *slave;
	}
	m_slaves.clear();

	ip_data_vector_t::iterator ip = m_ip.begin();
	for (; ip != m_ip.end(); ++ip) {
		delete *ip;
	}
	m_ip.clear();
}

void net_device_val::set_ip_array()
{
	int rc = 0;
	int fd = -1;
	struct {
		struct nlmsghdr hdr;
		struct ifaddrmsg addrmsg;
	} nl_req;
	struct nlmsghdr *nl_msg;
	int nl_msglen = 0;
	char nl_res[8096];
	static int _seq = 0;

	/* Set up the netlink socket */
	fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0) {
		nd_logerr("netlink socket() creation");
		return;
	}

	/* Prepare RTM_GETADDR request */
	memset(&nl_req, 0, sizeof(nl_req));
	nl_req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
	nl_req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	nl_req.hdr.nlmsg_type = RTM_GETADDR;
	nl_req.hdr.nlmsg_seq = _seq++;
	nl_req.hdr.nlmsg_pid = getpid();
	nl_req.addrmsg.ifa_family = AF_INET;
	nl_req.addrmsg.ifa_index = m_if_idx;

	/* Send the netlink request */
	rc = orig_os_api.send(fd, &nl_req, nl_req.hdr.nlmsg_len, 0);
	if (rc < 0) {
		nd_logerr("netlink send() operation");
		goto ret;
	}

	do {
		/* Receive the netlink reply */
		rc = orig_os_api.recv(fd, nl_res, sizeof(nl_res), 0);
		if (rc < 0) {
			nd_logerr("netlink recv() operation");
			goto ret;
		}

		nl_msg = (struct nlmsghdr *)nl_res;
		nl_msglen = rc;
		while (NLMSG_OK(nl_msg, (size_t)nl_msglen) && (nl_msg->nlmsg_type != NLMSG_ERROR)) {
			int nl_attrlen;
			struct ifaddrmsg *nl_msgdata;
			struct rtattr *nl_attr;
			ip_data_t* p_val = NULL;

			nl_msgdata = (struct ifaddrmsg *)NLMSG_DATA(nl_msg);

			/* Process just specific if index */
			if ((int)nl_msgdata->ifa_index == m_if_idx) {
				nl_attr = (struct rtattr *)IFA_RTA(nl_msgdata);
				nl_attrlen = IFA_PAYLOAD(nl_msg);

				p_val = new ip_data_t;
				p_val->flags = nl_msgdata->ifa_flags;
				memset(&p_val->netmask, 0, sizeof(in_addr_t));
				p_val->netmask = prefix_to_netmask(nl_msgdata->ifa_prefixlen);
				while (RTA_OK(nl_attr, nl_attrlen)) {
					char *nl_attrdata = (char *)RTA_DATA(nl_attr);

					switch (nl_attr->rta_type) {
					case IFA_ADDRESS:
						memset(&p_val->local_addr, 0, sizeof(in_addr_t));
						memcpy(&p_val->local_addr, (in_addr_t *)nl_attrdata, sizeof(in_addr_t));
						break;
					default:
						break;
					}
					nl_attr = RTA_NEXT(nl_attr, nl_attrlen);
				}

				m_ip.push_back(p_val);
			}

			/* Check if it is the last message */
			if(nl_msg->nlmsg_type == NLMSG_DONE) {
				goto ret;
			}
			nl_msg = NLMSG_NEXT(nl_msg, nl_msglen);
		}
	} while (1);

ret:
	orig_os_api.close(fd);
}

void net_device_val::set_str()
{
	char str_x[BUFF_SIZE] = {0};

	m_str[0] = '\0';

	str_x[0] = '\0';
	sprintf(str_x, "%d:", m_if_idx);
	strcat(m_str, str_x);

	str_x[0] = '\0';
	if (!strcmp(get_ifname(), get_ifname_link())) {
		sprintf(str_x, " %s:", get_ifname());
	} else {
		sprintf(str_x, " %s@%s:", get_ifname(), get_ifname_link());
	}
	strcat(m_str, str_x);

	str_x[0] = '\0';
	sprintf(str_x, " <%s%s%s%s%s%s%s%s%s%s%s>:",
			(m_flags & IFF_UP        ? "UP," : ""),
			(m_flags & IFF_RUNNING   ? "RUNNING," : ""),
			(m_flags & IFF_NOARP     ? "NO_ARP," : ""),
			(m_flags & IFF_LOOPBACK  ? "LOOPBACK," : ""),
			(m_flags & IFF_BROADCAST ? "BROADCAST," : ""),
			(m_flags & IFF_MULTICAST ? "MULTICAST," : ""),
			(m_flags & IFF_MASTER    ? "MASTER," : ""),
			(m_flags & IFF_SLAVE     ? "SLAVE," : ""),
			(m_flags & IFF_LOWER_UP  ? "LOWER_UP," : ""),
			(m_flags & IFF_DEBUG     ? "DEBUG," : ""),
			(m_flags & IFF_PROMISC   ? "PROMISC," : ""));
	strcat(m_str, str_x);

	str_x[0] = '\0';
	sprintf(str_x, " mtu %d", m_mtu);
	strcat(m_str, str_x);

	str_x[0] = '\0';
	switch (m_type) {
	case ARPHRD_LOOPBACK:
		sprintf(str_x, " type %s", "loopback");
		break;
	case ARPHRD_ETHER:
		sprintf(str_x, " type %s", "ether");
		break;
	case ARPHRD_INFINIBAND:
		sprintf(str_x, " type %s", "infiniband");
		break;
	default:
		sprintf(str_x, " type %s", "unknown");
		break;
	}

	str_x[0] = '\0';
	switch (m_bond) {
	case NETVSC:
		sprintf(str_x, " (%s)", "netvsc");
		break;
	case LAG_8023ad:
		sprintf(str_x, " (%s)", "lag 8023ad");
		break;
	case ACTIVE_BACKUP:
		sprintf(str_x, " (%s)", "active backup");
		break;
	default:
		sprintf(str_x, " (%s)", "normal");
		break;
	}
	strcat(m_str, str_x);
}

void net_device_val::print_val()
{
	size_t i = 0;
	rings_hash_map_t::iterator ring_iter;

	set_str();
	nd_logdbg("%s", m_str);

	nd_logdbg("  ip list: %s", (m_ip.empty() ? "empty " : ""));
	for (i = 0; i < m_ip.size(); i++) {
		nd_logdbg("    inet: %d.%d.%d.%d netmask: %d.%d.%d.%d flags: 0x%X",
				NIPQUAD(m_ip[i]->local_addr), NIPQUAD(m_ip[i]->netmask), m_ip[i]->flags);
	}

	nd_logdbg("  slave list: %s", (m_slaves.empty() ? "empty " : ""));
	for (i = 0; i < m_slaves.size(); i++) {
		char if_name[IFNAMSIZ] = {0};

		if_name[0] = '\0';
		if_indextoname(m_slaves[i]->if_index, if_name);
		nd_logdbg("    %d: %s: %s active: %d",
				m_slaves[i]->if_index, if_name, m_slaves[i]->p_L2_addr->to_str().c_str(), m_slaves[i]->active);
	}

	nd_logdbg("  ring list: %s", (m_h_ring_map.empty() ? "empty " : ""));
	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
		ring *cur_ring = ring_iter->second.first;
		NOT_IN_USE(cur_ring); // Suppress --enable-opt-log=high warning
		nd_logdbg("    %d: 0x%X: parent 0x%X ref %d",
				cur_ring->get_if_index(), cur_ring, cur_ring->get_parent(), ring_iter->second.second);
	}
}

void net_device_val::set_slave_array()
{
	char active_slave[IFNAMSIZ] = {0}; // gather the slave data (only for active-backup)-

	nd_logdbg("");

	if (m_bond == NETVSC) {
		slave_data_t* s = NULL;
		unsigned int slave_flags = 0;
		if (get_netvsc_slave(get_ifname_link(), active_slave, slave_flags)) {
			if ((slave_flags & IFF_UP) &&
					verify_qp_creation(active_slave, IBV_QPT_RAW_PACKET)) {
				s = new slave_data_t(if_nametoindex(active_slave));
				m_slaves.push_back(s);
			}
		}
	} else if (m_bond == NO_BOND) {
		slave_data_t* s = new slave_data_t(if_nametoindex(get_ifname()));
		m_slaves.push_back(s);
	} else {
		// bond device

		// get list of all slave devices
		char slaves_list[IFNAMSIZ * MAX_SLAVES] = {0};
		if (get_bond_slaves_name_list(get_ifname_link(), slaves_list, sizeof(slaves_list))) {
			char* slave = strtok(slaves_list, " ");
			while (slave) {
				char* p = strchr(slave, '\n');
				if (p) *p = '\0'; // Remove the tailing 'new line" char

				slave_data_t* s = new slave_data_t(if_nametoindex(slave));
				m_slaves.push_back(s);
				slave = strtok(NULL, " ");
			}
		}

		// find the active slave
		if (get_bond_active_slave_name(get_ifname_link(), active_slave, sizeof(active_slave))) {
			m_if_active = if_nametoindex(active_slave);
			nd_logdbg("found the active slave: %d: '%s'", m_if_active, active_slave);
		}
		else {
			nd_logdbg("failed to find the active slave, Moving to LAG state");
		}
	}

	bool up_and_active_slaves[m_slaves.size()];

	memset(up_and_active_slaves, 0, sizeof(up_and_active_slaves));

	if (m_bond == LAG_8023ad) {
		get_up_and_active_slaves(up_and_active_slaves, m_slaves.size());
	}

	for (uint16_t i = 0; i < m_slaves.size(); i++) {
		char if_name[IFNAMSIZ] = {0};
		char base_ifname[IFNAMSIZ];

		if (!if_indextoname(m_slaves[i]->if_index, if_name)) {
			nd_logerr("Can not find interface name by index=%d", m_slaves[i]->if_index);
			continue;
		}
		get_base_interface_name((const char*)if_name, base_ifname, sizeof(base_ifname));

		// Save L2 address
		m_slaves[i]->p_L2_addr = create_L2_address(if_name);
		m_slaves[i]->active = false;

		if (m_bond == ACTIVE_BACKUP && m_if_active == m_slaves[i]->if_index) {
			m_slaves[i]->active = true;
		}

		if (m_bond == LAG_8023ad) {
			if (up_and_active_slaves[i]) {
				m_slaves[i]->active = true;
			}
		}

		if (m_bond == NETVSC) {
			m_slaves[i]->active = true;
		}

		if (m_bond == NO_BOND) {
			m_slaves[i]->active = true;
		}

		m_slaves[i]->p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(base_ifname);
		m_slaves[i]->port_num = get_port_from_ifname(base_ifname);
		if (m_slaves[i]->port_num < 1) {
			nd_logdbg("Error: port %d ==> ifname=%s base_ifname=%s",
					m_slaves[i]->port_num, if_name, base_ifname);
		}
	}

	if (m_slaves.empty() && NETVSC != m_bond) {
		m_state = INVALID;
		nd_logpanic("No slave found.");
	}
}

const slave_data_t* net_device_val::get_slave(int if_index)
{
	auto_unlocker lock(m_lock);

	slave_data_vector_t::iterator iter;
	for (iter = m_slaves.begin(); iter != m_slaves.end(); iter++) {
		slave_data_t *cur_slave = *iter;
		if (cur_slave->if_index == if_index) {
			return cur_slave;
		}
	}
	return NULL;
}

void net_device_val::verify_bonding_mode()
{
	// this is a bond interface, lets get its mode.
	char bond_mode_file_content[FILENAME_MAX];
	char bond_failover_mac_file_content[FILENAME_MAX];
	char bond_mode_param_file[FILENAME_MAX];
	char bond_failover_mac_param_file[FILENAME_MAX];
	char bond_xmit_hash_policy_file_content[FILENAME_MAX];
	char bond_xmit_hash_policy_param_file[FILENAME_MAX];

	memset(bond_mode_file_content, 0, FILENAME_MAX);
	sprintf(bond_mode_param_file, BONDING_MODE_PARAM_FILE, get_ifname_link());
	sprintf(bond_failover_mac_param_file, BONDING_FAILOVER_MAC_PARAM_FILE, get_ifname_link());

	if (priv_safe_read_file(bond_mode_param_file, bond_mode_file_content, FILENAME_MAX) > 0) {
		char *bond_mode = NULL;
		bond_mode = strtok(bond_mode_file_content, " ");
		if (bond_mode) {
			if (!strcmp(bond_mode, "active-backup")) {
				m_bond = ACTIVE_BACKUP;
			} else if (strstr(bond_mode, "802.3ad")) {
				m_bond = LAG_8023ad;
			}
			if (priv_safe_read_file(bond_failover_mac_param_file, bond_failover_mac_file_content, FILENAME_MAX) > 0) {
				if(strstr(bond_failover_mac_file_content, "0")){
					m_bond_fail_over_mac = 0;
				} else if(strstr(bond_failover_mac_file_content, "1")){
					m_bond_fail_over_mac = 1;
				} else if(strstr(bond_failover_mac_file_content, "2")){
					m_bond_fail_over_mac = 2;
				}
			}
		}
	}

	memset(bond_xmit_hash_policy_file_content, 0, FILENAME_MAX);
	sprintf(bond_xmit_hash_policy_param_file, BONDING_XMIT_HASH_POLICY_PARAM_FILE, get_ifname_link());
	if (priv_safe_try_read_file(bond_xmit_hash_policy_param_file, bond_xmit_hash_policy_file_content, FILENAME_MAX) > 0) {
		char *bond_xhp = NULL;
		char *saveptr = NULL;

		bond_xhp = strtok_r(bond_xmit_hash_policy_file_content, " ", &saveptr);
		if (NULL == bond_xhp) {
			nd_logdbg("could not parse bond xmit hash policy, staying with default (L2)\n");
		} else {
			bond_xhp = strtok_r(NULL, " ", &saveptr);
			if (bond_xhp) {
				m_bond_xmit_hash_policy = (bond_xmit_hash_policy)strtol(bond_xhp, NULL , 10);
				if (m_bond_xmit_hash_policy < XHP_LAYER_2 || m_bond_xmit_hash_policy > XHP_ENCAP_3_4) {
					vlog_printf(VLOG_WARNING,"VMA does not support xmit hash policy = %d\n", m_bond_xmit_hash_policy);
					m_bond_xmit_hash_policy = XHP_LAYER_2;
				}
			}
			nd_logdbg("got bond xmit hash policy = %d\n", m_bond_xmit_hash_policy);
		}
	} else {
		nd_logdbg("could not read bond xmit hash policy, staying with default (L2)\n");
	}

	if (m_bond == NO_BOND || m_bond_fail_over_mac > 1) {
		vlog_printf(VLOG_WARNING,"******************************************************************************\n");
		vlog_printf(VLOG_WARNING,"VMA doesn't support current bonding configuration of %s.\n", get_ifname_link());
		vlog_printf(VLOG_WARNING,"The only supported bonding mode is \"802.3ad 4(#4)\" or \"active-backup(#1)\"\n");
		vlog_printf(VLOG_WARNING,"with \"fail_over_mac=1\" or \"fail_over_mac=0\".\n");
		vlog_printf(VLOG_WARNING,"The effect of working in unsupported bonding mode is undefined.\n");
		vlog_printf(VLOG_WARNING,"Read more about Bonding in the VMA's User Manual\n");
		vlog_printf(VLOG_WARNING,"******************************************************************************\n");
	}
}

/**
 * only for active-backup bond
 */
bool net_device_val::update_active_backup_slaves()
{
	// update the active slave
	// /sys/class/net/bond0/bonding/active_slave
	char active_slave[IFNAMSIZ*MAX_SLAVES] = {0};
	int if_active_slave = 0;

	if (!get_bond_active_slave_name(get_ifname_link(), active_slave, IFNAMSIZ)) {
		nd_logdbg("failed to find the active slave!");
		return 0;
	}

	//nothing changed
	if_active_slave = if_nametoindex(active_slave);
	if (m_if_active == if_active_slave) {
		return 0;
	}

	m_p_L2_addr = create_L2_address(get_ifname());
	bool found_active_slave = false;
	for (size_t i = 0; i < m_slaves.size(); i++) {
		if (if_active_slave == m_slaves[i]->if_index) {
			m_slaves[i]->active = true;
			found_active_slave = true;
			nd_logdbg("Slave changed old=%d new=%d", m_if_active, if_active_slave);
			m_if_active = if_active_slave;
		} else {
			m_slaves[i]->active = false;
		}
	}
	if (!found_active_slave) {
		nd_logdbg("Failed to locate new active slave details");
		return 0;
	}
	// restart rings
	rings_hash_map_t::iterator ring_iter;
	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
		THE_RING->restart();
	}
	return 1;
}

/*
 * this function assume m_slaves[i]->if_name and m_slaves.size() are already set.
 */
bool net_device_val::get_up_and_active_slaves(bool* up_and_active_slaves, size_t size)
{
	bool up_slaves[m_slaves.size()];
	int num_up = 0;
	bool active_slaves[m_slaves.size()];
	int num_up_and_active = 0;
	size_t i = 0;

	if (size != m_slaves.size()) {
		nd_logwarn("programmer error! array size is not correct");
		return false;
	}

	/* get slaves operstate and active state */
	for (i = 0; i < m_slaves.size(); i++) {
		char oper_state[5] = {0};
		char slave_state[10] = {0};
		char if_name[IFNAMSIZ] = {0};

		if (!if_indextoname(m_slaves[i]->if_index, if_name)) {
			nd_logerr("Can not find interface name by index=%d", m_slaves[i]->if_index);
			continue;
		}

		// get interface operstate
		get_interface_oper_state(if_name, oper_state, sizeof(oper_state));
		if (strstr(oper_state, "up")) {
			num_up++;
			up_slaves[i] = true;
		} else {
			up_slaves[i] = false;
		}

		active_slaves[i] = true;
		// get slave state
		if (get_bond_slave_state(if_name, slave_state, sizeof(slave_state))){
			if (!strstr(slave_state, "active"))
				active_slaves[i] = false;
		}

		if (active_slaves[i] && up_slaves[i]) {
			up_and_active_slaves[i] = true;
			num_up_and_active++;
		} else {
			up_and_active_slaves[i] = false;
		}
	}

	/* make sure at least one up interface is active */
	if (!num_up_and_active && num_up) {
		for (i = 0; i < m_slaves.size(); i++) {
			if (up_slaves[i]) {
				up_and_active_slaves[i] = true;
				break;
			}
		}
	}

	return true;
}

bool net_device_val::update_active_slaves()
{
	bool changed = false;
	bool up_and_active_slaves[m_slaves.size()];
	size_t i = 0;

	memset(&up_and_active_slaves, 0, m_slaves.size() * sizeof(bool));
	get_up_and_active_slaves(up_and_active_slaves, m_slaves.size());

	/* compare to current status and prepare for restart */
	for (i = 0; i< m_slaves.size(); i++) {
		if (up_and_active_slaves[i]) {
			//slave came up
			if (!m_slaves[i]->active) {
				nd_logdbg("slave %d is up ", m_slaves[i]->if_index);
				m_slaves[i]->active = true;
				changed = true;
			}
		}
		else {
			//slave went down
			if (m_slaves[i]->active) {
				nd_logdbg("slave %d is down ", m_slaves[i]->if_index);
				m_slaves[i]->active = false;
				changed = true;
			}
		}
	}

	/* restart if status changed */
	if (changed) {
		m_p_L2_addr = create_L2_address(get_ifname());
		// restart rings
		rings_hash_map_t::iterator ring_iter;
		for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
			THE_RING->restart();
		}
		return 1;
	}
	return 0;
}

void net_device_val::update_netvsc_slaves(int if_index, int if_flags)
{
	slave_data_t* s = NULL;
	bool found = false;
	ib_ctx_handler *ib_ctx = NULL, *up_ib_ctx = NULL;
	char if_name[IFNAMSIZ] = {0};

	m_lock.lock();

	if (if_indextoname(if_index, if_name) && (if_flags & IFF_UP) && (if_flags & IFF_RUNNING)) {
		nd_logdbg("slave %d is up", if_index);

		g_p_ib_ctx_handler_collection->update_tbl(if_name);
		if ((up_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(if_name))) {
			s = new slave_data_t(if_index);
			s->active = true;
			s->p_ib_ctx = up_ib_ctx;
			s->p_L2_addr = create_L2_address(if_name);
			s->port_num = get_port_from_ifname(if_name);
			m_slaves.push_back(s);

			up_ib_ctx->set_ctx_time_converter_status(g_p_net_device_table_mgr->get_ctx_time_conversion_mode());
			g_buffer_pool_rx->register_memory(s->p_ib_ctx);
			g_buffer_pool_tx->register_memory(s->p_ib_ctx);
			found = true;
		}
	} else {
		if (!m_slaves.empty()) {
			s = m_slaves.back();
			m_slaves.pop_back();

			nd_logdbg("slave %d is down ", s->if_index);

			ib_ctx = s->p_ib_ctx;
			delete s;
			found = true;
		}
	}

	m_lock.unlock();

	if (!found) {
		nd_logdbg("Unable to detect any changes for interface %d. ignoring", if_index);
		return;
	}

	/* restart if status changed */
	m_p_L2_addr = create_L2_address(get_ifname());
	// restart rings
	rings_hash_map_t::iterator ring_iter;
	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
		THE_RING->restart();
	}

	if (ib_ctx) {
		g_p_ib_ctx_handler_collection->del_ib_ctx(ib_ctx);
	}
}

std::string net_device_val::to_str()
{
	return std::string("Net Device: " + m_name);
}

ring* net_device_val::reserve_ring(resource_allocation_key *key)
{
	nd_logfunc("");
	auto_unlocker lock(m_lock);
	key = ring_key_redirection_reserve(key);
	ring* the_ring = NULL;
	rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(key);

	if (m_h_ring_map.end() == ring_iter) {
		nd_logdbg("Creating new RING for %s", key->to_str());
		// copy key since we keep pointer and socket can die so map will lose pointer
		resource_allocation_key *new_key = new resource_allocation_key(*key);
		the_ring = create_ring(new_key);
		if (!the_ring) {
			return NULL;
		}
		m_h_ring_map[new_key] = std::make_pair(the_ring, 0); // each ring is born with ref_count = 0
		ring_iter = m_h_ring_map.find(new_key);
		epoll_event ev = {0, {0}};
		int num_ring_rx_fds = the_ring->get_num_resources();
		int *ring_rx_fds_array = the_ring->get_rx_channel_fds();
		ev.events = EPOLLIN;
		for (int i = 0; i < num_ring_rx_fds; i++) {
			int cq_ch_fd = ring_rx_fds_array[i];
			ev.data.fd = cq_ch_fd;
			BULLSEYE_EXCLUDE_BLOCK_START
			if (unlikely( orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(),
					EPOLL_CTL_ADD, cq_ch_fd, &ev))) {
				nd_logerr("Failed to add RING notification fd to global_table_mgr_epfd (errno=%d %m)", errno);
			}
			BULLSEYE_EXCLUDE_BLOCK_END
		}

		g_p_net_device_table_mgr->global_ring_wakeup();
	}
	// now we are sure the ring is in the map

	ADD_RING_REF_CNT;
	the_ring = GET_THE_RING(key);

	nd_logdbg("0x%X: if_index %d parent 0x%X ref %d key %s",
			the_ring, the_ring->get_if_index(),
			the_ring->get_parent(), RING_REF_CNT, key->to_str());

	return the_ring;
}

bool net_device_val::release_ring(resource_allocation_key *key)
{
	nd_logfunc("");

	resource_allocation_key *red_key;

	auto_unlocker lock(m_lock);
	red_key = get_ring_key_redirection(key);
	ring* the_ring = NULL;
	rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(red_key);

	if (m_h_ring_map.end() != ring_iter) {
		DEC_RING_REF_CNT;
		the_ring = GET_THE_RING(red_key);

		nd_logdbg("0x%X: if_index %d parent 0x%X ref %d key %s",
				the_ring, the_ring->get_if_index(),
				the_ring->get_parent(), RING_REF_CNT, red_key->to_str());

		if ( TEST_REF_CNT_ZERO ) {
			int num_ring_rx_fds = the_ring->get_num_resources();
			int *ring_rx_fds_array = the_ring->get_rx_channel_fds();
			nd_logdbg("Deleting RING %p for key %s and removing notification fd from global_table_mgr_epfd (epfd=%d)",
					the_ring, red_key->to_str(), g_p_net_device_table_mgr->global_ring_epfd_get());
			for (int i = 0; i < num_ring_rx_fds; i++) {
				int cq_ch_fd = ring_rx_fds_array[i];
				BULLSEYE_EXCLUDE_BLOCK_START
				if (unlikely(orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(),
						EPOLL_CTL_DEL, cq_ch_fd, NULL))) {
					nd_logerr("Failed to delete RING notification fd to global_table_mgr_epfd (errno=%d %m)", errno);
				}
				BULLSEYE_EXCLUDE_BLOCK_END
			}

			ring_key_redirection_release(key);

			delete the_ring;
			delete ring_iter->first;
			m_h_ring_map.erase(ring_iter);
		}
		return true;
	}
	return false;
}

/*
 * this function maps key to new keys that it created
 * the key that it creates is the size of the map
 */
resource_allocation_key* net_device_val::ring_key_redirection_reserve(resource_allocation_key *key)
{
	// if allocation logic is usr idx feature disabled
	if (!safe_mce_sys().ring_limit_per_interface ||
	    key->get_ring_alloc_logic() == RING_LOGIC_PER_USER_ID)
		return key;

	if (m_h_ring_key_redirection_map.find(key) != m_h_ring_key_redirection_map.end()) {
		m_h_ring_key_redirection_map[key].second++;
		nd_logdbg("redirecting key=%s (ref-count:%d) to key=%s", key->to_str(),
			m_h_ring_key_redirection_map[key].second,
			m_h_ring_key_redirection_map[key].first->to_str());
		return m_h_ring_key_redirection_map[key].first;
	}

	int ring_map_size = (int)m_h_ring_map.size();
	if (safe_mce_sys().ring_limit_per_interface > ring_map_size) {
		resource_allocation_key *key2 = new resource_allocation_key(*key);
		// replace key to redirection key
		key2->set_user_id_key(ring_map_size);
		m_h_ring_key_redirection_map[key] = std::make_pair(key2, 1);
		nd_logdbg("redirecting key=%s (ref-count:1) to key=%s",
			  key->to_str(), key2->to_str());
		return key2;
	}

	rings_hash_map_t::iterator ring_iter = m_h_ring_map.begin();
	int min_ref_count = ring_iter->second.second;
	resource_allocation_key *min_key = ring_iter->first;
	while (ring_iter != m_h_ring_map.end()) {
		// redirect only to ring with the same profile
		if (ring_iter->first->get_ring_profile_key() ==
		    key->get_ring_profile_key() &&
		    ring_iter->second.second < min_ref_count) {
			min_ref_count = ring_iter->second.second;
			min_key = ring_iter->first;
		}
		ring_iter++;
	}
	m_h_ring_key_redirection_map[key] = std::make_pair(new resource_allocation_key(*min_key), 1);
	nd_logdbg("redirecting key=%s (ref-count:1) to key=%s",
		  key->to_str(), min_key->to_str());
	return min_key;
}

resource_allocation_key* net_device_val::get_ring_key_redirection(resource_allocation_key *key)
{
	if (!safe_mce_sys().ring_limit_per_interface) return key;

	if (m_h_ring_key_redirection_map.find(key) == m_h_ring_key_redirection_map.end()) {
		nd_logdbg("key = %s is not found in the redirection map",
			  key->to_str());
		return key;
	}

	return m_h_ring_key_redirection_map[key].first;
}

void net_device_val::ring_key_redirection_release(resource_allocation_key *key)
{
	if (safe_mce_sys().ring_limit_per_interface && m_h_ring_key_redirection_map.find(key) != m_h_ring_key_redirection_map.end()
		&& --m_h_ring_key_redirection_map[key].second == 0) {
		// this is allocated in ring_key_redirection_reserve
		nd_logdbg("release redirecting key=%s (ref-count:%d) to key=%s", key->to_str(),
			m_h_ring_key_redirection_map[key].second,
			m_h_ring_key_redirection_map[key].first->to_str());
		delete m_h_ring_key_redirection_map[key].first;
		m_h_ring_key_redirection_map.erase(key);
	}
}

int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array /*=NULL*/)
{
	nd_logfuncall("");
	int ret_total = 0;
	auto_unlocker lock(m_lock);
	rings_hash_map_t::iterator ring_iter;
	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
		int ret = THE_RING->poll_and_process_element_rx(p_poll_sn, pv_fd_ready_array);
		BULLSEYE_EXCLUDE_BLOCK_START
		if (ret < 0 && errno != EAGAIN) {
			nd_logerr("Error in ring->poll_and_process_element() of %p (errno=%d %m)", THE_RING, errno);
			return ret;
		}
		BULLSEYE_EXCLUDE_BLOCK_END
		if (ret > 0)
			nd_logfunc("ring[%p] Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn);
		ret_total += ret;
	}
	return ret_total;
}

int net_device_val::global_ring_request_notification(uint64_t poll_sn)
{
	int ret_total = 0;
	auto_unlocker lock(m_lock);
	rings_hash_map_t::iterator ring_iter;
	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
		int ret = THE_RING->request_notification(CQT_RX, poll_sn);
		if (ret < 0) {
			nd_logerr("Error ring[%p]->request_notification() (errno=%d %m)", THE_RING, errno);
			return ret;
		}
		nd_logfunc("ring[%p] Returned with: %d (sn=%d)", THE_RING, ret, poll_sn);
		ret_total += ret;
	}
	return ret_total;
}

int net_device_val::ring_drain_and_proccess()
{
	nd_logfuncall();
	int ret_total = 0;

	auto_unlocker lock(m_lock);
	rings_hash_map_t::iterator ring_iter;
	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
		int ret = THE_RING->drain_and_proccess();
		if (ret < 0)
			return ret;
		if (ret > 0)
			nd_logfunc("cq[%p] Returned with: %d", THE_RING, ret);
		ret_total += ret;
	}
	return ret_total;
}

void net_device_val::ring_adapt_cq_moderation()
{
	nd_logfuncall();

	auto_unlocker lock(m_lock);
	rings_hash_map_t::iterator ring_iter;
	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
		THE_RING->adapt_cq_moderation();
	}
}

void net_device_val::register_to_ibverbs_events(event_handler_ibverbs *handler) {
	for (size_t i = 0; i < m_slaves.size(); i++) {
		bool found = false;
		for (size_t j = 0; j < i; j++) {
			if (m_slaves[i]->p_ib_ctx == m_slaves[j]->p_ib_ctx) {
				found = true; //two slaves might be on two ports of the same device, register only once
				break;
			}
		}
		if (found)
			continue;
		nd_logfunc("registering slave to ibverbs events slave=%p", m_slaves[i]);
		g_p_event_handler_manager->register_ibverbs_event(m_slaves[i]->p_ib_ctx->get_ibv_context()->async_fd, handler, m_slaves[i]->p_ib_ctx->get_ibv_context(), 0);
	}
}

void net_device_val::unregister_to_ibverbs_events(event_handler_ibverbs *handler) {
	for (size_t i = 0; i < m_slaves.size(); i++) {
		bool found = false;
		for (size_t j = 0; j < i; j++) {
			if (m_slaves[i]->p_ib_ctx == m_slaves[j]->p_ib_ctx) {
				found = true; //two slaves might be on two ports of the same device, unregister only once
				break;
			}
		}
		if (found)
			continue;
		nd_logfunc("unregistering slave to ibverbs events slave=%p", m_slaves[i]);
		g_p_event_handler_manager->unregister_ibverbs_event(m_slaves[i]->p_ib_ctx->get_ibv_context()->async_fd, handler);
	}
}

void net_device_val_eth::configure()
{
	m_p_L2_addr = create_L2_address(get_ifname());

	BULLSEYE_EXCLUDE_BLOCK_START
	if (m_p_L2_addr == NULL) {
		nd_logpanic("m_p_L2_addr allocation error");
	}
	BULLSEYE_EXCLUDE_BLOCK_END

	create_br_address(get_ifname());

	m_vlan = get_vlan_id_from_ifname(get_ifname());
	if (m_vlan) {
		parse_prio_egress_map();
	}
	if (m_vlan && m_bond != NO_BOND && m_bond_fail_over_mac == 1) {
		vlog_printf(VLOG_WARNING, " ******************************************************************\n");
		vlog_printf(VLOG_WARNING, "%s: vlan over bond while fail_over_mac=1 is not offloaded\n", get_ifname());
		vlog_printf(VLOG_WARNING, " ******************************************************************\n");
		m_state = INVALID;
	}
	if(!m_vlan && (get_flags() & IFF_MASTER)) {
		char if_name[IFNAMSIZ] = {0};

		if (!if_indextoname(m_slaves[0]->if_index, if_name)) {
			nd_logerr("Can not find interface name by index=%d", m_slaves[0]->if_index);
		}

		//in case vlan is configured on slave
		m_vlan = get_vlan_id_from_ifname(if_name);
	}
}

int net_device_val::get_priority_by_tc_class(uint32_t tc_class)
{
	tc_class_priority_map::iterator it = m_class_prio_map.find(tc_class);
	if (it == m_class_prio_map.end()) {
		return VMA_DEFAULT_ENGRESS_MAP_PRIO;
	}
	return it->second;
}

void net_device_val_eth::parse_prio_egress_map()
{
#ifdef HAVE_LIBNL3
	int len, ret;
	nl_cache *cache = NULL;
	rtnl_link *link;
	vlan_map *map;

	nl_socket_handle *nl_socket = nl_socket_handle_alloc();
	if (!nl_socket) {
		nd_logdbg("unable to allocate socket socket %m", errno);
		goto out;
	}
	nl_socket_set_local_port(nl_socket, 0);
	ret = nl_connect(nl_socket, NETLINK_ROUTE);
	if (ret < 0) {
		nd_logdbg("unable to connect to libnl socket %d %m", ret, errno);
		goto out;
	}
	ret = rtnl_link_alloc_cache(nl_socket, AF_UNSPEC, &cache);
	if (!cache) {
		nd_logdbg("unable to create libnl cache %d %m", ret, errno);
		goto out;
	}
	link = rtnl_link_get_by_name(cache, get_ifname());
	if (!link) {
		nd_logdbg("unable to get libnl link %d %m", ret, errno);
		goto out;
	}
	map = rtnl_link_vlan_get_egress_map(link, &len);
	if (!map || !len) {
		nd_logdbg("no egress map found %d %p",len, map);
		goto out;
	}
	for (int i = 0; i < len; i++) {
		m_class_prio_map[map[i].vm_from] = map[i].vm_to;
	}
out:
	if (cache) {
		nl_cache_free(cache);
	}
	if (nl_socket) {
		nl_socket_handle_free(nl_socket);
	}
#else
	nd_logdbg("libnl3 not found, cannot read engress map, "
		  "SO_PRIORITY will not work properly");
#endif
}

ring* net_device_val_eth::create_ring(resource_allocation_key *key)
{
	ring* ring = NULL;

	// if this is a ring profile key get the profile from the global map
	if (key->get_ring_profile_key()) {
		if (!g_p_ring_profile) {
			nd_logdbg("could not find ring profile");
			return NULL;
		}
		ring_profile *prof =
			g_p_ring_profile->get_profile(key->get_ring_profile_key());
		if (prof == NULL) {
			nd_logerr("could not find ring profile %d",
				  key->get_ring_profile_key());
			return NULL;
		}
		try {
			switch (prof->get_ring_type()) {
#ifdef HAVE_MP_RQ
			case VMA_RING_CYCLIC_BUFFER:
				ring = new ring_eth_cb(get_if_idx(),
						       &prof->get_desc()->ring_cyclicb,
						       key->get_memory_descriptor());
			break;
#endif
			case VMA_RING_EXTERNAL_MEM:
				ring = new ring_eth_direct(get_if_idx(),
							   &prof->get_desc()->ring_ext);
			break;
			default:
				nd_logdbg("Unknown ring type");
				break;
			}
		} catch (vma_error &error) {
			nd_logdbg("failed creating ring %s", error.message);
		}
	} else {
		try {
			switch (m_bond) {
			case NO_BOND:
				ring = new ring_eth(get_if_idx());
				break;
			case ACTIVE_BACKUP:
			case LAG_8023ad:
				ring = new ring_bond_eth(get_if_idx());
				break;
			case NETVSC:
				ring = new ring_bond_netvsc(get_if_idx());
				break;
			default:
				nd_logdbg("Unknown ring type");
				break;
			}
		} catch (vma_error &error) {
			nd_logdbg("failed creating ring %s", error.message);
		}
	}
	return ring;
}

L2_address* net_device_val_eth::create_L2_address(const char* ifname)
{
	if (m_p_L2_addr) {
		delete m_p_L2_addr;
		m_p_L2_addr = NULL;
	}
	unsigned char hw_addr[ETH_ALEN];
	get_local_ll_addr(ifname, hw_addr, ETH_ALEN, false);
	return new ETH_addr(hw_addr);
}

void net_device_val_eth::create_br_address(const char* ifname)
{
	if(m_p_br_addr) {
		delete m_p_br_addr;
		m_p_br_addr = NULL;
	}
	uint8_t hw_addr[ETH_ALEN];
	get_local_ll_addr(ifname, hw_addr, ETH_ALEN, true);
	m_p_br_addr = new ETH_addr(hw_addr);

	BULLSEYE_EXCLUDE_BLOCK_START
	if(m_p_br_addr == NULL) {
		nd_logpanic("m_p_br_addr allocation error");
	}
	BULLSEYE_EXCLUDE_BLOCK_END
}
std::string net_device_val_eth::to_str()
{
	return std::string("ETH: " + net_device_val::to_str());
}

net_device_val_ib::~net_device_val_ib()
{
	struct in_addr in;
	if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) {
		g_p_neigh_table_mgr->unregister_observer(neigh_key(ip_address(in.s_addr), this), this);
	}
}

void net_device_val_ib::configure()
{
	ib_ctx_handler* p_ib_ctx = NULL;
	struct in_addr in;

	m_p_L2_addr = create_L2_address(get_ifname());

	BULLSEYE_EXCLUDE_BLOCK_START
	if(m_p_L2_addr == NULL) {
		nd_logpanic("m_p_L2_addr allocation error");
	}
	BULLSEYE_EXCLUDE_BLOCK_END

	create_br_address(get_ifname());

	if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) {
		g_p_neigh_table_mgr->unregister_observer(neigh_key(ip_address(in.s_addr), this), this);
	}

	//Register to IB BR neigh
	cache_entry_subject<neigh_key, neigh_val*>* p_ces = NULL;
	if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) {
		g_p_neigh_table_mgr->register_observer(neigh_key(ip_address(in.s_addr), this), this, &p_ces);
	}
	m_br_neigh = dynamic_cast<neigh_ib_broadcast*>(p_ces);

	p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link());
	if (!p_ib_ctx || ibv_query_pkey(p_ib_ctx->get_ibv_context(), get_port_from_ifname(get_ifname_link()), 0, &m_pkey)) {
		nd_logerr("failed querying pkey");
	}
	nd_logdbg("pkey: %d", m_pkey);
}

ring* net_device_val_ib::create_ring(resource_allocation_key *key)
{
	ring* ring = NULL;

	NOT_IN_USE(key);
	try {
		switch (m_bond) {
		case NO_BOND:
			ring = new ring_ib(get_if_idx());
			break;
		case ACTIVE_BACKUP:
		case LAG_8023ad:
			ring = new ring_bond_ib(get_if_idx());
			break;
		default:
			nd_logdbg("Unknown ring type");
			break;
		}
	} catch (vma_error &error) {
		nd_logdbg("failed creating ring %s", error.message);
	}

	return ring;
}

L2_address* net_device_val_ib::create_L2_address(const char* ifname)
{
	if (m_p_L2_addr) {
		delete m_p_L2_addr;
		m_p_L2_addr = NULL;
	}
	unsigned char hw_addr[IPOIB_HW_ADDR_LEN];
	get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, false);
	return new IPoIB_addr(hw_addr);
}

void net_device_val_ib::create_br_address(const char* ifname)
{
	if (m_p_br_addr) {
		delete m_p_br_addr;
		m_p_br_addr = NULL;
	}
	unsigned char hw_addr[IPOIB_HW_ADDR_LEN];
	get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, true);
	m_p_br_addr = new IPoIB_addr(hw_addr);
	BULLSEYE_EXCLUDE_BLOCK_START
	if (m_p_br_addr == NULL) {
		nd_logpanic("m_p_br_addr allocation error");
	}
	BULLSEYE_EXCLUDE_BLOCK_END
}

std::string net_device_val_ib::to_str()
{
	return std::string("IB: " + net_device_val::to_str());
}


bool net_device_val::verify_bond_ipoib_or_eth_qp_creation()
{
	char slaves[IFNAMSIZ * MAX_SLAVES] = {0};
	if (!get_bond_slaves_name_list(get_ifname_link(), slaves, sizeof slaves)) {
		vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded, slave list or bond name could not be found\n", get_ifname());
		vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		return false;
	}
	//go over all slaves and check preconditions
	bool bond_ok = true;
	char* slave_name;
	char* save_ptr;
	slave_name = strtok_r(slaves, " ", &save_ptr);
	while (slave_name != NULL)
	{
		char* p = strchr(slave_name, '\n');
		if (p) *p = '\0'; // Remove the tailing 'new line" char
		if (!verify_ipoib_or_eth_qp_creation(slave_name)) {
			//check all slaves but print only once for bond
			bond_ok =  false;
		}
		slave_name = strtok_r(NULL, " ", &save_ptr);
	}
	if (!bond_ok) {
		vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		vlog_printf(VLOG_WARNING,"* Bond %s will not be offloaded due to problem with its slaves.\n", get_ifname());
		vlog_printf(VLOG_WARNING,"* Check warning messages for more information.\n");
		vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
	} else {
		/*
		 * Print warning message while bond device contains two slaves of the same HCA
		 * while RoCE LAG is enabled for both slaves.
		 */
		sys_image_guid_map_t::iterator guid_iter;
		for (guid_iter = m_sys_image_guid_map.begin(); guid_iter != m_sys_image_guid_map.end(); guid_iter++) {
			char bond_roce_lag_path[256] = {0};
			if (guid_iter->second.size() > 1 &&
					check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), guid_iter->second.front().c_str()) &&
					check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), guid_iter->second.back().c_str())) {
				print_roce_lag_warnings(get_ifname_link(), bond_roce_lag_path, guid_iter->second.front().c_str(), guid_iter->second.back().c_str());
			}
		}
	}
	return bond_ok;
}

//interface name can be slave while ifa struct can describe bond
bool net_device_val::verify_ipoib_or_eth_qp_creation(const char* interface_name)
{
	if (m_type == ARPHRD_INFINIBAND) {
		if (verify_enable_ipoib(interface_name) && verify_qp_creation(interface_name, IBV_QPT_UD)) {
			return true;
		}
	} else {
		if (verify_qp_creation(interface_name, IBV_QPT_RAW_PACKET)) {
			return true;
		}
	}
	return false;
}

bool net_device_val::verify_enable_ipoib(const char* interface_name)
{
	char filename[256] = "\0";
	char ifname[IFNAMSIZ] = "\0";
	NOT_IN_USE(interface_name); // Suppress --enable-opt-log=high warning

	if(!safe_mce_sys().enable_ipoib) {
		nd_logdbg("Blocking offload: IPoIB interfaces ('%s')", interface_name);
		return false;
	}

#ifndef DEFINED_IBV_QP_INIT_SOURCE_QPN
	// Note: mlx4 does not support this capability
	ib_ctx_handler* ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link());
	if (!ib_ctx->is_mlx4()) {
		nd_logwarn("Blocking offload: SOURCE_QPN is not supported for this driver ('%s')", interface_name);
		return false;
	}
#endif

	// Verify IPoIB is in 'datagram mode' for proper VMA with flow steering operation
	if (validate_ipoib_prop(get_ifname(), m_flags, IPOIB_MODE_PARAM_FILE, "datagram", 8, filename, ifname)) {
		vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		vlog_printf(VLOG_WARNING,"* IPoIB mode of interface '%s' is \"connected\" !\n", get_ifname());
		vlog_printf(VLOG_WARNING,"* Please change it to datagram: \"echo datagram > %s\" before loading your application with VMA library\n", filename);
		vlog_printf(VLOG_WARNING,"* VMA doesn't support IPoIB in connected mode.\n");
		vlog_printf(VLOG_WARNING,"* Please refer to VMA Release Notes for more information\n");
		vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		return false;
	}
	else {
		nd_logdbg("verified interface '%s' is running in datagram mode", get_ifname());
	}

	// Verify umcast is disabled for IB flow
	if (validate_ipoib_prop(get_ifname(), m_flags, UMCAST_PARAM_FILE, "0", 1, filename, ifname)) { // Extract UMCAST flag (only for IB transport types)
		vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		vlog_printf(VLOG_WARNING,"* UMCAST flag is Enabled for interface %s !\n", get_ifname());
		vlog_printf(VLOG_WARNING,"* Please disable it: \"echo 0 > %s\" before loading your application with VMA library\n", filename);
		vlog_printf(VLOG_WARNING,"* This option in no longer needed in this version\n");
		vlog_printf(VLOG_WARNING,"* Please refer to Release Notes for more information\n");
		vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		return false;
	}
	else {
		nd_logdbg("verified interface '%s' is running with umcast disabled", get_ifname());
	}

	return true;
}

//ifname should point to a physical device
bool net_device_val::verify_qp_creation(const char* ifname, enum ibv_qp_type qp_type)
{
	bool success = false;
	char bond_roce_lag_path[256] = {0};
	struct ibv_cq* cq = NULL;
	struct ibv_comp_channel *channel = NULL;
	struct ibv_qp* qp = NULL;

	vma_ibv_qp_init_attr qp_init_attr;
	memset(&qp_init_attr, 0, sizeof(qp_init_attr));

	vma_ibv_cq_init_attr attr;
	memset(&attr, 0, sizeof(attr));

	qp_init_attr.cap.max_send_wr = MCE_DEFAULT_TX_NUM_WRE;
	qp_init_attr.cap.max_recv_wr = MCE_DEFAULT_RX_NUM_WRE;
	qp_init_attr.cap.max_inline_data = MCE_DEFAULT_TX_MAX_INLINE;
	qp_init_attr.cap.max_send_sge = MCE_DEFAULT_TX_NUM_SGE;
	qp_init_attr.cap.max_recv_sge = MCE_DEFAULT_RX_NUM_SGE;
	qp_init_attr.sq_sig_all = 0;
	qp_init_attr.qp_type = qp_type;

	//find ib_cxt
	char base_ifname[IFNAMSIZ];
	get_base_interface_name((const char*)(ifname), base_ifname, sizeof(base_ifname));
	int port_num = get_port_from_ifname(base_ifname);
	ib_ctx_handler* p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(base_ifname);

	if (!p_ib_ctx) {
		nd_logdbg("Cant find ib_ctx for interface %s", base_ifname);
		if (qp_type == IBV_QPT_RAW_PACKET && m_bond != NO_BOND) {
			if (check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), ifname)) {
				print_roce_lag_warnings(get_ifname_link(), bond_roce_lag_path);
			} else if ((p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link()))
					&& strstr(p_ib_ctx->get_ibname(), "bond")) {
				print_roce_lag_warnings(get_ifname_link());
			}
		}
		goto release_resources;
	} else if (port_num > p_ib_ctx->get_ibv_device_attr()->phys_port_cnt) {
		nd_logdbg("Invalid port for interface %s", base_ifname);
		if (qp_type == IBV_QPT_RAW_PACKET && m_bond != NO_BOND && p_ib_ctx->is_mlx4()) {
			print_roce_lag_warnings(get_ifname_link());
		}
		goto release_resources;
	}

	// Add to guid map in order to detect roce lag issue
	if (qp_type == IBV_QPT_RAW_PACKET && m_bond != NO_BOND) {
		m_sys_image_guid_map[p_ib_ctx->get_ibv_device_attr()->sys_image_guid].push_back(base_ifname);
	}

	//create qp resources
	channel = ibv_create_comp_channel(p_ib_ctx->get_ibv_context());
	if (!channel) {
		nd_logdbg("channel creation failed for interface %s (errno=%d %m)", ifname, errno);
		goto release_resources;
	}
	VALGRIND_MAKE_MEM_DEFINED(channel, sizeof(ibv_comp_channel));
	cq = vma_ibv_create_cq(p_ib_ctx->get_ibv_context(), safe_mce_sys().tx_num_wr, (void*)this, channel, 0, &attr);
	if (!cq) {
		nd_logdbg("cq creation failed for interface %s (errno=%d %m)", ifname, errno);
		goto release_resources;
	}

	vma_ibv_qp_init_attr_comp_mask(p_ib_ctx->get_ibv_pd(), qp_init_attr);
	qp_init_attr.recv_cq = cq;
	qp_init_attr.send_cq = cq;

	// Set source qpn for non mlx4 IPoIB devices
	if (qp_type == IBV_QPT_UD && !p_ib_ctx->is_mlx4()) {
		unsigned char hw_addr[IPOIB_HW_ADDR_LEN];
		get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, false);
		IPoIB_addr ipoib_addr(hw_addr);
		ibv_source_qpn_set(qp_init_attr, ipoib_addr.get_qpn());
	}

	qp = vma_ibv_create_qp(p_ib_ctx->get_ibv_pd(), &qp_init_attr);
	if (qp) {
		if (qp_type == IBV_QPT_UD && priv_ibv_create_flow_supported(qp, port_num) == -1) {
			nd_logdbg("Create_ibv_flow failed on interface %s (errno=%d %m), Traffic will not be offloaded", ifname, errno);
			goto qp_failure;
		} else {
			success = true;

			if (qp_type == IBV_QPT_RAW_PACKET && !priv_ibv_query_flow_tag_supported(qp, port_num)) {
				p_ib_ctx->set_flow_tag_capability(true);
			}
			nd_logdbg("verified interface %s for flow tag capabilities : %s", ifname, p_ib_ctx->get_flow_tag_capability() ? "enabled" : "disabled");

			if (qp_type == IBV_QPT_RAW_PACKET && p_ib_ctx->is_packet_pacing_supported() && !priv_ibv_query_burst_supported(qp, port_num)) {
				p_ib_ctx->set_burst_capability(true);
			}
			nd_logdbg("verified interface %s for burst capabilities : %s", ifname, p_ib_ctx->get_burst_capability() ? "enabled" : "disabled");
		}
	} else {
		nd_logdbg("QP creation failed on interface %s (errno=%d %m), Traffic will not be offloaded", ifname, errno);
qp_failure:
		int err = errno; //verify_raw_qp_privliges can overwrite errno so keep it before the call
		if (validate_raw_qp_privliges() == 0) {
			// MLNX_OFED raw_qp_privliges file exist with bad value
			vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
			vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname);
			vlog_printf(VLOG_WARNING,"* Working in this mode might causes VMA malfunction over Ethernet/InfiniBand interfaces\n");
			vlog_printf(VLOG_WARNING,"* WARNING: the following steps will restart your network interface!\n");
			vlog_printf(VLOG_WARNING,"* 1. \"echo options ib_uverbs disable_raw_qp_enforcement=1 > /etc/modprobe.d/ib_uverbs.conf\"\n");
			vlog_printf(VLOG_WARNING,"* 2. Restart openibd or rdma service depending on your system configuration\n");
			vlog_printf(VLOG_WARNING,"* Read the RAW_PACKET QP root access enforcement section in the VMA's User Manual for more information\n");
			vlog_printf(VLOG_WARNING,"******************************************************************************************************\n");
		}
		else if (validate_user_has_cap_net_raw_privliges() == 0 || err == EPERM) {
			vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
			vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname);
			vlog_printf(VLOG_WARNING,"* Offloaded resources are restricted to root or user with CAP_NET_RAW privileges\n");
			vlog_printf(VLOG_WARNING,"* Read the CAP_NET_RAW and root access section in the VMA's User Manual for more information\n");
			vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		} else {
			vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
			vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname);
			vlog_printf(VLOG_WARNING,"* VMA was not able to create QP for this device (errno = %d).\n", err);
			vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n");
		}
	}

release_resources:
	if(qp) {
		IF_VERBS_FAILURE(ibv_destroy_qp(qp)) {
			nd_logdbg("qp destroy failed on interface %s (errno=%d %m)", ifname, errno);
			success = false;
		} ENDIF_VERBS_FAILURE;
	}
	if (cq) {
		IF_VERBS_FAILURE(ibv_destroy_cq(cq)) {
			nd_logdbg("cq destroy failed on interface %s (errno=%d %m)", ifname, errno);
			success = false;
		} ENDIF_VERBS_FAILURE;
	}
	if (channel) {
		IF_VERBS_FAILURE(ibv_destroy_comp_channel(channel)) {
			nd_logdbg("channel destroy failed on interface %s (errno=%d %m)", ifname, errno);
			success = false;
		} ENDIF_VERBS_FAILURE;
		VALGRIND_MAKE_MEM_UNDEFINED(channel, sizeof(ibv_comp_channel));
	}
	return success;
}