/* * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "utils/bullseye.h" #include "vma/util/if.h" #include "vma/dev/net_device_val.h" #include "vma/util/vtypes.h" #include "vma/util/utils.h" #include "vma/util/valgrind.h" #include "vma/event/event_handler_manager.h" #include "vma/proto/L2_address.h" #include "vma/dev/ib_ctx_handler_collection.h" #include "vma/dev/ring_tap.h" #include "vma/dev/ring_simple.h" #include "vma/dev/ring_eth_cb.h" #include "vma/dev/ring_eth_direct.h" #include "vma/dev/ring_slave.h" #include "vma/dev/ring_bond.h" #include "vma/sock/sock-redirect.h" #include "vma/dev/net_device_table_mgr.h" #include "vma/proto/neighbour_table_mgr.h" #include "ring_profile.h" #ifdef HAVE_LIBNL3 #include #endif #define MODULE_NAME "ndv" #define nd_logpanic __log_panic #define nd_logerr __log_err #define nd_logwarn __log_warn #define nd_loginfo __log_info #define nd_logdbg __log_info_dbg #define nd_logfunc __log_info_func #define nd_logfuncall __log_info_funcall ring_alloc_logic_attr::ring_alloc_logic_attr(): m_ring_alloc_logic(RING_LOGIC_PER_INTERFACE), m_ring_profile_key(0), m_user_id_key(0) { m_mem_desc.iov_base = NULL; m_mem_desc.iov_len = 0; init(); } ring_alloc_logic_attr::ring_alloc_logic_attr(ring_logic_t ring_logic): m_ring_alloc_logic(ring_logic), m_ring_profile_key(0), m_user_id_key(0) { m_mem_desc.iov_base = NULL; m_mem_desc.iov_len = 0; init(); } ring_alloc_logic_attr::ring_alloc_logic_attr(const ring_alloc_logic_attr &other): m_hash(other.m_hash), m_ring_alloc_logic(other.m_ring_alloc_logic), m_ring_profile_key(other.m_ring_profile_key), m_user_id_key(other.m_user_id_key), m_mem_desc(other.m_mem_desc) { snprintf(m_str, RING_ALLOC_STR_SIZE, "%s", other.m_str); } void ring_alloc_logic_attr::init() { size_t h = 5381; int c; char buff[RING_ALLOC_STR_SIZE]; snprintf(m_str, RING_ALLOC_STR_SIZE, "allocation logic %d profile %d key %ld user address %p " "user length %zd", m_ring_alloc_logic, m_ring_profile_key, m_user_id_key, m_mem_desc.iov_base, m_mem_desc.iov_len); snprintf(buff, RING_ALLOC_STR_SIZE, "%d%d%ld%p%zd", m_ring_alloc_logic, m_ring_profile_key, m_user_id_key, m_mem_desc.iov_base, m_mem_desc.iov_len); const char* chr = buff; while ((c = *chr++)) h = ((h << 5) + h) + c; /* m_hash * 33 + c */ m_hash = h; } void ring_alloc_logic_attr::set_ring_alloc_logic(ring_logic_t logic) { if (m_ring_alloc_logic != logic) { m_ring_alloc_logic = logic; init(); } } void ring_alloc_logic_attr::set_ring_profile_key(vma_ring_profile_key profile) { if (m_ring_profile_key != profile) { m_ring_profile_key = profile; init(); } } void ring_alloc_logic_attr::set_memory_descriptor(iovec &mem_desc) { if (m_mem_desc.iov_base != mem_desc.iov_base || m_mem_desc.iov_len != mem_desc.iov_len) { m_mem_desc = mem_desc; init(); } } void ring_alloc_logic_attr::set_user_id_key(uint64_t user_id_key) { if (m_user_id_key != user_id_key) { m_user_id_key = user_id_key; init(); } } net_device_val::net_device_val(struct net_device_val_desc *desc) : m_lock("net_device_val lock") { bool valid = false; ib_ctx_handler* ib_ctx; struct nlmsghdr *nl_msg = NULL; struct ifinfomsg *nl_msgdata = NULL; int nl_attrlen; struct rtattr *nl_attr; m_if_idx = 0; m_if_link = 0; m_type = 0; m_flags = 0; m_mtu = 0; m_state = INVALID; m_p_L2_addr = NULL; m_p_br_addr = NULL; m_bond = NO_BOND; m_if_active = 0; m_bond_xmit_hash_policy = XHP_LAYER_2; m_bond_fail_over_mac = 0; m_transport_type = VMA_TRANSPORT_UNKNOWN; if (NULL == desc) { nd_logerr("Invalid net_device_val name=%s", "NA"); m_state = INVALID; return; } nl_msg = desc->nl_msg; nl_msgdata = (struct ifinfomsg *)NLMSG_DATA(nl_msg); nl_attr = (struct rtattr *)IFLA_RTA(nl_msgdata); nl_attrlen = IFLA_PAYLOAD(nl_msg); set_type(nl_msgdata->ifi_type); set_if_idx(nl_msgdata->ifi_index); set_flags(nl_msgdata->ifi_flags); while (RTA_OK(nl_attr, nl_attrlen)) { char *nl_attrdata = (char *)RTA_DATA(nl_attr); size_t nl_attrpayload = RTA_PAYLOAD(nl_attr); switch (nl_attr->rta_type) { case IFLA_MTU: set_mtu(*(int32_t *)nl_attrdata); break; case IFLA_LINK: set_if_link(*(int32_t *)nl_attrdata); break; case IFLA_IFNAME: set_ifname(nl_attrdata); break; case IFLA_ADDRESS: set_l2_if_addr((uint8_t *)nl_attrdata, nl_attrpayload); break; case IFLA_BROADCAST: set_l2_bc_addr((uint8_t *)nl_attrdata, nl_attrpayload); break; default: break; } nl_attr = RTA_NEXT(nl_attr, nl_attrlen); } /* Valid interface should have at least one IP address */ set_ip_array(); if (m_ip.empty()) { return; } /* Identify device type */ if ((get_flags() & IFF_MASTER) || check_device_exist(get_ifname_link(), BOND_DEVICE_FILE)) { verify_bonding_mode(); } else if (check_netvsc_device_exist(get_ifname_link())) { m_bond = NETVSC; } else { m_bond = NO_BOND; } set_str(); nd_logdbg("Check interface '%s' (index=%d addr=%d.%d.%d.%d flags=%X)", get_ifname(), get_if_idx(), NIPQUAD(get_local_addr()), get_flags()); valid = false; ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link()); switch (m_bond) { case NETVSC: if (get_type() == ARPHRD_ETHER) { char slave_ifname[IFNAMSIZ] = {0}; unsigned int slave_flags = 0; /* valid = true; uncomment it is valid flow to operate w/o SRIOV */ if (get_netvsc_slave(get_ifname_link(), slave_ifname, slave_flags)) { valid = verify_qp_creation(slave_ifname, IBV_QPT_RAW_PACKET); } } break; case LAG_8023ad: case ACTIVE_BACKUP: // this is a bond interface (or a vlan/alias over bond), find the slaves valid = verify_bond_ipoib_or_eth_qp_creation(); break; default: valid = (bool)(ib_ctx && verify_ipoib_or_eth_qp_creation(get_ifname_link())); break; } if (!valid) { nd_logdbg("Skip interface '%s'", get_ifname()); return; } if (safe_mce_sys().mtu != 0 && (int)safe_mce_sys().mtu != get_mtu()) { nd_logwarn("Mismatch between interface %s MTU=%d and VMA_MTU=%d." "Make sure VMA_MTU and all offloaded interfaces MTUs match.", get_ifname(), get_mtu(), safe_mce_sys().mtu); } /* Set interface state after all verifications */ if (m_flags & IFF_RUNNING) { m_state = RUNNING; } else { if (m_flags & IFF_UP) { m_state = UP; } else { m_state = DOWN; } } nd_logdbg("Use interface '%s'", get_ifname()); if (ib_ctx) { nd_logdbg("%s ==> %s port %d (%s)", get_ifname(), ib_ctx->get_ibname(), get_port_from_ifname(get_ifname_link()), (ib_ctx->is_active(get_port_from_ifname(get_ifname_link())) ? "Up" : "Down")); } else { nd_logdbg("%s ==> none", get_ifname()); } } net_device_val::~net_device_val() { auto_unlocker lock(m_lock); rings_hash_map_t::iterator ring_iter; while ((ring_iter = m_h_ring_map.begin()) != m_h_ring_map.end()) { delete THE_RING; resource_allocation_key *tmp = ring_iter->first; m_h_ring_map.erase(ring_iter); delete tmp; } rings_key_redirection_hash_map_t::iterator redirect_iter; while ((redirect_iter = m_h_ring_key_redirection_map.begin()) != m_h_ring_key_redirection_map.end()) { delete redirect_iter->second.first; m_h_ring_key_redirection_map.erase(redirect_iter); } if (m_p_br_addr) { delete m_p_br_addr; m_p_br_addr = NULL; } if (m_p_L2_addr) { delete m_p_L2_addr; m_p_L2_addr = NULL; } slave_data_vector_t::iterator slave = m_slaves.begin(); for (; slave != m_slaves.end(); ++slave) { delete *slave; } m_slaves.clear(); ip_data_vector_t::iterator ip = m_ip.begin(); for (; ip != m_ip.end(); ++ip) { delete *ip; } m_ip.clear(); } void net_device_val::set_ip_array() { int rc = 0; int fd = -1; struct { struct nlmsghdr hdr; struct ifaddrmsg addrmsg; } nl_req; struct nlmsghdr *nl_msg; int nl_msglen = 0; char nl_res[8096]; static int _seq = 0; /* Set up the netlink socket */ fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (fd < 0) { nd_logerr("netlink socket() creation"); return; } /* Prepare RTM_GETADDR request */ memset(&nl_req, 0, sizeof(nl_req)); nl_req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)); nl_req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; nl_req.hdr.nlmsg_type = RTM_GETADDR; nl_req.hdr.nlmsg_seq = _seq++; nl_req.hdr.nlmsg_pid = getpid(); nl_req.addrmsg.ifa_family = AF_INET; nl_req.addrmsg.ifa_index = m_if_idx; /* Send the netlink request */ rc = orig_os_api.send(fd, &nl_req, nl_req.hdr.nlmsg_len, 0); if (rc < 0) { nd_logerr("netlink send() operation"); goto ret; } do { /* Receive the netlink reply */ rc = orig_os_api.recv(fd, nl_res, sizeof(nl_res), 0); if (rc < 0) { nd_logerr("netlink recv() operation"); goto ret; } nl_msg = (struct nlmsghdr *)nl_res; nl_msglen = rc; while (NLMSG_OK(nl_msg, (size_t)nl_msglen) && (nl_msg->nlmsg_type != NLMSG_ERROR)) { int nl_attrlen; struct ifaddrmsg *nl_msgdata; struct rtattr *nl_attr; ip_data_t* p_val = NULL; nl_msgdata = (struct ifaddrmsg *)NLMSG_DATA(nl_msg); /* Process just specific if index */ if ((int)nl_msgdata->ifa_index == m_if_idx) { nl_attr = (struct rtattr *)IFA_RTA(nl_msgdata); nl_attrlen = IFA_PAYLOAD(nl_msg); p_val = new ip_data_t; p_val->flags = nl_msgdata->ifa_flags; memset(&p_val->netmask, 0, sizeof(in_addr_t)); p_val->netmask = prefix_to_netmask(nl_msgdata->ifa_prefixlen); while (RTA_OK(nl_attr, nl_attrlen)) { char *nl_attrdata = (char *)RTA_DATA(nl_attr); switch (nl_attr->rta_type) { case IFA_ADDRESS: memset(&p_val->local_addr, 0, sizeof(in_addr_t)); memcpy(&p_val->local_addr, (in_addr_t *)nl_attrdata, sizeof(in_addr_t)); break; default: break; } nl_attr = RTA_NEXT(nl_attr, nl_attrlen); } m_ip.push_back(p_val); } /* Check if it is the last message */ if(nl_msg->nlmsg_type == NLMSG_DONE) { goto ret; } nl_msg = NLMSG_NEXT(nl_msg, nl_msglen); } } while (1); ret: orig_os_api.close(fd); } void net_device_val::set_str() { char str_x[BUFF_SIZE] = {0}; m_str[0] = '\0'; str_x[0] = '\0'; sprintf(str_x, "%d:", m_if_idx); strcat(m_str, str_x); str_x[0] = '\0'; if (!strcmp(get_ifname(), get_ifname_link())) { sprintf(str_x, " %s:", get_ifname()); } else { sprintf(str_x, " %s@%s:", get_ifname(), get_ifname_link()); } strcat(m_str, str_x); str_x[0] = '\0'; sprintf(str_x, " <%s%s%s%s%s%s%s%s%s%s%s>:", (m_flags & IFF_UP ? "UP," : ""), (m_flags & IFF_RUNNING ? "RUNNING," : ""), (m_flags & IFF_NOARP ? "NO_ARP," : ""), (m_flags & IFF_LOOPBACK ? "LOOPBACK," : ""), (m_flags & IFF_BROADCAST ? "BROADCAST," : ""), (m_flags & IFF_MULTICAST ? "MULTICAST," : ""), (m_flags & IFF_MASTER ? "MASTER," : ""), (m_flags & IFF_SLAVE ? "SLAVE," : ""), (m_flags & IFF_LOWER_UP ? "LOWER_UP," : ""), (m_flags & IFF_DEBUG ? "DEBUG," : ""), (m_flags & IFF_PROMISC ? "PROMISC," : "")); strcat(m_str, str_x); str_x[0] = '\0'; sprintf(str_x, " mtu %d", m_mtu); strcat(m_str, str_x); str_x[0] = '\0'; switch (m_type) { case ARPHRD_LOOPBACK: sprintf(str_x, " type %s", "loopback"); break; case ARPHRD_ETHER: sprintf(str_x, " type %s", "ether"); break; case ARPHRD_INFINIBAND: sprintf(str_x, " type %s", "infiniband"); break; default: sprintf(str_x, " type %s", "unknown"); break; } str_x[0] = '\0'; switch (m_bond) { case NETVSC: sprintf(str_x, " (%s)", "netvsc"); break; case LAG_8023ad: sprintf(str_x, " (%s)", "lag 8023ad"); break; case ACTIVE_BACKUP: sprintf(str_x, " (%s)", "active backup"); break; default: sprintf(str_x, " (%s)", "normal"); break; } strcat(m_str, str_x); } void net_device_val::print_val() { size_t i = 0; rings_hash_map_t::iterator ring_iter; set_str(); nd_logdbg("%s", m_str); nd_logdbg(" ip list: %s", (m_ip.empty() ? "empty " : "")); for (i = 0; i < m_ip.size(); i++) { nd_logdbg(" inet: %d.%d.%d.%d netmask: %d.%d.%d.%d flags: 0x%X", NIPQUAD(m_ip[i]->local_addr), NIPQUAD(m_ip[i]->netmask), m_ip[i]->flags); } nd_logdbg(" slave list: %s", (m_slaves.empty() ? "empty " : "")); for (i = 0; i < m_slaves.size(); i++) { char if_name[IFNAMSIZ] = {0}; if_name[0] = '\0'; if_indextoname(m_slaves[i]->if_index, if_name); nd_logdbg(" %d: %s: %s active: %d", m_slaves[i]->if_index, if_name, m_slaves[i]->p_L2_addr->to_str().c_str(), m_slaves[i]->active); } nd_logdbg(" ring list: %s", (m_h_ring_map.empty() ? "empty " : "")); for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { ring *cur_ring = ring_iter->second.first; NOT_IN_USE(cur_ring); // Suppress --enable-opt-log=high warning nd_logdbg(" %d: 0x%X: parent 0x%X ref %d", cur_ring->get_if_index(), cur_ring, cur_ring->get_parent(), ring_iter->second.second); } } void net_device_val::set_slave_array() { char active_slave[IFNAMSIZ] = {0}; // gather the slave data (only for active-backup)- nd_logdbg(""); if (m_bond == NETVSC) { slave_data_t* s = NULL; unsigned int slave_flags = 0; if (get_netvsc_slave(get_ifname_link(), active_slave, slave_flags)) { if ((slave_flags & IFF_UP) && verify_qp_creation(active_slave, IBV_QPT_RAW_PACKET)) { s = new slave_data_t(if_nametoindex(active_slave)); m_slaves.push_back(s); } } } else if (m_bond == NO_BOND) { slave_data_t* s = new slave_data_t(if_nametoindex(get_ifname())); m_slaves.push_back(s); } else { // bond device // get list of all slave devices char slaves_list[IFNAMSIZ * MAX_SLAVES] = {0}; if (get_bond_slaves_name_list(get_ifname_link(), slaves_list, sizeof(slaves_list))) { char* slave = strtok(slaves_list, " "); while (slave) { char* p = strchr(slave, '\n'); if (p) *p = '\0'; // Remove the tailing 'new line" char slave_data_t* s = new slave_data_t(if_nametoindex(slave)); m_slaves.push_back(s); slave = strtok(NULL, " "); } } // find the active slave if (get_bond_active_slave_name(get_ifname_link(), active_slave, sizeof(active_slave))) { m_if_active = if_nametoindex(active_slave); nd_logdbg("found the active slave: %d: '%s'", m_if_active, active_slave); } else { nd_logdbg("failed to find the active slave, Moving to LAG state"); } } bool up_and_active_slaves[m_slaves.size()]; memset(up_and_active_slaves, 0, sizeof(up_and_active_slaves)); if (m_bond == LAG_8023ad) { get_up_and_active_slaves(up_and_active_slaves, m_slaves.size()); } for (uint16_t i = 0; i < m_slaves.size(); i++) { char if_name[IFNAMSIZ] = {0}; char base_ifname[IFNAMSIZ]; if (!if_indextoname(m_slaves[i]->if_index, if_name)) { nd_logerr("Can not find interface name by index=%d", m_slaves[i]->if_index); continue; } get_base_interface_name((const char*)if_name, base_ifname, sizeof(base_ifname)); // Save L2 address m_slaves[i]->p_L2_addr = create_L2_address(if_name); m_slaves[i]->active = false; if (m_bond == ACTIVE_BACKUP && m_if_active == m_slaves[i]->if_index) { m_slaves[i]->active = true; } if (m_bond == LAG_8023ad) { if (up_and_active_slaves[i]) { m_slaves[i]->active = true; } } if (m_bond == NETVSC) { m_slaves[i]->active = true; } if (m_bond == NO_BOND) { m_slaves[i]->active = true; } m_slaves[i]->p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(base_ifname); m_slaves[i]->port_num = get_port_from_ifname(base_ifname); if (m_slaves[i]->port_num < 1) { nd_logdbg("Error: port %d ==> ifname=%s base_ifname=%s", m_slaves[i]->port_num, if_name, base_ifname); } } if (m_slaves.empty() && NETVSC != m_bond) { m_state = INVALID; nd_logpanic("No slave found."); } } const slave_data_t* net_device_val::get_slave(int if_index) { auto_unlocker lock(m_lock); slave_data_vector_t::iterator iter; for (iter = m_slaves.begin(); iter != m_slaves.end(); iter++) { slave_data_t *cur_slave = *iter; if (cur_slave->if_index == if_index) { return cur_slave; } } return NULL; } void net_device_val::verify_bonding_mode() { // this is a bond interface, lets get its mode. char bond_mode_file_content[FILENAME_MAX]; char bond_failover_mac_file_content[FILENAME_MAX]; char bond_mode_param_file[FILENAME_MAX]; char bond_failover_mac_param_file[FILENAME_MAX]; char bond_xmit_hash_policy_file_content[FILENAME_MAX]; char bond_xmit_hash_policy_param_file[FILENAME_MAX]; memset(bond_mode_file_content, 0, FILENAME_MAX); sprintf(bond_mode_param_file, BONDING_MODE_PARAM_FILE, get_ifname_link()); sprintf(bond_failover_mac_param_file, BONDING_FAILOVER_MAC_PARAM_FILE, get_ifname_link()); if (priv_safe_read_file(bond_mode_param_file, bond_mode_file_content, FILENAME_MAX) > 0) { char *bond_mode = NULL; bond_mode = strtok(bond_mode_file_content, " "); if (bond_mode) { if (!strcmp(bond_mode, "active-backup")) { m_bond = ACTIVE_BACKUP; } else if (strstr(bond_mode, "802.3ad")) { m_bond = LAG_8023ad; } if (priv_safe_read_file(bond_failover_mac_param_file, bond_failover_mac_file_content, FILENAME_MAX) > 0) { if(strstr(bond_failover_mac_file_content, "0")){ m_bond_fail_over_mac = 0; } else if(strstr(bond_failover_mac_file_content, "1")){ m_bond_fail_over_mac = 1; } else if(strstr(bond_failover_mac_file_content, "2")){ m_bond_fail_over_mac = 2; } } } } memset(bond_xmit_hash_policy_file_content, 0, FILENAME_MAX); sprintf(bond_xmit_hash_policy_param_file, BONDING_XMIT_HASH_POLICY_PARAM_FILE, get_ifname_link()); if (priv_safe_try_read_file(bond_xmit_hash_policy_param_file, bond_xmit_hash_policy_file_content, FILENAME_MAX) > 0) { char *bond_xhp = NULL; char *saveptr = NULL; bond_xhp = strtok_r(bond_xmit_hash_policy_file_content, " ", &saveptr); if (NULL == bond_xhp) { nd_logdbg("could not parse bond xmit hash policy, staying with default (L2)\n"); } else { bond_xhp = strtok_r(NULL, " ", &saveptr); if (bond_xhp) { m_bond_xmit_hash_policy = (bond_xmit_hash_policy)strtol(bond_xhp, NULL , 10); if (m_bond_xmit_hash_policy < XHP_LAYER_2 || m_bond_xmit_hash_policy > XHP_ENCAP_3_4) { vlog_printf(VLOG_WARNING,"VMA does not support xmit hash policy = %d\n", m_bond_xmit_hash_policy); m_bond_xmit_hash_policy = XHP_LAYER_2; } } nd_logdbg("got bond xmit hash policy = %d\n", m_bond_xmit_hash_policy); } } else { nd_logdbg("could not read bond xmit hash policy, staying with default (L2)\n"); } if (m_bond == NO_BOND || m_bond_fail_over_mac > 1) { vlog_printf(VLOG_WARNING,"******************************************************************************\n"); vlog_printf(VLOG_WARNING,"VMA doesn't support current bonding configuration of %s.\n", get_ifname_link()); vlog_printf(VLOG_WARNING,"The only supported bonding mode is \"802.3ad 4(#4)\" or \"active-backup(#1)\"\n"); vlog_printf(VLOG_WARNING,"with \"fail_over_mac=1\" or \"fail_over_mac=0\".\n"); vlog_printf(VLOG_WARNING,"The effect of working in unsupported bonding mode is undefined.\n"); vlog_printf(VLOG_WARNING,"Read more about Bonding in the VMA's User Manual\n"); vlog_printf(VLOG_WARNING,"******************************************************************************\n"); } } /** * only for active-backup bond */ bool net_device_val::update_active_backup_slaves() { // update the active slave // /sys/class/net/bond0/bonding/active_slave char active_slave[IFNAMSIZ*MAX_SLAVES] = {0}; int if_active_slave = 0; if (!get_bond_active_slave_name(get_ifname_link(), active_slave, IFNAMSIZ)) { nd_logdbg("failed to find the active slave!"); return 0; } //nothing changed if_active_slave = if_nametoindex(active_slave); if (m_if_active == if_active_slave) { return 0; } m_p_L2_addr = create_L2_address(get_ifname()); bool found_active_slave = false; for (size_t i = 0; i < m_slaves.size(); i++) { if (if_active_slave == m_slaves[i]->if_index) { m_slaves[i]->active = true; found_active_slave = true; nd_logdbg("Slave changed old=%d new=%d", m_if_active, if_active_slave); m_if_active = if_active_slave; } else { m_slaves[i]->active = false; } } if (!found_active_slave) { nd_logdbg("Failed to locate new active slave details"); return 0; } // restart rings rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { THE_RING->restart(); } return 1; } /* * this function assume m_slaves[i]->if_name and m_slaves.size() are already set. */ bool net_device_val::get_up_and_active_slaves(bool* up_and_active_slaves, size_t size) { bool up_slaves[m_slaves.size()]; int num_up = 0; bool active_slaves[m_slaves.size()]; int num_up_and_active = 0; size_t i = 0; if (size != m_slaves.size()) { nd_logwarn("programmer error! array size is not correct"); return false; } /* get slaves operstate and active state */ for (i = 0; i < m_slaves.size(); i++) { char oper_state[5] = {0}; char slave_state[10] = {0}; char if_name[IFNAMSIZ] = {0}; if (!if_indextoname(m_slaves[i]->if_index, if_name)) { nd_logerr("Can not find interface name by index=%d", m_slaves[i]->if_index); continue; } // get interface operstate get_interface_oper_state(if_name, oper_state, sizeof(oper_state)); if (strstr(oper_state, "up")) { num_up++; up_slaves[i] = true; } else { up_slaves[i] = false; } active_slaves[i] = true; // get slave state if (get_bond_slave_state(if_name, slave_state, sizeof(slave_state))){ if (!strstr(slave_state, "active")) active_slaves[i] = false; } if (active_slaves[i] && up_slaves[i]) { up_and_active_slaves[i] = true; num_up_and_active++; } else { up_and_active_slaves[i] = false; } } /* make sure at least one up interface is active */ if (!num_up_and_active && num_up) { for (i = 0; i < m_slaves.size(); i++) { if (up_slaves[i]) { up_and_active_slaves[i] = true; break; } } } return true; } bool net_device_val::update_active_slaves() { bool changed = false; bool up_and_active_slaves[m_slaves.size()]; size_t i = 0; memset(&up_and_active_slaves, 0, m_slaves.size() * sizeof(bool)); get_up_and_active_slaves(up_and_active_slaves, m_slaves.size()); /* compare to current status and prepare for restart */ for (i = 0; i< m_slaves.size(); i++) { if (up_and_active_slaves[i]) { //slave came up if (!m_slaves[i]->active) { nd_logdbg("slave %d is up ", m_slaves[i]->if_index); m_slaves[i]->active = true; changed = true; } } else { //slave went down if (m_slaves[i]->active) { nd_logdbg("slave %d is down ", m_slaves[i]->if_index); m_slaves[i]->active = false; changed = true; } } } /* restart if status changed */ if (changed) { m_p_L2_addr = create_L2_address(get_ifname()); // restart rings rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { THE_RING->restart(); } return 1; } return 0; } void net_device_val::update_netvsc_slaves(int if_index, int if_flags) { slave_data_t* s = NULL; bool found = false; ib_ctx_handler *ib_ctx = NULL, *up_ib_ctx = NULL; char if_name[IFNAMSIZ] = {0}; m_lock.lock(); if (if_indextoname(if_index, if_name) && (if_flags & IFF_UP) && (if_flags & IFF_RUNNING)) { nd_logdbg("slave %d is up", if_index); g_p_ib_ctx_handler_collection->update_tbl(if_name); if ((up_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(if_name))) { s = new slave_data_t(if_index); s->active = true; s->p_ib_ctx = up_ib_ctx; s->p_L2_addr = create_L2_address(if_name); s->port_num = get_port_from_ifname(if_name); m_slaves.push_back(s); up_ib_ctx->set_ctx_time_converter_status(g_p_net_device_table_mgr->get_ctx_time_conversion_mode()); g_buffer_pool_rx->register_memory(s->p_ib_ctx); g_buffer_pool_tx->register_memory(s->p_ib_ctx); found = true; } } else { if (!m_slaves.empty()) { s = m_slaves.back(); m_slaves.pop_back(); nd_logdbg("slave %d is down ", s->if_index); ib_ctx = s->p_ib_ctx; delete s; found = true; } } m_lock.unlock(); if (!found) { nd_logdbg("Unable to detect any changes for interface %d. ignoring", if_index); return; } /* restart if status changed */ m_p_L2_addr = create_L2_address(get_ifname()); // restart rings rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { THE_RING->restart(); } if (ib_ctx) { g_p_ib_ctx_handler_collection->del_ib_ctx(ib_ctx); } } std::string net_device_val::to_str() { return std::string("Net Device: " + m_name); } ring* net_device_val::reserve_ring(resource_allocation_key *key) { nd_logfunc(""); auto_unlocker lock(m_lock); key = ring_key_redirection_reserve(key); ring* the_ring = NULL; rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(key); if (m_h_ring_map.end() == ring_iter) { nd_logdbg("Creating new RING for %s", key->to_str()); // copy key since we keep pointer and socket can die so map will lose pointer resource_allocation_key *new_key = new resource_allocation_key(*key); the_ring = create_ring(new_key); if (!the_ring) { return NULL; } m_h_ring_map[new_key] = std::make_pair(the_ring, 0); // each ring is born with ref_count = 0 ring_iter = m_h_ring_map.find(new_key); epoll_event ev = {0, {0}}; int num_ring_rx_fds = the_ring->get_num_resources(); int *ring_rx_fds_array = the_ring->get_rx_channel_fds(); ev.events = EPOLLIN; for (int i = 0; i < num_ring_rx_fds; i++) { int cq_ch_fd = ring_rx_fds_array[i]; ev.data.fd = cq_ch_fd; BULLSEYE_EXCLUDE_BLOCK_START if (unlikely( orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(), EPOLL_CTL_ADD, cq_ch_fd, &ev))) { nd_logerr("Failed to add RING notification fd to global_table_mgr_epfd (errno=%d %m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END } g_p_net_device_table_mgr->global_ring_wakeup(); } // now we are sure the ring is in the map ADD_RING_REF_CNT; the_ring = GET_THE_RING(key); nd_logdbg("0x%X: if_index %d parent 0x%X ref %d key %s", the_ring, the_ring->get_if_index(), the_ring->get_parent(), RING_REF_CNT, key->to_str()); return the_ring; } bool net_device_val::release_ring(resource_allocation_key *key) { nd_logfunc(""); resource_allocation_key *red_key; auto_unlocker lock(m_lock); red_key = get_ring_key_redirection(key); ring* the_ring = NULL; rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(red_key); if (m_h_ring_map.end() != ring_iter) { DEC_RING_REF_CNT; the_ring = GET_THE_RING(red_key); nd_logdbg("0x%X: if_index %d parent 0x%X ref %d key %s", the_ring, the_ring->get_if_index(), the_ring->get_parent(), RING_REF_CNT, red_key->to_str()); if ( TEST_REF_CNT_ZERO ) { int num_ring_rx_fds = the_ring->get_num_resources(); int *ring_rx_fds_array = the_ring->get_rx_channel_fds(); nd_logdbg("Deleting RING %p for key %s and removing notification fd from global_table_mgr_epfd (epfd=%d)", the_ring, red_key->to_str(), g_p_net_device_table_mgr->global_ring_epfd_get()); for (int i = 0; i < num_ring_rx_fds; i++) { int cq_ch_fd = ring_rx_fds_array[i]; BULLSEYE_EXCLUDE_BLOCK_START if (unlikely(orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(), EPOLL_CTL_DEL, cq_ch_fd, NULL))) { nd_logerr("Failed to delete RING notification fd to global_table_mgr_epfd (errno=%d %m)", errno); } BULLSEYE_EXCLUDE_BLOCK_END } ring_key_redirection_release(key); delete the_ring; delete ring_iter->first; m_h_ring_map.erase(ring_iter); } return true; } return false; } /* * this function maps key to new keys that it created * the key that it creates is the size of the map */ resource_allocation_key* net_device_val::ring_key_redirection_reserve(resource_allocation_key *key) { // if allocation logic is usr idx feature disabled if (!safe_mce_sys().ring_limit_per_interface || key->get_ring_alloc_logic() == RING_LOGIC_PER_USER_ID) return key; if (m_h_ring_key_redirection_map.find(key) != m_h_ring_key_redirection_map.end()) { m_h_ring_key_redirection_map[key].second++; nd_logdbg("redirecting key=%s (ref-count:%d) to key=%s", key->to_str(), m_h_ring_key_redirection_map[key].second, m_h_ring_key_redirection_map[key].first->to_str()); return m_h_ring_key_redirection_map[key].first; } int ring_map_size = (int)m_h_ring_map.size(); if (safe_mce_sys().ring_limit_per_interface > ring_map_size) { resource_allocation_key *key2 = new resource_allocation_key(*key); // replace key to redirection key key2->set_user_id_key(ring_map_size); m_h_ring_key_redirection_map[key] = std::make_pair(key2, 1); nd_logdbg("redirecting key=%s (ref-count:1) to key=%s", key->to_str(), key2->to_str()); return key2; } rings_hash_map_t::iterator ring_iter = m_h_ring_map.begin(); int min_ref_count = ring_iter->second.second; resource_allocation_key *min_key = ring_iter->first; while (ring_iter != m_h_ring_map.end()) { // redirect only to ring with the same profile if (ring_iter->first->get_ring_profile_key() == key->get_ring_profile_key() && ring_iter->second.second < min_ref_count) { min_ref_count = ring_iter->second.second; min_key = ring_iter->first; } ring_iter++; } m_h_ring_key_redirection_map[key] = std::make_pair(new resource_allocation_key(*min_key), 1); nd_logdbg("redirecting key=%s (ref-count:1) to key=%s", key->to_str(), min_key->to_str()); return min_key; } resource_allocation_key* net_device_val::get_ring_key_redirection(resource_allocation_key *key) { if (!safe_mce_sys().ring_limit_per_interface) return key; if (m_h_ring_key_redirection_map.find(key) == m_h_ring_key_redirection_map.end()) { nd_logdbg("key = %s is not found in the redirection map", key->to_str()); return key; } return m_h_ring_key_redirection_map[key].first; } void net_device_val::ring_key_redirection_release(resource_allocation_key *key) { if (safe_mce_sys().ring_limit_per_interface && m_h_ring_key_redirection_map.find(key) != m_h_ring_key_redirection_map.end() && --m_h_ring_key_redirection_map[key].second == 0) { // this is allocated in ring_key_redirection_reserve nd_logdbg("release redirecting key=%s (ref-count:%d) to key=%s", key->to_str(), m_h_ring_key_redirection_map[key].second, m_h_ring_key_redirection_map[key].first->to_str()); delete m_h_ring_key_redirection_map[key].first; m_h_ring_key_redirection_map.erase(key); } } int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array /*=NULL*/) { nd_logfuncall(""); int ret_total = 0; auto_unlocker lock(m_lock); rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { int ret = THE_RING->poll_and_process_element_rx(p_poll_sn, pv_fd_ready_array); BULLSEYE_EXCLUDE_BLOCK_START if (ret < 0 && errno != EAGAIN) { nd_logerr("Error in ring->poll_and_process_element() of %p (errno=%d %m)", THE_RING, errno); return ret; } BULLSEYE_EXCLUDE_BLOCK_END if (ret > 0) nd_logfunc("ring[%p] Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn); ret_total += ret; } return ret_total; } int net_device_val::global_ring_request_notification(uint64_t poll_sn) { int ret_total = 0; auto_unlocker lock(m_lock); rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { int ret = THE_RING->request_notification(CQT_RX, poll_sn); if (ret < 0) { nd_logerr("Error ring[%p]->request_notification() (errno=%d %m)", THE_RING, errno); return ret; } nd_logfunc("ring[%p] Returned with: %d (sn=%d)", THE_RING, ret, poll_sn); ret_total += ret; } return ret_total; } int net_device_val::ring_drain_and_proccess() { nd_logfuncall(); int ret_total = 0; auto_unlocker lock(m_lock); rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { int ret = THE_RING->drain_and_proccess(); if (ret < 0) return ret; if (ret > 0) nd_logfunc("cq[%p] Returned with: %d", THE_RING, ret); ret_total += ret; } return ret_total; } void net_device_val::ring_adapt_cq_moderation() { nd_logfuncall(); auto_unlocker lock(m_lock); rings_hash_map_t::iterator ring_iter; for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { THE_RING->adapt_cq_moderation(); } } void net_device_val::register_to_ibverbs_events(event_handler_ibverbs *handler) { for (size_t i = 0; i < m_slaves.size(); i++) { bool found = false; for (size_t j = 0; j < i; j++) { if (m_slaves[i]->p_ib_ctx == m_slaves[j]->p_ib_ctx) { found = true; //two slaves might be on two ports of the same device, register only once break; } } if (found) continue; nd_logfunc("registering slave to ibverbs events slave=%p", m_slaves[i]); g_p_event_handler_manager->register_ibverbs_event(m_slaves[i]->p_ib_ctx->get_ibv_context()->async_fd, handler, m_slaves[i]->p_ib_ctx->get_ibv_context(), 0); } } void net_device_val::unregister_to_ibverbs_events(event_handler_ibverbs *handler) { for (size_t i = 0; i < m_slaves.size(); i++) { bool found = false; for (size_t j = 0; j < i; j++) { if (m_slaves[i]->p_ib_ctx == m_slaves[j]->p_ib_ctx) { found = true; //two slaves might be on two ports of the same device, unregister only once break; } } if (found) continue; nd_logfunc("unregistering slave to ibverbs events slave=%p", m_slaves[i]); g_p_event_handler_manager->unregister_ibverbs_event(m_slaves[i]->p_ib_ctx->get_ibv_context()->async_fd, handler); } } void net_device_val_eth::configure() { m_p_L2_addr = create_L2_address(get_ifname()); BULLSEYE_EXCLUDE_BLOCK_START if (m_p_L2_addr == NULL) { nd_logpanic("m_p_L2_addr allocation error"); } BULLSEYE_EXCLUDE_BLOCK_END create_br_address(get_ifname()); m_vlan = get_vlan_id_from_ifname(get_ifname()); if (m_vlan) { parse_prio_egress_map(); } if (m_vlan && m_bond != NO_BOND && m_bond_fail_over_mac == 1) { vlog_printf(VLOG_WARNING, " ******************************************************************\n"); vlog_printf(VLOG_WARNING, "%s: vlan over bond while fail_over_mac=1 is not offloaded\n", get_ifname()); vlog_printf(VLOG_WARNING, " ******************************************************************\n"); m_state = INVALID; } if(!m_vlan && (get_flags() & IFF_MASTER)) { char if_name[IFNAMSIZ] = {0}; if (!if_indextoname(m_slaves[0]->if_index, if_name)) { nd_logerr("Can not find interface name by index=%d", m_slaves[0]->if_index); } //in case vlan is configured on slave m_vlan = get_vlan_id_from_ifname(if_name); } } int net_device_val::get_priority_by_tc_class(uint32_t tc_class) { tc_class_priority_map::iterator it = m_class_prio_map.find(tc_class); if (it == m_class_prio_map.end()) { return VMA_DEFAULT_ENGRESS_MAP_PRIO; } return it->second; } void net_device_val_eth::parse_prio_egress_map() { #ifdef HAVE_LIBNL3 int len, ret; nl_cache *cache = NULL; rtnl_link *link; vlan_map *map; nl_socket_handle *nl_socket = nl_socket_handle_alloc(); if (!nl_socket) { nd_logdbg("unable to allocate socket socket %m", errno); goto out; } nl_socket_set_local_port(nl_socket, 0); ret = nl_connect(nl_socket, NETLINK_ROUTE); if (ret < 0) { nd_logdbg("unable to connect to libnl socket %d %m", ret, errno); goto out; } ret = rtnl_link_alloc_cache(nl_socket, AF_UNSPEC, &cache); if (!cache) { nd_logdbg("unable to create libnl cache %d %m", ret, errno); goto out; } link = rtnl_link_get_by_name(cache, get_ifname()); if (!link) { nd_logdbg("unable to get libnl link %d %m", ret, errno); goto out; } map = rtnl_link_vlan_get_egress_map(link, &len); if (!map || !len) { nd_logdbg("no egress map found %d %p",len, map); goto out; } for (int i = 0; i < len; i++) { m_class_prio_map[map[i].vm_from] = map[i].vm_to; } out: if (cache) { nl_cache_free(cache); } if (nl_socket) { nl_socket_handle_free(nl_socket); } #else nd_logdbg("libnl3 not found, cannot read engress map, " "SO_PRIORITY will not work properly"); #endif } ring* net_device_val_eth::create_ring(resource_allocation_key *key) { ring* ring = NULL; // if this is a ring profile key get the profile from the global map if (key->get_ring_profile_key()) { if (!g_p_ring_profile) { nd_logdbg("could not find ring profile"); return NULL; } ring_profile *prof = g_p_ring_profile->get_profile(key->get_ring_profile_key()); if (prof == NULL) { nd_logerr("could not find ring profile %d", key->get_ring_profile_key()); return NULL; } try { switch (prof->get_ring_type()) { #ifdef HAVE_MP_RQ case VMA_RING_CYCLIC_BUFFER: ring = new ring_eth_cb(get_if_idx(), &prof->get_desc()->ring_cyclicb, key->get_memory_descriptor()); break; #endif case VMA_RING_EXTERNAL_MEM: ring = new ring_eth_direct(get_if_idx(), &prof->get_desc()->ring_ext); break; default: nd_logdbg("Unknown ring type"); break; } } catch (vma_error &error) { nd_logdbg("failed creating ring %s", error.message); } } else { try { switch (m_bond) { case NO_BOND: ring = new ring_eth(get_if_idx()); break; case ACTIVE_BACKUP: case LAG_8023ad: ring = new ring_bond_eth(get_if_idx()); break; case NETVSC: ring = new ring_bond_netvsc(get_if_idx()); break; default: nd_logdbg("Unknown ring type"); break; } } catch (vma_error &error) { nd_logdbg("failed creating ring %s", error.message); } } return ring; } L2_address* net_device_val_eth::create_L2_address(const char* ifname) { if (m_p_L2_addr) { delete m_p_L2_addr; m_p_L2_addr = NULL; } unsigned char hw_addr[ETH_ALEN]; get_local_ll_addr(ifname, hw_addr, ETH_ALEN, false); return new ETH_addr(hw_addr); } void net_device_val_eth::create_br_address(const char* ifname) { if(m_p_br_addr) { delete m_p_br_addr; m_p_br_addr = NULL; } uint8_t hw_addr[ETH_ALEN]; get_local_ll_addr(ifname, hw_addr, ETH_ALEN, true); m_p_br_addr = new ETH_addr(hw_addr); BULLSEYE_EXCLUDE_BLOCK_START if(m_p_br_addr == NULL) { nd_logpanic("m_p_br_addr allocation error"); } BULLSEYE_EXCLUDE_BLOCK_END } std::string net_device_val_eth::to_str() { return std::string("ETH: " + net_device_val::to_str()); } net_device_val_ib::~net_device_val_ib() { struct in_addr in; if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) { g_p_neigh_table_mgr->unregister_observer(neigh_key(ip_address(in.s_addr), this), this); } } void net_device_val_ib::configure() { ib_ctx_handler* p_ib_ctx = NULL; struct in_addr in; m_p_L2_addr = create_L2_address(get_ifname()); BULLSEYE_EXCLUDE_BLOCK_START if(m_p_L2_addr == NULL) { nd_logpanic("m_p_L2_addr allocation error"); } BULLSEYE_EXCLUDE_BLOCK_END create_br_address(get_ifname()); if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) { g_p_neigh_table_mgr->unregister_observer(neigh_key(ip_address(in.s_addr), this), this); } //Register to IB BR neigh cache_entry_subject* p_ces = NULL; if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) { g_p_neigh_table_mgr->register_observer(neigh_key(ip_address(in.s_addr), this), this, &p_ces); } m_br_neigh = dynamic_cast(p_ces); p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link()); if (!p_ib_ctx || ibv_query_pkey(p_ib_ctx->get_ibv_context(), get_port_from_ifname(get_ifname_link()), 0, &m_pkey)) { nd_logerr("failed querying pkey"); } nd_logdbg("pkey: %d", m_pkey); } ring* net_device_val_ib::create_ring(resource_allocation_key *key) { ring* ring = NULL; NOT_IN_USE(key); try { switch (m_bond) { case NO_BOND: ring = new ring_ib(get_if_idx()); break; case ACTIVE_BACKUP: case LAG_8023ad: ring = new ring_bond_ib(get_if_idx()); break; default: nd_logdbg("Unknown ring type"); break; } } catch (vma_error &error) { nd_logdbg("failed creating ring %s", error.message); } return ring; } L2_address* net_device_val_ib::create_L2_address(const char* ifname) { if (m_p_L2_addr) { delete m_p_L2_addr; m_p_L2_addr = NULL; } unsigned char hw_addr[IPOIB_HW_ADDR_LEN]; get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, false); return new IPoIB_addr(hw_addr); } void net_device_val_ib::create_br_address(const char* ifname) { if (m_p_br_addr) { delete m_p_br_addr; m_p_br_addr = NULL; } unsigned char hw_addr[IPOIB_HW_ADDR_LEN]; get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, true); m_p_br_addr = new IPoIB_addr(hw_addr); BULLSEYE_EXCLUDE_BLOCK_START if (m_p_br_addr == NULL) { nd_logpanic("m_p_br_addr allocation error"); } BULLSEYE_EXCLUDE_BLOCK_END } std::string net_device_val_ib::to_str() { return std::string("IB: " + net_device_val::to_str()); } bool net_device_val::verify_bond_ipoib_or_eth_qp_creation() { char slaves[IFNAMSIZ * MAX_SLAVES] = {0}; if (!get_bond_slaves_name_list(get_ifname_link(), slaves, sizeof slaves)) { vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded, slave list or bond name could not be found\n", get_ifname()); vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); return false; } //go over all slaves and check preconditions bool bond_ok = true; char* slave_name; char* save_ptr; slave_name = strtok_r(slaves, " ", &save_ptr); while (slave_name != NULL) { char* p = strchr(slave_name, '\n'); if (p) *p = '\0'; // Remove the tailing 'new line" char if (!verify_ipoib_or_eth_qp_creation(slave_name)) { //check all slaves but print only once for bond bond_ok = false; } slave_name = strtok_r(NULL, " ", &save_ptr); } if (!bond_ok) { vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); vlog_printf(VLOG_WARNING,"* Bond %s will not be offloaded due to problem with its slaves.\n", get_ifname()); vlog_printf(VLOG_WARNING,"* Check warning messages for more information.\n"); vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); } else { /* * Print warning message while bond device contains two slaves of the same HCA * while RoCE LAG is enabled for both slaves. */ sys_image_guid_map_t::iterator guid_iter; for (guid_iter = m_sys_image_guid_map.begin(); guid_iter != m_sys_image_guid_map.end(); guid_iter++) { char bond_roce_lag_path[256] = {0}; if (guid_iter->second.size() > 1 && check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), guid_iter->second.front().c_str()) && check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), guid_iter->second.back().c_str())) { print_roce_lag_warnings(get_ifname_link(), bond_roce_lag_path, guid_iter->second.front().c_str(), guid_iter->second.back().c_str()); } } } return bond_ok; } //interface name can be slave while ifa struct can describe bond bool net_device_val::verify_ipoib_or_eth_qp_creation(const char* interface_name) { if (m_type == ARPHRD_INFINIBAND) { if (verify_enable_ipoib(interface_name) && verify_qp_creation(interface_name, IBV_QPT_UD)) { return true; } } else { if (verify_qp_creation(interface_name, IBV_QPT_RAW_PACKET)) { return true; } } return false; } bool net_device_val::verify_enable_ipoib(const char* interface_name) { char filename[256] = "\0"; char ifname[IFNAMSIZ] = "\0"; NOT_IN_USE(interface_name); // Suppress --enable-opt-log=high warning if(!safe_mce_sys().enable_ipoib) { nd_logdbg("Blocking offload: IPoIB interfaces ('%s')", interface_name); return false; } #ifndef DEFINED_IBV_QP_INIT_SOURCE_QPN // Note: mlx4 does not support this capability ib_ctx_handler* ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link()); if (!ib_ctx->is_mlx4()) { nd_logwarn("Blocking offload: SOURCE_QPN is not supported for this driver ('%s')", interface_name); return false; } #endif // Verify IPoIB is in 'datagram mode' for proper VMA with flow steering operation if (validate_ipoib_prop(get_ifname(), m_flags, IPOIB_MODE_PARAM_FILE, "datagram", 8, filename, ifname)) { vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); vlog_printf(VLOG_WARNING,"* IPoIB mode of interface '%s' is \"connected\" !\n", get_ifname()); vlog_printf(VLOG_WARNING,"* Please change it to datagram: \"echo datagram > %s\" before loading your application with VMA library\n", filename); vlog_printf(VLOG_WARNING,"* VMA doesn't support IPoIB in connected mode.\n"); vlog_printf(VLOG_WARNING,"* Please refer to VMA Release Notes for more information\n"); vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); return false; } else { nd_logdbg("verified interface '%s' is running in datagram mode", get_ifname()); } // Verify umcast is disabled for IB flow if (validate_ipoib_prop(get_ifname(), m_flags, UMCAST_PARAM_FILE, "0", 1, filename, ifname)) { // Extract UMCAST flag (only for IB transport types) vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); vlog_printf(VLOG_WARNING,"* UMCAST flag is Enabled for interface %s !\n", get_ifname()); vlog_printf(VLOG_WARNING,"* Please disable it: \"echo 0 > %s\" before loading your application with VMA library\n", filename); vlog_printf(VLOG_WARNING,"* This option in no longer needed in this version\n"); vlog_printf(VLOG_WARNING,"* Please refer to Release Notes for more information\n"); vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); return false; } else { nd_logdbg("verified interface '%s' is running with umcast disabled", get_ifname()); } return true; } //ifname should point to a physical device bool net_device_val::verify_qp_creation(const char* ifname, enum ibv_qp_type qp_type) { bool success = false; char bond_roce_lag_path[256] = {0}; struct ibv_cq* cq = NULL; struct ibv_comp_channel *channel = NULL; struct ibv_qp* qp = NULL; vma_ibv_qp_init_attr qp_init_attr; memset(&qp_init_attr, 0, sizeof(qp_init_attr)); vma_ibv_cq_init_attr attr; memset(&attr, 0, sizeof(attr)); qp_init_attr.cap.max_send_wr = MCE_DEFAULT_TX_NUM_WRE; qp_init_attr.cap.max_recv_wr = MCE_DEFAULT_RX_NUM_WRE; qp_init_attr.cap.max_inline_data = MCE_DEFAULT_TX_MAX_INLINE; qp_init_attr.cap.max_send_sge = MCE_DEFAULT_TX_NUM_SGE; qp_init_attr.cap.max_recv_sge = MCE_DEFAULT_RX_NUM_SGE; qp_init_attr.sq_sig_all = 0; qp_init_attr.qp_type = qp_type; //find ib_cxt char base_ifname[IFNAMSIZ]; get_base_interface_name((const char*)(ifname), base_ifname, sizeof(base_ifname)); int port_num = get_port_from_ifname(base_ifname); ib_ctx_handler* p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(base_ifname); if (!p_ib_ctx) { nd_logdbg("Cant find ib_ctx for interface %s", base_ifname); if (qp_type == IBV_QPT_RAW_PACKET && m_bond != NO_BOND) { if (check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), ifname)) { print_roce_lag_warnings(get_ifname_link(), bond_roce_lag_path); } else if ((p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link())) && strstr(p_ib_ctx->get_ibname(), "bond")) { print_roce_lag_warnings(get_ifname_link()); } } goto release_resources; } else if (port_num > p_ib_ctx->get_ibv_device_attr()->phys_port_cnt) { nd_logdbg("Invalid port for interface %s", base_ifname); if (qp_type == IBV_QPT_RAW_PACKET && m_bond != NO_BOND && p_ib_ctx->is_mlx4()) { print_roce_lag_warnings(get_ifname_link()); } goto release_resources; } // Add to guid map in order to detect roce lag issue if (qp_type == IBV_QPT_RAW_PACKET && m_bond != NO_BOND) { m_sys_image_guid_map[p_ib_ctx->get_ibv_device_attr()->sys_image_guid].push_back(base_ifname); } //create qp resources channel = ibv_create_comp_channel(p_ib_ctx->get_ibv_context()); if (!channel) { nd_logdbg("channel creation failed for interface %s (errno=%d %m)", ifname, errno); goto release_resources; } VALGRIND_MAKE_MEM_DEFINED(channel, sizeof(ibv_comp_channel)); cq = vma_ibv_create_cq(p_ib_ctx->get_ibv_context(), safe_mce_sys().tx_num_wr, (void*)this, channel, 0, &attr); if (!cq) { nd_logdbg("cq creation failed for interface %s (errno=%d %m)", ifname, errno); goto release_resources; } vma_ibv_qp_init_attr_comp_mask(p_ib_ctx->get_ibv_pd(), qp_init_attr); qp_init_attr.recv_cq = cq; qp_init_attr.send_cq = cq; // Set source qpn for non mlx4 IPoIB devices if (qp_type == IBV_QPT_UD && !p_ib_ctx->is_mlx4()) { unsigned char hw_addr[IPOIB_HW_ADDR_LEN]; get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, false); IPoIB_addr ipoib_addr(hw_addr); ibv_source_qpn_set(qp_init_attr, ipoib_addr.get_qpn()); } qp = vma_ibv_create_qp(p_ib_ctx->get_ibv_pd(), &qp_init_attr); if (qp) { if (qp_type == IBV_QPT_UD && priv_ibv_create_flow_supported(qp, port_num) == -1) { nd_logdbg("Create_ibv_flow failed on interface %s (errno=%d %m), Traffic will not be offloaded", ifname, errno); goto qp_failure; } else { success = true; if (qp_type == IBV_QPT_RAW_PACKET && !priv_ibv_query_flow_tag_supported(qp, port_num)) { p_ib_ctx->set_flow_tag_capability(true); } nd_logdbg("verified interface %s for flow tag capabilities : %s", ifname, p_ib_ctx->get_flow_tag_capability() ? "enabled" : "disabled"); if (qp_type == IBV_QPT_RAW_PACKET && p_ib_ctx->is_packet_pacing_supported() && !priv_ibv_query_burst_supported(qp, port_num)) { p_ib_ctx->set_burst_capability(true); } nd_logdbg("verified interface %s for burst capabilities : %s", ifname, p_ib_ctx->get_burst_capability() ? "enabled" : "disabled"); } } else { nd_logdbg("QP creation failed on interface %s (errno=%d %m), Traffic will not be offloaded", ifname, errno); qp_failure: int err = errno; //verify_raw_qp_privliges can overwrite errno so keep it before the call if (validate_raw_qp_privliges() == 0) { // MLNX_OFED raw_qp_privliges file exist with bad value vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname); vlog_printf(VLOG_WARNING,"* Working in this mode might causes VMA malfunction over Ethernet/InfiniBand interfaces\n"); vlog_printf(VLOG_WARNING,"* WARNING: the following steps will restart your network interface!\n"); vlog_printf(VLOG_WARNING,"* 1. \"echo options ib_uverbs disable_raw_qp_enforcement=1 > /etc/modprobe.d/ib_uverbs.conf\"\n"); vlog_printf(VLOG_WARNING,"* 2. Restart openibd or rdma service depending on your system configuration\n"); vlog_printf(VLOG_WARNING,"* Read the RAW_PACKET QP root access enforcement section in the VMA's User Manual for more information\n"); vlog_printf(VLOG_WARNING,"******************************************************************************************************\n"); } else if (validate_user_has_cap_net_raw_privliges() == 0 || err == EPERM) { vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname); vlog_printf(VLOG_WARNING,"* Offloaded resources are restricted to root or user with CAP_NET_RAW privileges\n"); vlog_printf(VLOG_WARNING,"* Read the CAP_NET_RAW and root access section in the VMA's User Manual for more information\n"); vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); } else { vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname); vlog_printf(VLOG_WARNING,"* VMA was not able to create QP for this device (errno = %d).\n", err); vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); } } release_resources: if(qp) { IF_VERBS_FAILURE(ibv_destroy_qp(qp)) { nd_logdbg("qp destroy failed on interface %s (errno=%d %m)", ifname, errno); success = false; } ENDIF_VERBS_FAILURE; } if (cq) { IF_VERBS_FAILURE(ibv_destroy_cq(cq)) { nd_logdbg("cq destroy failed on interface %s (errno=%d %m)", ifname, errno); success = false; } ENDIF_VERBS_FAILURE; } if (channel) { IF_VERBS_FAILURE(ibv_destroy_comp_channel(channel)) { nd_logdbg("channel destroy failed on interface %s (errno=%d %m)", ifname, errno); success = false; } ENDIF_VERBS_FAILURE; VALGRIND_MAKE_MEM_UNDEFINED(channel, sizeof(ibv_comp_channel)); } return success; }