/*
* Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "main.h"
#include <string.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/utsname.h>
#include <time.h>
#include <mcheck.h>
#include <execinfo.h>
#include <libgen.h>
#include <linux/igmp.h>
#include "vlogger/vlogger.h"
#include "utils/rdtsc.h"
#include "vma/util/vma_stats.h"
#include "vma/util/utils.h"
#include "vma/event/event_handler_manager.h"
#include "vma/event/vlogger_timer_handler.h"
#include "vma/dev/buffer_pool.h"
#include "vma/dev/ib_ctx_handler_collection.h"
#include "vma/dev/net_device_table_mgr.h"
#include "vma/proto/ip_frag.h"
#include "vma/proto/vma_lwip.h"
#include "vma/proto/route_table_mgr.h"
#include "vma/proto/rule_table_mgr.h"
#include "vma/proto/igmp_mgr.h"
#include "vma/proto/neighbour_table_mgr.h"
#include "vma/netlink/netlink_wrapper.h"
#include "vma/event/command.h"
#include "vma/sock/sock-redirect.h"
#include "vma/sock/fd_collection.h"
#include "vma/sock/sockinfo_tcp.h"
#include "vma/sock/sockinfo_udp.h"
#include "vma/iomux/io_mux_call.h"
#include "vma/util/instrumentation.h"
void check_netperf_flags();
// Do not rely on global variable initialization in code that might be called from library constructor (main_init)
/// Accessor for the global configuration singleton. Construct-on-first-use, so it is
/// safe even when called from the library constructor before static init order is settled.
mce_sys_var & safe_mce_sys() {return mce_sys_var::instance();}
#define MAX_BACKTRACE 25
#define MAX_VERSION_STR_LEN 128
#define MAX_CMD_LINE 2048
// Print a banner advising the user to run the application without VMA
// (unset LD_PRELOAD) after a fatal library-load error.
void mce_sys_var::print_vma_load_failure_msg()
{
	static const char* const banner[] = {
		"***************************************************************************\n",
		"* Failed loading VMA library! Try executing the application without VMA. *\n",
		"* 'unset LD_PRELOAD' environment variable and rerun the application. *\n",
		"***************************************************************************\n",
	};
	for (size_t idx = 0; idx < sizeof(banner) / sizeof(banner[0]); ++idx) {
		vlog_printf(VLOG_ERROR, "%s", banner[idx]);
	}
}
namespace vma_spec {
	// One table entry: spec level, its human-readable name, and the
	// NULL-terminated list of accepted input aliases.
	typedef struct {
		vma_spec_t level;
		const char * output_name;
		const char ** input_names;
	} vma_spec_names;

	static const char *names_none[] = {"none", "0", NULL};
	static const char *spec_names_ulatency[] = {"ultra-latency", "10", NULL};
	static const char *spec_names_latency[] = {"latency", "15", NULL};
	static const char *spec_names_29west[] = {"29west", "29", NULL};
	static const char *spec_names_wombat_fh[] = {"wombat_fh", "554", NULL};
	static const char *spec_names_mcd[] = {"mcd", "623", NULL};
	static const char *spec_names_mcd_irq[] = {"mcd-irq", "624", NULL};
	static const char *spec_names_rti[] = {"rti", "784", NULL};
	static const char *spec_names_7750[] = {"7750", NULL};
	static const char *spec_names_multi_ring[] = {"multi_ring_latency", NULL};

	// must be by order because "to_str" relies on that!
	static const vma_spec_names specs[] = {
		{MCE_SPEC_NONE, "NONE", (const char ** )names_none},
		{MCE_SPEC_SOCKPERF_ULTRA_LATENCY_10, "Ultra Latency", (const char ** )spec_names_ulatency},
		{MCE_SPEC_SOCKPERF_LATENCY_15, "Latency", (const char ** )spec_names_latency},
		{MCE_SPEC_29WEST_LBM_29, "29West LBM Logic", (const char ** )spec_names_29west},
		{MCE_SPEC_WOMBAT_FH_LBM_554, "Wombat FH LBM Logic", (const char ** )spec_names_wombat_fh},
		{MCE_SPEC_MCD_623, "Memcached Logic", (const char ** )spec_names_mcd},
		{MCE_SPEC_MCD_IRQ_624, "Memcached Interrupt Mode", (const char ** )spec_names_mcd_irq},
		{MCE_SPEC_RTI_784, "RTI Logic", (const char ** )spec_names_rti},
		{MCE_SPEC_LL_7750, "7750 Low Latency Profile", (const char ** )spec_names_7750},
		{MCE_SPEC_LL_MULTI_RING, "Multi Ring Latency Profile", (const char ** )spec_names_multi_ring},
	};

	// Convert a string (case-insensitive alias match) to vma_spec_t;
	// upon error - returns the given 'def_value'.
	vma_spec_t from_str(const char* str, vma_spec_t def_value)
	{
		const size_t num_levels = sizeof(specs) / sizeof(specs[0]);
		for (size_t idx = 0; idx < num_levels; ++idx) {
			for (const char ** alias = specs[idx].input_names; *alias; ++alias) {
				if (0 == strcasecmp(str, *alias)) {
					return specs[idx].level;
				}
			}
		}
		return def_value; // not found. use given def_value
	}

	// Convert an int to vma_spec_t; upon error - returns the given 'def_value'.
	vma_spec_t from_int(const int int_spec, vma_spec_t def_value)
	{
		return (int_spec >= MCE_SPEC_NONE && int_spec <= MCE_VMA__ALL)
			? static_cast<vma_spec_t>(int_spec)
			: def_value; // out of range. use given def_value
	}

	// Return the display name of a level; relies on specs[] being ordered
	// so that the enum value offsets directly into the table.
	const char * to_str(vma_spec_t level)
	{
		static int base = MCE_SPEC_NONE;
		return specs[level - base].output_name;
	}
}
/**
 * Parse a comma-delimited CPU list (e.g. "0,2,4-7") into a cpu_set_t.
 *
 * @param cpulist  writable string; destroyed by strtok_r during parsing
 * @param cpu_set  output set; zeroed before parsing
 * @return 0 on success, -1 on any parse error or CPU number >= CPU_SETSIZE
 */
int mce_sys_var::list_to_cpuset(char *cpulist, cpu_set_t *cpu_set)
{
	char comma[] = ",";
	char dash[] = "-";
	char *comma_saveptr, *dash_saveptr;
	char *token, *subtoken, *endptr;
	long range_start, range_end; /* long: matches strtol's return type */
	long i;

	CPU_ZERO(cpu_set);

	/*
	 * When passed a CPU list, we expect comma(',') delimited values.
	 */
	token = strtok_r(cpulist, comma, &comma_saveptr);
	if (!token) {
		return -1;
	}

	/*
	 * For each comma delimited value, parse the token based on a
	 * dash('-') to see if it is a single cpu number or a range.
	 */
	while (token) {
		subtoken = strtok_r(token, dash, &dash_saveptr);
		if (!subtoken) {
			return -1;
		}
		errno = 0;
		range_start = strtol(subtoken, &endptr, 10);
		/*
		 * Reject tokens with no digits, trailing garbage, or overflow.
		 * Note: the previous check (!range_start && *endptr) silently
		 * accepted garbage after a non-zero number, e.g. "5x" as 5.
		 */
		if (endptr == subtoken || *endptr != '\0' || errno) {
			return -1;
		}
		/*
		 * If a second subtoken exists we are processing a range.
		 */
		subtoken = strtok_r(NULL, dash, &dash_saveptr);
		if (subtoken) {
			errno = 0;
			range_end = strtol(subtoken, &endptr, 10);
			if (endptr == subtoken || *endptr != '\0' || errno) {
				return -1;
			}
		} else {
			range_end = range_start;
		}
		for (i = range_start; i <= range_end; i++) {
			if (i > (CPU_SETSIZE - 1)) {
				return -1;
			}
			CPU_SET(i, cpu_set);
		}
		token = strtok_r(NULL, comma, &comma_saveptr);
	}
	return 0;
}
/**
 * Parse a hexadecimal CPU mask string (without the "0x" prefix) into a
 * cpu_set_t. The rightmost hex digit maps to CPUs 0-3, and so on.
 *
 * @param start    NUL-terminated string of hex digits
 * @param cpu_set  output set; zeroed before parsing
 * @return 0 on success, -1 on a non-hex character, a bit >= CPU_SETSIZE,
 *         or an all-zero mask (selecting no CPUs is not legal)
 */
int mce_sys_var::hex_to_cpuset(char *start, cpu_set_t *cpu_set)
{
	const char *end;
	char hexc[2];
	int i, length, digit;
	int bit = 0, set_one = 0;

	/*
	 * The least significant bits are at the end of the
	 * string, so we need to start our processing at the
	 * last character and work our way back to the start.
	 */
	length = strlen(start);
	end = start + (length - 1);
	CPU_ZERO(cpu_set);
	while (length) {
		hexc[0] = *end;
		hexc[1] = '\0'; /* NUL terminate the string or strtol can be buggy */
		/*
		 * Cast to unsigned char: passing a plain (possibly negative)
		 * char to isxdigit() is undefined behavior.
		 */
		if (!isxdigit((unsigned char)hexc[0])) {
			return -1;
		}
		digit = strtol(hexc, NULL, 16);
		/*
		 * Each hex digit is 4 bits. For each bit set in the hex
		 * value, set the corresponding CPU number in the cpu_set.
		 * 'bit' is a running counter across the whole string.
		 */
		for (i = 0; i < 4; i++) {
			if (digit & (1 << i)) {
				if (bit > (CPU_SETSIZE - 1)) {
					return -1;
				}
				CPU_SET(bit, cpu_set);
				set_one++;
			}
			bit++;
		}
		/* move the end pointer back a character */
		end--;
		/* one less character to process */
		length--;
	}
	/* passing all 0's is not legal - at least one bit must be set */
	return set_one ? 0 : -1;
}
/**
 * Parse an environment-supplied CPU specification into a cpu_set_t.
 * Accepts either a hex mask ("0x..."/"0X...") or a comma-delimited cpulist.
 *
 * @param orig_start  caller's string; copied so strtok_r cannot destroy it
 * @param cpu_set     output set
 * @return 0 on success, -1 on parse or allocation failure
 */
int mce_sys_var::env_to_cpuset(char *orig_start, cpu_set_t *cpu_set)
{
	int ret;
	char* start = strdup(orig_start); // save the caller string from strtok destruction

	if (!start) {
		return -1; // allocation failure (previously unchecked and would crash below)
	}
	/*
	 * Check for starting characters of "0x" or "0X" and if present
	 * parse the string as a hexadecimal value, otherwise treat it
	 * as a cpulist.
	 */
	if ((strlen(start) > 2) &&
	    (start[0] == '0') &&
	    ((start[1] == 'x') || (start[1] == 'X'))) {
		ret = hex_to_cpuset(start + 2, cpu_set);
	} else {
		ret = list_to_cpuset(start, cpu_set);
	}
	free(start);
	return ret;
}
// Copy env_ptr into mce_sys_name (capacity mce_sys_max_size), replacing the FIRST
// occurrence of "%d" with the current process PID. Later occurrences are copied
// verbatim. On truncation/encoding error the result may be empty or truncated;
// no error is reported to the caller.
void mce_sys_var::read_env_variable_with_pid(char* mce_sys_name, size_t mce_sys_max_size, char* env_ptr)
{
int n = -1;
char* d_pos = NULL;
// Need room for at least one character plus the NUL terminator.
if (NULL == env_ptr || NULL == mce_sys_name || mce_sys_max_size < 2) {
return ;
}
d_pos = strstr(env_ptr, "%d");
if (!d_pos) { // no %d in the string
n = snprintf(mce_sys_name, mce_sys_max_size - 1, "%s", env_ptr);
// n >= max_size-1 means the value was truncated; n < 0 is an encoding error.
// Either way, leave an empty string rather than a partial/garbage value.
if (unlikely((((int)mce_sys_max_size - 1) < n) || (n < 0))) {
mce_sys_name[0] = '\0';
}
} else { // has at least one occurrence of %d - replace the first one with the process PID
// Copy the prefix before "%d", clamped to the available buffer space.
size_t bytes_num = MIN((size_t)(d_pos - env_ptr), mce_sys_max_size - 1);
strncpy(mce_sys_name, env_ptr, bytes_num);
mce_sys_name[bytes_num] = '\0';
// Append the PID in place of the "%d" token.
n = snprintf(mce_sys_name + bytes_num, mce_sys_max_size - bytes_num - 1, "%d", getpid());
// Only append the remainder (text after "%d") if the PID fit completely;
// otherwise stop with what was written so far.
if (likely((0 < n) && (n < ((int)mce_sys_max_size - (int)bytes_num - 1)))) {
bytes_num += n;
snprintf(mce_sys_name + bytes_num, mce_sys_max_size - bytes_num, "%s", d_pos + 2);
}
}
}
/**
 * Scan /proc/cpuinfo for @p flag inside a "flags" line.
 *
 * @param flag  substring to search for (e.g. a CPU feature name)
 * @return true if found; false if not found or on open/alloc failure
 */
bool mce_sys_var::check_cpuinfo_flag(const char* flag)
{
	FILE *fp;
	char *line;
	bool ret = false;

	fp = fopen("/proc/cpuinfo", "r");
	if (!fp) {
		vlog_printf(VLOG_ERROR, "error while fopen\n");
		print_vma_load_failure_msg();
		return false;
	}
	line = (char*)malloc(MAX_CMD_LINE);
	BULLSEYE_EXCLUDE_BLOCK_START
	if (!line) {
		vlog_printf(VLOG_ERROR, "error while malloc\n");
		print_vma_load_failure_msg();
		goto exit;
	}
	BULLSEYE_EXCLUDE_BLOCK_END
	/*
	 * Compare the full "flags\t" prefix. The previous code passed a
	 * length of 5 to strncmp, which only matched "flags" and made the
	 * trailing tab in the literal dead (any key starting with "flags"
	 * would have matched).
	 */
	while (fgets(line, MAX_CMD_LINE, fp)) {
		if (strncmp(line, "flags\t", sizeof("flags\t") - 1) == 0) {
			if (strstr(line, flag)) {
				ret = true;
				goto exit;
			}
		}
	}
exit:
	fclose(fp);
	free(line); /* free(NULL) is safe on the malloc-failure path */
	return ret;
}
/*
* Intel and AMD CPUs have reserved bit 31 of ECX of CPUID leaf 0x1 as the hypervisor present bit.
* This bit allows hypervisors to indicate their presence to the guest operating system.
* Hypervisors set this bit and physical CPUs (all existing and future CPUs) set this bit to zero.
* Guest operating systems can test bit 31 to detect if they are running inside a virtual machine.
*/
/**
 * Detect whether we run under a hypervisor.
 * On x86-64: CPUID leaf 0x1, ECX bit 31 (the hypervisor-present bit).
 * Elsewhere: fall back to scanning /proc/cpuinfo for VIRTUALIZATION_FLAG.
 */
bool mce_sys_var::cpuid_hv()
{
#if defined(__x86_64__)
	uint32_t _eax, _ebx, _ecx, _edx;
	/*
	 * CPUID writes EAX, EBX, ECX and EDX. All four must be declared as
	 * outputs; the previous code declared only ECX, silently clobbering
	 * the other registers behind the compiler's back.
	 */
	__asm__ __volatile__("cpuid"
			     : "=a"(_eax), "=b"(_ebx), "=c"(_ecx), "=d"(_edx)
			     : "a"(0x01));
	(void)_eax; (void)_ebx; (void)_edx;
	return (bool)((_ecx >> 31) & 0x1);
#else
	return check_cpuinfo_flag(VIRTUALIZATION_FLAG);
#endif
}
/*
* Intel and AMD have also reserved CPUID leaves 0x40000000 - 0x400000FF for software use.
* Hypervisors can use these leaves to provide an interface to pass information from the
* hypervisor to the guest operating system running inside a virtual machine.
* The hypervisor bit indicates the presence of a hypervisor and that it is safe to test
* these additional software leaves. VMware defines the 0x40000000 leaf as the hypervisor CPUID
* information leaf. Code running on a VMware hypervisor can test the CPUID information leaf
* for the hypervisor signature. VMware stores the string "VMwareVMware" in
* EBX, ECX, EDX of CPUID leaf 0x40000000.
*/
/**
 * Return the 12-character hypervisor vendor signature from CPUID leaf
 * 0x40000000 (e.g. "VMwareVMware", "KVMKVMKVM"), or NULL when no
 * hypervisor is present. The returned buffer is thread-local.
 * On non-x86-64 builds the signature cannot be read and an empty string
 * is returned when a hypervisor is detected.
 */
const char* mce_sys_var::cpuid_hv_vendor()
{
	static __thread char vendor[13] = {0};

	if (!cpuid_hv()) {
		return NULL;
	}
#if defined(__x86_64__)
	uint32_t _eax = 0, _ebx = 0, _ecx = 0, _edx = 0;
	/*
	 * Declare "=a" as well: CPUID overwrites EAX, and the previous asm
	 * listed it only as an input, which is undefined for the compiler.
	 */
	__asm__ __volatile__("cpuid"
			     : "=a"(_eax), "=b"(_ebx), "=c"(_ecx), "=d"(_edx)
			     : "a"(0x40000000));
	(void)_eax;
	/* The signature bytes are laid out little-endian in EBX, ECX, EDX,
	 * so a plain byte copy (x86 is little-endian) replaces the previous
	 * per-character sprintf calls. */
	memcpy(vendor, &_ebx, 4);
	memcpy(vendor + 4, &_ecx, 4);
	memcpy(vendor + 8, &_edx, 4);
	vendor[12] = '\0';
#endif
	return vendor;
}
// Classify the hypervisor we run under (if any) by matching the CPUID
// vendor signature against the known hypervisor signatures, and store
// the result in the 'hypervisor' member.
void mce_sys_var::read_hv()
{
	hypervisor = mce_sys_var::HYPER_NONE;

	const char *vendor = cpuid_hv_vendor();
	if (vendor == NULL) {
		return; // bare metal - no hypervisor present bit
	}
	if (strncmp(vendor, "XenVMMXenVMM", 12) == 0) {
		hypervisor = HYPER_XEN;
	} else if (strncmp(vendor, "KVMKVMKVM", 9) == 0) {
		hypervisor = HYPER_KVM;
	} else if (strncmp(vendor, "Microsoft Hv", 12) == 0) {
		hypervisor = HYPER_MSHV;
	} else if (strncmp(vendor, "VMwareVMware", 12) == 0) {
		hypervisor = HYPER_VMWARE;
	}
}
void mce_sys_var::get_env_params()
{
int c = 0, len =0;
char *env_ptr;
FILE *fp = NULL;
int app_name_size = MAX_CMD_LINE;
// Large buffer size to avoid need for realloc
fp = fopen("/proc/self/cmdline", "r");
if (!fp) {
vlog_printf(VLOG_ERROR, "error while fopen\n");
print_vma_load_failure_msg();
exit(1);
}
app_name = (char *)malloc(app_name_size);
BULLSEYE_EXCLUDE_BLOCK_START
if (!app_name) {
vlog_printf(VLOG_ERROR, "error while malloc\n");
print_vma_load_failure_msg();
exit(1);
}
BULLSEYE_EXCLUDE_BLOCK_END
while ((c = fgetc(fp)) != EOF){
app_name[len++] = (c==0?' ':c);
if (len>=app_name_size) {
app_name_size=app_name_size*2;
app_name = (char*)realloc(app_name, app_name_size);
BULLSEYE_EXCLUDE_BLOCK_START
if (!app_name) {
vlog_printf(VLOG_ERROR, "error while malloc\n");
print_vma_load_failure_msg();
exit(1);
}
BULLSEYE_EXCLUDE_BLOCK_END
}
}
app_name[len-1] = '\0';
fclose(fp);
memset(vma_time_measure_filename, 0, sizeof(vma_time_measure_filename));
strcpy(vma_time_measure_filename, MCE_DEFAULT_TIME_MEASURE_DUMP_FILE);
memset(log_filename, 0, sizeof(log_filename));
memset(stats_filename, 0, sizeof(stats_filename));
memset(stats_shmem_dirname, 0, sizeof(stats_shmem_dirname));
memset(vmad_notify_dir, 0, sizeof(vmad_notify_dir));
strcpy(stats_filename, MCE_DEFAULT_STATS_FILE);
strcpy(vmad_notify_dir, MCE_DEFAULT_VMAD_FOLDER);
strcpy(stats_shmem_dirname, MCE_DEFAULT_STATS_SHMEM_DIR);
strcpy(conf_filename, MCE_DEFAULT_CONF_FILE);
strcpy(app_id, MCE_DEFAULT_APP_ID);
strcpy(internal_thread_cpuset, MCE_DEFAULT_INTERNAL_THREAD_CPUSET);
strcpy(internal_thread_affinity_str, MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR);
log_level = VLOG_DEFAULT;
log_details = MCE_DEFAULT_LOG_DETAILS;
log_colors = MCE_DEFAULT_LOG_COLORS;
handle_sigintr = MCE_DEFAULT_HANDLE_SIGINTR;
handle_segfault = MCE_DEFAULT_HANDLE_SIGFAULT;
stats_fd_num_max = MCE_DEFAULT_STATS_FD_NUM;
ring_allocation_logic_tx= MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX;
ring_allocation_logic_rx= MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX;
ring_migration_ratio_tx = MCE_DEFAULT_RING_MIGRATION_RATIO_TX;
ring_migration_ratio_rx = MCE_DEFAULT_RING_MIGRATION_RATIO_RX;
ring_limit_per_interface= MCE_DEFAULT_RING_LIMIT_PER_INTERFACE;
ring_dev_mem_tx = MCE_DEFAULT_RING_DEV_MEM_TX;
tcp_max_syn_rate = MCE_DEFAULT_TCP_MAX_SYN_RATE;
tx_num_segs_tcp = MCE_DEFAULT_TX_NUM_SEGS_TCP;
tx_num_bufs = MCE_DEFAULT_TX_NUM_BUFS;
#ifdef DEFINED_TSO
tx_buf_size = MCE_DEFAULT_TX_BUF_SIZE;
#endif /* DEFINED_TSO */
tx_num_wr = MCE_DEFAULT_TX_NUM_WRE;
tx_num_wr_to_signal = MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL;
tx_max_inline = MCE_DEFAULT_TX_MAX_INLINE;
tx_mc_loopback_default = MCE_DEFAULT_TX_MC_LOOPBACK;
tx_nonblocked_eagains = MCE_DEFAULT_TX_NONBLOCKED_EAGAINS;
tx_prefetch_bytes = MCE_DEFAULT_TX_PREFETCH_BYTES;
tx_bufs_batch_udp = MCE_DEFAULT_TX_BUFS_BATCH_UDP;
tx_bufs_batch_tcp = MCE_DEFAULT_TX_BUFS_BATCH_TCP;
rx_num_bufs = MCE_DEFAULT_RX_NUM_BUFS;
rx_bufs_batch = MCE_DEFAULT_RX_BUFS_BATCH;
rx_num_wr = MCE_DEFAULT_RX_NUM_WRE;
rx_num_wr_to_post_recv = MCE_DEFAULT_RX_NUM_WRE_TO_POST_RECV;
rx_poll_num = MCE_DEFAULT_RX_NUM_POLLS;
rx_poll_num_init = MCE_DEFAULT_RX_NUM_POLLS_INIT;
rx_udp_poll_os_ratio = MCE_DEFAULT_RX_UDP_POLL_OS_RATIO;
hw_ts_conversion_mode = MCE_DEFAULT_HW_TS_CONVERSION_MODE;
rx_poll_yield_loops = MCE_DEFAULT_RX_POLL_YIELD;
select_handle_cpu_usage_stats = MCE_DEFAULT_SELECT_CPU_USAGE_STATS;
rx_ready_byte_min_limit = MCE_DEFAULT_RX_BYTE_MIN_LIMIT;
rx_prefetch_bytes = MCE_DEFAULT_RX_PREFETCH_BYTES;
rx_prefetch_bytes_before_poll = MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL;
rx_cq_drain_rate_nsec = MCE_DEFAULT_RX_CQ_DRAIN_RATE;
rx_delta_tsc_between_cq_polls = 0;
gro_streams_max = MCE_DEFAULT_GRO_STREAMS_MAX;
tcp_3t_rules = MCE_DEFAULT_TCP_3T_RULES;
udp_3t_rules = MCE_DEFAULT_UDP_3T_RULES;
eth_mc_l2_only_rules = MCE_DEFAULT_ETH_MC_L2_ONLY_RULES;
mc_force_flowtag = MCE_DEFAULT_MC_FORCE_FLOWTAG;
select_poll_num = MCE_DEFAULT_SELECT_NUM_POLLS;
select_poll_os_force = MCE_DEFAULT_SELECT_POLL_OS_FORCE;
select_poll_os_ratio = MCE_DEFAULT_SELECT_POLL_OS_RATIO;
select_skip_os_fd_check = MCE_DEFAULT_SELECT_SKIP_OS;
cq_moderation_enable = MCE_DEFAULT_CQ_MODERATION_ENABLE;
cq_moderation_count = MCE_DEFAULT_CQ_MODERATION_COUNT;
cq_moderation_period_usec = MCE_DEFAULT_CQ_MODERATION_PERIOD_USEC;
cq_aim_max_count = MCE_DEFAULT_CQ_AIM_MAX_COUNT;
cq_aim_max_period_usec = MCE_DEFAULT_CQ_AIM_MAX_PERIOD_USEC;
cq_aim_interval_msec = MCE_DEFAULT_CQ_AIM_INTERVAL_MSEC;
cq_aim_interrupts_rate_per_sec = MCE_DEFAULT_CQ_AIM_INTERRUPTS_RATE_PER_SEC;
cq_poll_batch_max = MCE_DEFAULT_CQ_POLL_BATCH;
progress_engine_interval_msec = MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC;
progress_engine_wce_max = MCE_DEFAULT_PROGRESS_ENGINE_WCE_MAX;
cq_keep_qp_full = MCE_DEFAULT_CQ_KEEP_QP_FULL;
qp_compensation_level = MCE_DEFAULT_QP_COMPENSATION_LEVEL;
internal_thread_arm_cq_enabled = MCE_DEFAULT_INTERNAL_THREAD_ARM_CQ_ENABLED;
offloaded_sockets = MCE_DEFAULT_OFFLOADED_SOCKETS;
timer_resolution_msec = MCE_DEFAULT_TIMER_RESOLUTION_MSEC;
tcp_timer_resolution_msec= MCE_DEFAULT_TCP_TIMER_RESOLUTION_MSEC;
internal_thread_tcp_timer_handling = MCE_DEFAULT_INTERNAL_THREAD_TCP_TIMER_HANDLING;
tcp_ctl_thread = MCE_DEFAULT_TCP_CTL_THREAD;
tcp_ts_opt = MCE_DEFAULT_TCP_TIMESTAMP_OPTION;
tcp_nodelay = MCE_DEFAULT_TCP_NODELAY;
tcp_quickack = MCE_DEFAULT_TCP_QUICKACK;
// exception_handling is handled by its CTOR
avoid_sys_calls_on_tcp_fd = MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD;
allow_privileged_sock_opt = MCE_DEFAULT_ALLOW_PRIVILEGED_SOCK_OPT;
wait_after_join_msec = MCE_DEFAULT_WAIT_AFTER_JOIN_MSEC;
thread_mode = MCE_DEFAULT_THREAD_MODE;
buffer_batching_mode = MCE_DEFAULT_BUFFER_BATCHING_MODE;
mem_alloc_type = MCE_DEFAULT_MEM_ALLOC_TYPE;
enable_ipoib = MCE_DEFAULT_IPOIB_FLAG;
enable_socketxtreme = MCE_DEFAULT_SOCKETXTREME;
#ifdef DEFINED_TSO
enable_tso = MCE_DEFAULT_TSO;
#endif /* DEFINED_TSO */
handle_fork = MCE_DEFAULT_FORK_SUPPORT;
handle_bf = MCE_DEFAULT_BF_FLAG;
close_on_dup2 = MCE_DEFAULT_CLOSE_ON_DUP2;
mtu = MCE_DEFAULT_MTU;
lwip_mss = MCE_DEFAULT_MSS;
lwip_cc_algo_mod = MCE_DEFAULT_LWIP_CC_ALGO_MOD;
mce_spec = MCE_SPEC_NONE;
mce_spec_param1 = 1;
mce_spec_param2 = 1;
neigh_num_err_retries = MCE_DEFAULT_NEIGH_NUM_ERR_RETRIES;
neigh_uc_arp_quata = MCE_DEFAULT_NEIGH_UC_ARP_QUATA;
neigh_wait_till_send_arp_msec = MCE_DEFAULT_NEIGH_UC_ARP_DELAY_MSEC;
timer_netlink_update_msec = MCE_DEFAULT_NETLINK_TIMER_MSEC;
rx_poll_on_tx_tcp = MCE_DEFAULT_RX_POLL_ON_TX_TCP;
trigger_dummy_send_getsockname = MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME;
#ifdef VMA_TIME_MEASURE
vma_time_measure_num_samples = MCE_DEFAULT_TIME_MEASURE_NUM_SAMPLES;
#endif
read_hv();
/* Configure enable_socketxtreme as first because
* this mode has some special predefined parameter limitations
*/
if((env_ptr = getenv(SYS_VAR_SOCKETXTREME )) != NULL) {
enable_socketxtreme = atoi(env_ptr) ? true : false;
}
if (enable_socketxtreme) {
/* Set following parameters as default for SocketXtreme mode */
rx_num_wr = 1024;
gro_streams_max = 0;
progress_engine_interval_msec = MCE_CQ_DRAIN_INTERVAL_DISABLED;
}
if ((env_ptr = getenv(SYS_VAR_SPEC)) != NULL){
mce_spec = (uint32_t)vma_spec::from_str(env_ptr, MCE_SPEC_NONE);
}
switch (mce_spec) {
case MCE_SPEC_SOCKPERF_ULTRA_LATENCY_10:
tx_num_segs_tcp = 512; //MCE_DEFAULT_TX_NUM_SEGS_TCP (1000000)
tx_num_bufs = 512; //MCE_DEFAULT_TX_NUM_BUFS (200000)
tx_num_wr = 256; //MCE_DEFAULT_TX_NUM_WRE (3000)
tx_num_wr_to_signal = 4; //MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL (64)
tx_prefetch_bytes = MCE_DEFAULT_TX_PREFETCH_BYTES; //(256)
tx_bufs_batch_udp = 1; //MCE_DEFAULT_TX_BUFS_BATCH_UDP (8)
tx_bufs_batch_tcp = 1; //MCE_DEFAULT_TX_BUFS_BATCH_TCP;
rx_num_bufs = 1024; //MCE_DEFAULT_RX_NUM_BUFS (200000)
rx_bufs_batch = 4; //MCE_DEFAULT_RX_BUFS_BATCH (64)
rx_num_wr = 256; //MCE_DEFAULT_RX_NUM_WRE (16000)
rx_num_wr_to_post_recv = 4; //MCE_DEFAULT_RX_NUM_WRE_TO_POST_RECV (64)
rx_poll_num = -1; //MCE_DEFAULT_RX_NUM_POLLS
#ifdef DEFINED_TSO
enable_tso = false; //MCE_DEFAULT_TSO (true)
#endif /* DEFINED_TSO */
rx_udp_poll_os_ratio = 0; //MCE_DEFAULT_RX_UDP_POLL_OS_RATIO
rx_prefetch_bytes = MCE_DEFAULT_RX_PREFETCH_BYTES; //(256)
rx_prefetch_bytes_before_poll = 256; //MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL 0
select_poll_num = -1;
select_poll_os_ratio = 0;
select_skip_os_fd_check = 0;
avoid_sys_calls_on_tcp_fd = true; //MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD (false)
gro_streams_max = 0; //MCE_DEFAULT_GRO_STREAMS_MAX (32)
progress_engine_interval_msec = 0;
cq_keep_qp_full = false; //MCE_DEFAULT_CQ_KEEP_QP_FULL(true)
thread_mode = THREAD_MODE_SINGLE;
mem_alloc_type = ALLOC_TYPE_HUGEPAGES;
tcp_nodelay = true; // MCE_DEFAULT_TCP_NODELAY (false)
ring_dev_mem_tx = 16384; // MCE_DEFAULT_RING_DEV_MEM_TX (0)
strcpy(internal_thread_affinity_str, "0"); //MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR;
break;
case MCE_SPEC_SOCKPERF_LATENCY_15:
tx_num_wr = 256; //MCE_DEFAULT_TX_NUM_WRE (3000)
tx_num_wr_to_signal = 4; //MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL(64)
tx_bufs_batch_udp = 1; //MCE_DEFAULT_TX_BUFS_BATCH_UDP (8)
tx_bufs_batch_tcp = 1; //MCE_DEFAULT_TX_BUFS_BATCH_TCP (16)
rx_bufs_batch = 4; //MCE_DEFAULT_RX_BUFS_BATCH (64)
rx_num_wr = 256; //MCE_DEFAULT_RX_NUM_WRE (16000)
rx_num_wr_to_post_recv = 4; //MCE_DEFAULT_RX_NUM_WRE_TO_POST_RECV (64)
rx_poll_num = -1; //MCE_DEFAULT_RX_NUM_POLLS (100000)
#ifdef DEFINED_TSO
enable_tso = false; //MCE_DEFAULT_TSO (true)
#endif /* DEFINED_TSO */
rx_prefetch_bytes_before_poll = 256; //MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL (0)
select_poll_num = -1; //MCE_DEFAULT_SELECT_NUM_POLLS (100000)
avoid_sys_calls_on_tcp_fd = true; //MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD (false)
gro_streams_max = 0; //MCE_DEFAULT_GRO_STREAMS_MAX (32)
cq_keep_qp_full = false; //MCE_DEFAULT_CQ_KEEP_QP_FULL (true)
thread_mode = THREAD_MODE_SINGLE; //MCE_DEFAULT_THREAD_MODE (THREAD_MODE_MULTI)
mem_alloc_type = ALLOC_TYPE_HUGEPAGES; //MCE_DEFAULT_MEM_ALLOC_TYPE (ALLOC_TYPE_CONTIG)
strcpy(internal_thread_affinity_str, "0"); //MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR ("-1")
progress_engine_interval_msec = 100; //MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC (10)
select_poll_os_ratio = 100; //MCE_DEFAULT_SELECT_POLL_OS_RATIO (10)
select_poll_os_force = 1; //MCE_DEFAULT_SELECT_POLL_OS_FORCE (0)
tcp_nodelay = true; // MCE_DEFAULT_TCP_NODELAY (falst)
ring_dev_mem_tx = 16384; // MCE_DEFAULT_RING_DEV_MEM_TX (0)
break;
case MCE_SPEC_29WEST_LBM_29:
mce_spec_param1 = 5000; // [u-sec] Time out to send next pipe_write
mce_spec_param2 = 50; // Num of max sequential pipe_write to drop
rx_poll_num = 0;
rx_udp_poll_os_ratio = 100;
select_poll_num = 100000;
select_poll_os_ratio = 100;
select_skip_os_fd_check = 50;
break;
case MCE_SPEC_WOMBAT_FH_LBM_554:
mce_spec_param1 = 5000; // [u-sec] Time out to send next pipe_write
mce_spec_param2 = 50; // Num of max sequential pipe_write to drop
rx_poll_num = 0;
rx_udp_poll_os_ratio = 100;
select_poll_num = 0;
select_skip_os_fd_check = 20;
break;
case MCE_SPEC_RTI_784:
rx_poll_num = -1;
// TODO - Need to replace old QP/CQ allocation logic here
// qp_allocation_logic = QP_ALLOC_LOGIC__QP_PER_PEER_IP_PER_LOCAL_IP;
// cq_allocation_logic = CQ_ALLOC_LOGIC__CQ_PER_QP;
break;
case MCE_SPEC_MCD_623:
ring_allocation_logic_rx = RING_LOGIC_PER_CORE_ATTACH_THREADS;
ring_allocation_logic_tx = RING_LOGIC_PER_CORE_ATTACH_THREADS;
break;
case MCE_SPEC_MCD_IRQ_624:
ring_allocation_logic_rx = RING_LOGIC_PER_CORE_ATTACH_THREADS;
ring_allocation_logic_tx = RING_LOGIC_PER_CORE_ATTACH_THREADS;
select_poll_num = 0;
rx_poll_num = 0;
cq_moderation_enable = false;
break;
case MCE_SPEC_LL_7750:
tx_num_bufs = 8192; // MCE_DEFAULT_TX_NUM_BUFS (200000), Global TX data buffers allocated
rx_num_bufs = 204800; // MCE_DEFAULT_RX_NUM_BUFS (200000), RX data buffers used on all QPs on all HCAs
log_level = VLOG_WARNING; //VLOG_DEFAULT(VLOG_INFO) VMA_TRACELEVEL
stats_fd_num_max = 1024; //MCE_DEFAULT_STATS_FD_NUM(100), max. number of sockets monitored by VMA stats
strcpy(internal_thread_affinity_str, "0x3"); // MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR(-1), first 2 cores
rx_poll_num = -1; //MCE_DEFAULT_RX_NUM_POLLS(100000), Infinite RX poll for ready packets (during read/recv)
select_poll_num = -1; //MCE_DEFAULT_SELECT_NUM_POLLS(100000), Infinite poll the hardware on RX (before sleeping in epoll/select, etc)
select_poll_os_ratio = 0; //MCE_DEFAULT_SELECT_POLL_OS_RATIO(10), Disable polling OS fd's (re-enabled if bound on OS fd)
tcp_3t_rules = true; //MCE_DEFAULT_TCP_3T_RULES(false), Use only 3 tuple rules for TCP
avoid_sys_calls_on_tcp_fd = 1; //MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD (false), Disable handling control packets on a separate thread
buffer_batching_mode = BUFFER_BATCHING_NONE; //MCE_DEFAULT_BUFFER_BATCHING_MODE(BUFFER_BATCHING_WITH_RECLAIM), Disable handling control packets on a separate thread
tcp_ctl_thread = CTL_THREAD_NO_WAKEUP; //MCE_DEFAULT_TCP_CTL_THREAD (CTL_THREAD_DISABLE), wait for thread timer to expire
break;
case MCE_SPEC_LL_MULTI_RING:
mem_alloc_type = ALLOC_TYPE_HUGEPAGES; //MCE_DEFAULT_MEM_ALLOC_TYPE (ALLOC_TYPE_CONTIG) VMA_MEM_ALLOC_TYPE
select_poll_num = -1; //MCE_DEFAULT_SELECT_NUM_POLLS (100000) VMA_SELECT_POLL
rx_poll_num = -1; //MCE_DEFAULT_RX_NUM_POLLS(100000) VMA_RX_POLL
ring_allocation_logic_tx = RING_LOGIC_PER_THREAD; //MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX(RING_LOGIC_PER_INTERFACE) VMA_RING_ALLOCATION_LOGIC_TX
ring_allocation_logic_rx = RING_LOGIC_PER_THREAD; //MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX(RING_LOGIC_PER_INTERFACE) VMA_RING_ALLOCATION_LOGIC_RX
select_poll_os_ratio = 0; //MCE_DEFAULT_SELECT_POLL_OS_RATIO(10) VMA_SELECT_POLL_OS_RATIO
select_skip_os_fd_check = 0; //MCE_DEFAULT_SELECT_SKIP_OS(4) VMA_SELECT_SKIP_OS
rx_poll_on_tx_tcp = true; //MCE_DEFAULT_RX_POLL_ON_TX_TCP (false)
trigger_dummy_send_getsockname = true; //MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME (false)
break;
case MCE_SPEC_NONE:
default:
break;
}
if ((env_ptr = getenv(SYS_VAR_SPEC_PARAM1)) != NULL)
mce_spec_param1 = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_SPEC_PARAM2)) != NULL)
mce_spec_param2 = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_LOG_FILENAME)) != NULL){
read_env_variable_with_pid(log_filename, sizeof(log_filename), env_ptr);
}
if ((env_ptr = getenv(SYS_VAR_STATS_FILENAME)) != NULL){
read_env_variable_with_pid(stats_filename, sizeof(stats_filename), env_ptr);
}
if ((env_ptr = getenv(SYS_VAR_STATS_SHMEM_DIRNAME)) != NULL){
read_env_variable_with_pid(stats_shmem_dirname, sizeof(stats_shmem_dirname), env_ptr);
}
if ((env_ptr = getenv(SYS_VAR_CONF_FILENAME)) != NULL){
read_env_variable_with_pid(conf_filename, sizeof(conf_filename), env_ptr);
}
if ((env_ptr = getenv(SYS_VAR_VMAD_DIR)) != NULL){
read_env_variable_with_pid(vmad_notify_dir, sizeof(vmad_notify_dir), env_ptr);
}
if ((env_ptr = getenv(SYS_VAR_LOG_LEVEL)) != NULL)
log_level = log_level::from_str(env_ptr, VLOG_DEFAULT);
if (log_level >= VLOG_DEBUG)
log_details = 2;
if ((env_ptr = getenv(SYS_VAR_LOG_DETAILS)) != NULL)
log_details = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_LOG_COLORS)) != NULL)
log_colors = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_APPLICATION_ID)) != NULL){
read_env_variable_with_pid(app_id, sizeof(app_id), env_ptr);
}
if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGINTR)) != NULL)
handle_sigintr = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGSEGV)) != NULL)
handle_segfault = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_STATS_FD_NUM)) != NULL) {
stats_fd_num_max = (uint32_t)atoi(env_ptr);
if (stats_fd_num_max > MAX_STATS_FD_NUM) {
vlog_printf(VLOG_WARNING," Can only monitor maximum %d sockets in statistics \n", MAX_STATS_FD_NUM);
stats_fd_num_max = MAX_STATS_FD_NUM;
}
}
if ((env_ptr = getenv(SYS_VAR_TX_NUM_SEGS_TCP)) != NULL)
tx_num_segs_tcp = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_TX_NUM_BUFS)) != NULL)
tx_num_bufs = (uint32_t)atoi(env_ptr);
#ifdef DEFINED_TSO
if ((env_ptr = getenv(SYS_VAR_TX_BUF_SIZE)) != NULL)
tx_buf_size = (uint32_t)atoi(env_ptr);
#endif /* DEFINED_TSO */
if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE)) != NULL)
tx_num_wr = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE_TO_SIGNAL)) != NULL)
tx_num_wr_to_signal = MIN(NUM_TX_WRE_TO_SIGNAL_MAX, MAX(1, (uint32_t)atoi(env_ptr)));
if (tx_num_wr <= (tx_num_wr_to_signal * 2))
tx_num_wr = tx_num_wr_to_signal * 2;
if ((env_ptr = getenv(SYS_VAR_TX_MAX_INLINE)) != NULL)
tx_max_inline = (uint32_t)atoi(env_ptr);
if (tx_max_inline > MAX_SUPPORTED_IB_INLINE_SIZE) {
vlog_printf(VLOG_WARNING,"VMA_TX_MAX_INLINE must be smaller or equal to %d [%d]\n",
MAX_SUPPORTED_IB_INLINE_SIZE, tx_max_inline);
tx_max_inline = MAX_SUPPORTED_IB_INLINE_SIZE;
}
unsigned int cx4_max_tx_wre_for_inl = (16 * 1024 * 64) / (VMA_ALIGN(VMA_ALIGN(tx_max_inline - 12, 64) + 12, 64));
if (tx_num_wr > cx4_max_tx_wre_for_inl) {
vlog_printf(VLOG_WARNING,"For the given VMA_TX_MAX_INLINE [%d], VMA_TX_WRE [%d] must be smaller than %d\n",
tx_max_inline, tx_num_wr, cx4_max_tx_wre_for_inl);
tx_num_wr = cx4_max_tx_wre_for_inl;
}
if ((env_ptr = getenv(SYS_VAR_TX_MC_LOOPBACK)) != NULL)
tx_mc_loopback_default = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_TX_NONBLOCKED_EAGAINS)) != NULL)
tx_nonblocked_eagains = atoi(env_ptr)? true : false;
if ((env_ptr = getenv(SYS_VAR_TX_PREFETCH_BYTES)) != NULL)
tx_prefetch_bytes = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_TX)) != NULL) {
ring_allocation_logic_tx = (ring_logic_t)atoi(env_ptr);
if (!is_ring_logic_valid(ring_allocation_logic_tx)) {
vlog_printf(VLOG_WARNING,"%s = %d is not valid, setting logic to default = %d\n",
SYS_VAR_RING_ALLOCATION_LOGIC_TX, ring_allocation_logic_tx, MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX);
ring_allocation_logic_tx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX;
}
}
if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_RX)) != NULL) {
ring_allocation_logic_rx = (ring_logic_t)atoi(env_ptr);
if (!is_ring_logic_valid(ring_allocation_logic_rx)) {
vlog_printf(VLOG_WARNING,"%s = %d is not valid, setting logic to default = %d\n",
SYS_VAR_RING_ALLOCATION_LOGIC_RX, ring_allocation_logic_rx, MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX);
ring_allocation_logic_rx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX;
}
}
if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_TX)) != NULL)
ring_migration_ratio_tx = (int32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_RX)) != NULL)
ring_migration_ratio_rx = (int32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_RING_LIMIT_PER_INTERFACE)) != NULL)
ring_limit_per_interface = MAX(0, (int32_t)atoi(env_ptr));
if ((env_ptr = getenv(SYS_VAR_RING_DEV_MEM_TX)) != NULL)
ring_dev_mem_tx = MAX(0, (int32_t)atoi(env_ptr));
if ((env_ptr = getenv(SYS_VAR_TCP_MAX_SYN_RATE)) != NULL)
tcp_max_syn_rate = MIN(TCP_MAX_SYN_RATE_TOP_LIMIT, MAX(0, (int32_t)atoi(env_ptr)));
if ((env_ptr = getenv(SYS_VAR_RX_NUM_BUFS)) != NULL)
rx_num_bufs = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE_TO_POST_RECV)) != NULL)
rx_num_wr_to_post_recv = MIN(NUM_RX_WRE_TO_POST_RECV_MAX, MAX(1, (uint32_t)atoi(env_ptr)));
if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE)) != NULL)
rx_num_wr = (uint32_t)atoi(env_ptr);
if (rx_num_wr <= (rx_num_wr_to_post_recv * 2))
rx_num_wr = rx_num_wr_to_post_recv * 2;
if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS)) != NULL) {
rx_poll_num = atoi(env_ptr);
}
if (rx_poll_num < MCE_MIN_RX_NUM_POLLS || rx_poll_num > MCE_MAX_RX_NUM_POLLS) {
vlog_printf(VLOG_WARNING," Rx Poll loops should be between %d and %d [%d]\n", MCE_MIN_RX_NUM_POLLS, MCE_MAX_RX_NUM_POLLS, rx_poll_num);
rx_poll_num = MCE_DEFAULT_RX_NUM_POLLS;
}
if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS_INIT)) != NULL)
rx_poll_num_init = atoi(env_ptr);
if (rx_poll_num_init < MCE_MIN_RX_NUM_POLLS || rx_poll_num_init > MCE_MAX_RX_NUM_POLLS) {
vlog_printf(VLOG_WARNING," Rx Poll loops should be between %d and %d [%d]\n", MCE_MIN_RX_NUM_POLLS, MCE_MAX_RX_NUM_POLLS, rx_poll_num_init);
rx_poll_num_init = MCE_DEFAULT_RX_NUM_POLLS_INIT;
}
if (rx_poll_num == 0)
rx_poll_num = 1; // Force at least one good polling loop
if ((env_ptr = getenv(SYS_VAR_RX_UDP_POLL_OS_RATIO)) != NULL)
rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_HW_TS_CONVERSION_MODE)) != NULL) {
hw_ts_conversion_mode = (ts_conversion_mode_t)atoi(env_ptr);
if ((uint32_t)hw_ts_conversion_mode >= TS_CONVERSION_MODE_LAST) {
vlog_printf(VLOG_WARNING,"HW TS conversion size out of range [%d] (min=%d, max=%d). using default [%d]\n", hw_ts_conversion_mode, TS_CONVERSION_MODE_DISABLE , TS_CONVERSION_MODE_LAST - 1, MCE_DEFAULT_HW_TS_CONVERSION_MODE);
hw_ts_conversion_mode = MCE_DEFAULT_HW_TS_CONVERSION_MODE;
}
}
//The following 2 params were replaced by SYS_VAR_RX_UDP_POLL_OS_RATIO
if ((env_ptr = getenv(SYS_VAR_RX_POLL_OS_RATIO)) != NULL) {
rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr);
vlog_printf(VLOG_WARNING,"The parameter VMA_RX_POLL_OS_RATIO is no longer in use. Parameter VMA_RX_UDP_POLL_OS_RATIO was set to %d instead\n", rx_udp_poll_os_ratio);
}
if ((env_ptr = getenv(SYS_VAR_RX_SKIP_OS)) != NULL) {
rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr);
vlog_printf(VLOG_WARNING,"The parameter VMA_RX_SKIP_OS is no longer in use. Parameter VMA_RX_UDP_POLL_OS_RATIO was set to %d instead\n", rx_udp_poll_os_ratio);
}
if ((env_ptr = getenv(SYS_VAR_RX_POLL_YIELD)) != NULL)
rx_poll_yield_loops = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_SELECT_CPU_USAGE_STATS)) != NULL)
select_handle_cpu_usage_stats = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_RX_BYTE_MIN_LIMIT)) != NULL)
rx_ready_byte_min_limit = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES)) != NULL)
rx_prefetch_bytes = (uint32_t)atoi(env_ptr);
if (rx_prefetch_bytes < MCE_MIN_RX_PREFETCH_BYTES || rx_prefetch_bytes > MCE_MAX_RX_PREFETCH_BYTES) {
vlog_printf(VLOG_WARNING," Rx prefetch bytes size out of range [%d] (min=%d, max=%d)\n", rx_prefetch_bytes, MCE_MIN_RX_PREFETCH_BYTES, MCE_MAX_RX_PREFETCH_BYTES);
rx_prefetch_bytes = MCE_DEFAULT_RX_PREFETCH_BYTES;
}
if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES_BEFORE_POLL)) != NULL)
rx_prefetch_bytes_before_poll = (uint32_t)atoi(env_ptr);
if (rx_prefetch_bytes_before_poll != 0 && (rx_prefetch_bytes_before_poll < MCE_MIN_RX_PREFETCH_BYTES || rx_prefetch_bytes_before_poll > MCE_MAX_RX_PREFETCH_BYTES)) {
vlog_printf(VLOG_WARNING," Rx prefetch bytes size out of range [%d] (min=%d, max=%d, disabled=0)\n", rx_prefetch_bytes_before_poll, MCE_MIN_RX_PREFETCH_BYTES, MCE_MAX_RX_PREFETCH_BYTES);
rx_prefetch_bytes_before_poll = MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL;
}
if ((env_ptr = getenv(SYS_VAR_RX_CQ_DRAIN_RATE_NSEC)) != NULL)
rx_cq_drain_rate_nsec = atoi(env_ptr);
// Update the rx cq polling rate for draining logic
tscval_t tsc_per_second = get_tsc_rate_per_second();
rx_delta_tsc_between_cq_polls = tsc_per_second * rx_cq_drain_rate_nsec / NSEC_PER_SEC;
if ((env_ptr = getenv(SYS_VAR_GRO_STREAMS_MAX)) != NULL)
gro_streams_max = MAX(atoi(env_ptr), 0);
if ((env_ptr = getenv(SYS_VAR_TCP_3T_RULES)) != NULL)
tcp_3t_rules = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_UDP_3T_RULES)) != NULL)
udp_3t_rules = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_ETH_MC_L2_ONLY_RULES)) != NULL)
eth_mc_l2_only_rules = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_MC_FORCE_FLOWTAG)) != NULL)
mc_force_flowtag = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_SELECT_NUM_POLLS)) != NULL)
select_poll_num = atoi(env_ptr);
if (select_poll_num < MCE_MIN_RX_NUM_POLLS || select_poll_num > MCE_MAX_RX_NUM_POLLS) {
vlog_printf(VLOG_WARNING," Select Poll loops can not be below zero [%d]\n", select_poll_num);
select_poll_num = MCE_DEFAULT_SELECT_NUM_POLLS;
}
if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_FORCE)) != NULL)
select_poll_os_force = (uint32_t)atoi(env_ptr);
if (select_poll_os_force) {
select_poll_os_ratio = 1;
select_skip_os_fd_check = 1;
}
if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_RATIO)) != NULL)
select_poll_os_ratio = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_SELECT_SKIP_OS)) != NULL)
select_skip_os_fd_check = (uint32_t)atoi(env_ptr);
#ifdef DEFINED_IBV_CQ_ATTR_MODERATE
if (rx_poll_num < 0 || select_poll_num < 0) {
cq_moderation_enable = false;
}
if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_ENABLE)) != NULL)
cq_moderation_enable = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_COUNT)) != NULL)
cq_moderation_count = (uint32_t)atoi(env_ptr);
if (cq_moderation_count > rx_num_wr / 2) {
cq_moderation_count = rx_num_wr / 2;
}
if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_PERIOD_USEC)) != NULL)
cq_moderation_period_usec = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_COUNT)) != NULL)
cq_aim_max_count = (uint32_t)atoi(env_ptr);
if (cq_aim_max_count > rx_num_wr / 2){
cq_aim_max_count = rx_num_wr / 2;
}
if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_PERIOD_USEC)) != NULL)
cq_aim_max_period_usec = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERVAL_MSEC)) != NULL)
cq_aim_interval_msec = (uint32_t)atoi(env_ptr);
if (!cq_moderation_enable) {
cq_aim_interval_msec = MCE_CQ_ADAPTIVE_MODERATION_DISABLED;
}
if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC)) != NULL)
cq_aim_interrupts_rate_per_sec = (uint32_t)atoi(env_ptr);
#else
if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_ENABLE)) != NULL) {
vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_MODERATION_ENABLE);
}
if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_COUNT)) != NULL) {
vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_MODERATION_COUNT);
}
if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_PERIOD_USEC)) != NULL) {
vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_MODERATION_PERIOD_USEC);
}
if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_COUNT)) != NULL) {
vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_AIM_MAX_COUNT);
}
if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_PERIOD_USEC)) != NULL) {
vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_AIM_MAX_PERIOD_USEC);
}
if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERVAL_MSEC)) != NULL) {
vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_AIM_INTERVAL_MSEC);
}
if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC)) != NULL) {
vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC);
}
#endif /* DEFINED_IBV_CQ_ATTR_MODERATE */
if ((env_ptr = getenv(SYS_VAR_CQ_POLL_BATCH_MAX)) != NULL)
cq_poll_batch_max = (uint32_t)atoi(env_ptr);
if (cq_poll_batch_max < MCE_MIN_CQ_POLL_BATCH || cq_poll_batch_max > MCE_MAX_CQ_POLL_BATCH) {
vlog_printf(VLOG_WARNING," Rx number of cq poll batchs should be between %d and %d [%d]\n", MCE_MIN_CQ_POLL_BATCH, MCE_MAX_CQ_POLL_BATCH, cq_poll_batch_max);
cq_poll_batch_max = MCE_DEFAULT_CQ_POLL_BATCH;
}
if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL)) != NULL)
progress_engine_interval_msec = (uint32_t)atoi(env_ptr);
if (enable_socketxtreme && (progress_engine_interval_msec != MCE_CQ_DRAIN_INTERVAL_DISABLED)) {
progress_engine_interval_msec = MCE_CQ_DRAIN_INTERVAL_DISABLED;
vlog_printf(VLOG_DEBUG,"%s parameter is forced to %d in case %s is enabled\n",
SYS_VAR_PROGRESS_ENGINE_INTERVAL, progress_engine_interval_msec, SYS_VAR_SOCKETXTREME);
}
if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_WCE_MAX)) != NULL)
progress_engine_wce_max = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_CQ_KEEP_QP_FULL)) != NULL)
cq_keep_qp_full = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_QP_COMPENSATION_LEVEL)) != NULL)
qp_compensation_level = (uint32_t)atoi(env_ptr);
if (qp_compensation_level < rx_num_wr_to_post_recv)
qp_compensation_level = rx_num_wr_to_post_recv;
if ((env_ptr = getenv(SYS_VAR_OFFLOADED_SOCKETS)) != NULL)
offloaded_sockets = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_TIMER_RESOLUTION_MSEC)) != NULL)
timer_resolution_msec = atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_TCP_TIMER_RESOLUTION_MSEC)) != NULL)
tcp_timer_resolution_msec = atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_TCP_TIMER_HANDLING)) != NULL) {
internal_thread_tcp_timer_handling =
atoi(env_ptr) == 1 ? INTERNAL_THREAD_TCP_TIMER_HANDLING_IMMEDIATE : INTERNAL_THREAD_TCP_TIMER_HANDLING_DEFERRED;
}
if ((env_ptr = getenv(SYS_VAR_TCP_CTL_THREAD)) != NULL) {
tcp_ctl_thread = (tcp_ctl_thread_t)atoi(env_ptr);
if (tcp_ctl_thread >= CTL_THREAD_LAST || tcp_ctl_thread < 0)
tcp_ctl_thread = MCE_DEFAULT_TCP_CTL_THREAD;
}
if ((env_ptr = getenv(SYS_VAR_TCP_TIMESTAMP_OPTION)) != NULL) {
tcp_ts_opt = (tcp_ts_opt_t)atoi(env_ptr);
if ((uint32_t) tcp_ts_opt >= TCP_TS_OPTION_LAST) {
vlog_printf(VLOG_WARNING,"TCP timestamp option value is out of range [%d] (min=%d, max=%d). using default [%d]\n", tcp_ts_opt, TCP_TS_OPTION_DISABLE , TCP_TS_OPTION_LAST - 1, MCE_DEFAULT_TCP_TIMESTAMP_OPTION);
tcp_ts_opt = MCE_DEFAULT_TCP_TIMESTAMP_OPTION;
}
}
if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY)) != NULL) {
tcp_nodelay = atoi(env_ptr) ? true : false;
}
if ((env_ptr = getenv(SYS_VAR_TCP_QUICKACK)) != NULL) {
tcp_quickack = atoi(env_ptr) ? true : false;
}
// TODO: this should be replaced by calling "exception_handling.init()" that will be called from init()
if ((env_ptr = getenv(vma_exception_handling::getSysVar())) != NULL) {
exception_handling = vma_exception_handling(strtol(env_ptr, NULL, 10)); // vma_exception_handling is responsible for its invariant
}
if ((env_ptr = getenv(SYS_VAR_AVOID_SYS_CALLS_ON_TCP_FD)) != NULL) {
avoid_sys_calls_on_tcp_fd = atoi(env_ptr) ? true : false;
}
if ((env_ptr = getenv(SYS_VAR_ALLOW_PRIVILEGED_SOCK_OPT)) != NULL) {
allow_privileged_sock_opt = atoi(env_ptr) ? true : false;
}
if(tcp_timer_resolution_msec < timer_resolution_msec){
vlog_printf(VLOG_WARNING," TCP timer resolution [%s=%d] cannot be smaller than timer resolution [%s=%d]. Setting TCP timer resolution to %d msec.\n", SYS_VAR_TCP_TIMER_RESOLUTION_MSEC, tcp_timer_resolution_msec, SYS_VAR_TIMER_RESOLUTION_MSEC, timer_resolution_msec, timer_resolution_msec);
tcp_timer_resolution_msec = timer_resolution_msec;
}
if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_ARM_CQ)) != NULL)
internal_thread_arm_cq_enabled = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_CPUSET)) != NULL) {
snprintf(internal_thread_cpuset, FILENAME_MAX, "%s", env_ptr);
}
// handle internal thread affinity - default is CPU-0
if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_AFFINITY)) != NULL) {
int n = snprintf(internal_thread_affinity_str,
sizeof(internal_thread_affinity_str), "%s", env_ptr);
if (unlikely(((int)sizeof(internal_thread_affinity_str) < n) || (n < 0))) {
vlog_printf(VLOG_WARNING,"Failed to process: %s.\n",
SYS_VAR_INTERNAL_THREAD_AFFINITY);
}
}
if (env_to_cpuset(internal_thread_affinity_str, &internal_thread_affinity)) {
vlog_printf(VLOG_WARNING," Failed to set internal thread affinity: %s... deferring to cpu-0.\n",
internal_thread_affinity_str);
}
if ((env_ptr = getenv(SYS_VAR_WAIT_AFTER_JOIN_MSEC)) != NULL)
wait_after_join_msec = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_THREAD_MODE)) != NULL) {
thread_mode = (thread_mode_t)atoi(env_ptr);
if (thread_mode < 0 || thread_mode >= THREAD_MODE_LAST)
thread_mode = MCE_DEFAULT_THREAD_MODE;
}
if ((env_ptr = getenv(SYS_VAR_BUFFER_BATCHING_MODE)) != NULL) {
buffer_batching_mode = (buffer_batching_mode_t)atoi(env_ptr);
if (buffer_batching_mode < 0 || buffer_batching_mode >= BUFFER_BATCHING_LAST)
buffer_batching_mode = MCE_DEFAULT_BUFFER_BATCHING_MODE;
}
if (buffer_batching_mode == BUFFER_BATCHING_NONE) {
tx_bufs_batch_tcp = 1;
tx_bufs_batch_udp = 1;
rx_bufs_batch = 1;
}
if ((env_ptr = getenv(SYS_VAR_NETLINK_TIMER_MSEC)) != NULL)
timer_netlink_update_msec = (uint32_t)atoi(env_ptr);
if((env_ptr = getenv(SYS_VAR_NEIGH_NUM_ERR_RETRIES))!= NULL) {
neigh_num_err_retries = (uint32_t)atoi(env_ptr);
}
if((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_DELAY_MSEC)) != NULL){
neigh_wait_till_send_arp_msec = (uint32_t)atoi(env_ptr);
}
if((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_QUATA)) != NULL){
neigh_uc_arp_quata = (uint32_t)atoi(env_ptr);
}
if ((getenv(SYS_VAR_HUGETBL)) != NULL)
{
vlog_printf(VLOG_WARNING, "**********************************************************************************************************************\n");
vlog_printf(VLOG_WARNING, "The '%s' parameter is no longer supported, please refer to '%s' in README.txt for more info\n", SYS_VAR_HUGETBL, SYS_VAR_MEM_ALLOC_TYPE);
vlog_printf(VLOG_WARNING, "**********************************************************************************************************************\n");
}
if ((env_ptr = getenv(SYS_VAR_MEM_ALLOC_TYPE)) != NULL)
mem_alloc_type = (alloc_mode_t)atoi(env_ptr);
if (mem_alloc_type < 0 || mem_alloc_type >= ALLOC_TYPE_LAST_ALLOWED_TO_USE)
mem_alloc_type = MCE_DEFAULT_MEM_ALLOC_TYPE;
if (mce_sys_var::HYPER_MSHV == hypervisor && mem_alloc_type == ALLOC_TYPE_CONTIG) {
char mem_str[sizeof(int) + 1] = {0};
len = snprintf(mem_str, sizeof(mem_str), "%d", ALLOC_TYPE_HUGEPAGES);
if (likely((0 < len) && (len < (int)sizeof(mem_str)))) {
setenv(SYS_VAR_MEM_ALLOC_TYPE, mem_str, 1); // Setenv to avoid core dump while valgrind is used.
}
vlog_printf(VLOG_DEBUG, "The '%s' parameter can not be %d for %s.\n",
SYS_VAR_MEM_ALLOC_TYPE, mem_alloc_type, cpuid_hv_vendor());
mem_alloc_type = ALLOC_TYPE_HUGEPAGES;
}
if ((env_ptr = getenv(SYS_VAR_BF)) != NULL)
handle_bf = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_FORK)) != NULL)
handle_fork = atoi(env_ptr) ? true : false;
if((env_ptr = getenv(SYS_VAR_IPOIB )) != NULL)
enable_ipoib = atoi(env_ptr) ? true : false;
#ifdef DEFINED_TSO
if((env_ptr = getenv(SYS_VAR_TSO)) != NULL)
enable_tso = atoi(env_ptr) ? true : false;
if (enable_tso && (ring_migration_ratio_tx != -1)) {
ring_migration_ratio_tx = -1;
vlog_printf(VLOG_DEBUG,"%s parameter is forced to %d in case %s is enabled\n",
SYS_VAR_RING_MIGRATION_RATIO_TX, -1, SYS_VAR_TSO);
}
#endif /* DEFINED_TSO */
if ((env_ptr = getenv(SYS_VAR_CLOSE_ON_DUP2)) != NULL)
close_on_dup2 = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_MTU)) != NULL)
mtu = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_MSS)) != NULL)
lwip_mss = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_TCP_CC_ALGO)) != NULL)
lwip_cc_algo_mod = (uint32_t)atoi(env_ptr);
if ((env_ptr = getenv(SYS_VAR_VMA_RX_POLL_ON_TX_TCP)) != NULL)
rx_poll_on_tx_tcp = atoi(env_ptr) ? true : false;
if ((env_ptr = getenv(SYS_VAR_VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME)) != NULL)
trigger_dummy_send_getsockname = atoi(env_ptr) ? true : false;
#ifdef VMA_TIME_MEASURE
if ((env_ptr = getenv(SYS_VAR_VMA_TIME_MEASURE_NUM_SAMPLES)) != NULL) {
vma_time_measure_num_samples = (uint32_t)atoi(env_ptr);
if(vma_time_measure_num_samples > INST_SIZE){
vlog_printf(VLOG_WARNING, "The value of '%s' is bigger than %d. Time samples over %d will be dropped.\n", SYS_VAR_VMA_TIME_MEASURE_NUM_SAMPLES, INST_SIZE, INST_SIZE);
}
}
if ((env_ptr = getenv(SYS_VAR_VMA_TIME_MEASURE_DUMP_FILE)) != NULL){
read_env_variable_with_pid(vma_time_measure_filename, sizeof(vma_time_measure_filename), env_ptr);
}
#endif
}
void set_env_params()
{
	// Environment overrides must run only after every getenv() above has
	// completed: /bin/sh ships a custom setenv() which replaces the
	// original environment block.
	//setenv("MLX4_SINGLE_THREADED", "1", 0);

	/*
	 * MLX4_DEVICE_FATAL_CLEANUP / MLX5_DEVICE_FATAL_CLEANUP /
	 * RDMAV_ALLOW_DISASSOC_DESTROY instruct the ibv_destroy family of
	 * functions to report success even when called after the device was
	 * removed, allowing resources to be torn down in DEVICE_FATAL state.
	 */
	setenv("MLX4_DEVICE_FATAL_CLEANUP", "1", 1);
	setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1);
	setenv("RDMAV_ALLOW_DISASSOC_DESTROY", "1", 1);

	// Propagate the BlueFlame preference to both the mlx4 and mlx5
	// providers.
	// todo - disabling seems ineffective while inlining is on, since
	// libmlx decides to use BF with (inl || bf).
	const char *bf_preference = safe_mce_sys().handle_bf ? "1" : "0";
	setenv("MLX4_POST_SEND_PREFER_BF", bf_preference, 1);
	setenv("MLX5_POST_SEND_PREFER_BF", bf_preference, 1);

	// Map the configured memory allocation mode to the provider QP/CQ
	// allocation hint (identical value for both variables).
	const char *qp_cq_alloc_type;
	switch (safe_mce_sys().mem_alloc_type) {
	case ALLOC_TYPE_ANON:
		qp_cq_alloc_type = "ANON";
		break;
	case ALLOC_TYPE_HUGEPAGES:
		setenv("RDMAV_HUGEPAGES_SAFE", "1", 0);
		qp_cq_alloc_type = "ALL";
		break;
	case ALLOC_TYPE_CONTIG:
	default:
		qp_cq_alloc_type = "PREFER_CONTIG";
		break;
	}
	// Overwrite flag is 0: never clobber a value the user already
	// exported.
	setenv("MLX_QP_ALLOC_TYPE", qp_cq_alloc_type, 0);
	setenv("MLX_CQ_ALLOC_TYPE", qp_cq_alloc_type, 0);
}