/*
* Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <dev/ring_profile.h>
#include "vma/dev/ring_allocation_logic.h"

#define MODULE_NAME             "ral"

#undef  MODULE_HDR_INFO
#define MODULE_HDR_INFO         MODULE_NAME "%s:%d:%s() "
#undef  __INFO__
#define __INFO__                m_tostr.c_str()

#define ral_logpanic            __log_info_panic
#define ral_logerr              __log_info_err
#define ral_logwarn             __log_info_warn
#define ral_loginfo             __log_info_info
#define ral_logdbg              __log_info_dbg
#define ral_logfunc             __log_info_func
#define ral_logfuncall          __log_info_funcall

ring_allocation_logic::ring_allocation_logic():
    m_ring_migration_ratio(0),
    m_source(-1),
    m_migration_try_count(0),
    m_migration_candidate(0),
    m_active(true),
    m_res_key()
{
}

ring_allocation_logic::ring_allocation_logic(ring_logic_t allocation_logic,
                                             int ring_migration_ratio, source_t source,
                                             resource_allocation_key &ring_profile):
    m_tostr("base"), m_ring_migration_ratio(ring_migration_ratio),
    m_source(source), m_migration_try_count(ring_migration_ratio)
{
    if (ring_profile.get_ring_alloc_logic() == RING_LOGIC_PER_INTERFACE &&
        ring_profile.get_ring_profile_key() < START_RING_INDEX) {
        ring_profile.set_ring_alloc_logic(allocation_logic);
    }
    m_res_key = resource_allocation_key(ring_profile);
    m_migration_candidate = 0;
    m_res_key.set_user_id_key(calc_res_key_by_logic());
    m_active = true;
}
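
/*
 * Illustrative construction (comment only, not compiled). The argument names
 * below are hypothetical stand-ins; the real call sites pass the configured
 * ring logic, the migration ratio and the owning socket's source_t/profile:
 *
 *   ring_allocation_logic ral(RING_LOGIC_PER_THREAD,  // requested logic
 *                             migration_ratio,        // e.g. from VMA_RING_MIGRATION_RATIO_TX
 *                             my_source,              // source_t describing the owner (fd/ip)
 *                             my_profile);            // resource_allocation_key, possibly rewritten above
 */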

/**
 * @return the key that forms part of a unique ring id in the rings map
 */
uint64_t ring_allocation_logic::calc_res_key_by_logic()
{
    uint64_t res_key = 0;

    switch (m_res_key.get_ring_alloc_logic()) {
    case RING_LOGIC_PER_INTERFACE:
        res_key = 0;
        // use key 1 instead of 0 when a TCP control thread is enabled
        if (safe_mce_sys().tcp_ctl_thread > CTL_THREAD_DISABLE)
            res_key = 1;
        break;
    case RING_LOGIC_PER_IP:
        res_key = m_source.m_ip;
        break;
    case RING_LOGIC_PER_SOCKET:
        res_key = m_source.m_fd;
        break;
    case RING_LOGIC_PER_USER_ID:
        res_key = m_res_key.get_user_id_key();
        break;
    case RING_LOGIC_PER_THREAD:
        res_key = pthread_self();
        break;
    case RING_LOGIC_PER_CORE:
    case RING_LOGIC_PER_CORE_ATTACH_THREADS:
        res_key = sched_getcpu();
        break;
    BULLSEYE_EXCLUDE_BLOCK_START
    default:
        // not supposed to get here
        ral_logdbg("invalid ring logic = %d", m_res_key.get_ring_alloc_logic());
        break;
    BULLSEYE_EXCLUDE_BLOCK_END
    }
    return res_key;
}
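
/*
 * Rough sketch of the resulting keys (example values only, to illustrate the
 * mapping above, not taken from a real run):
 *
 *   RING_LOGIC_PER_INTERFACE             -> 0 (1 when a TCP control thread is enabled)
 *   RING_LOGIC_PER_IP                    -> source ip, e.g. 0x0100007f for 127.0.0.1
 *   RING_LOGIC_PER_SOCKET                -> source fd, e.g. 42
 *   RING_LOGIC_PER_USER_ID               -> the user id key already stored in the profile
 *   RING_LOGIC_PER_THREAD                -> pthread_self() of the calling thread
 *   RING_LOGIC_PER_CORE(_ATTACH_THREADS) -> sched_getcpu() of the calling thread
 */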

resource_allocation_key* ring_allocation_logic::create_new_key(in_addr_t addr, int suggested_cpu /* = NO_CPU */)
{
    if (m_res_key.get_ring_alloc_logic() == RING_LOGIC_PER_CORE_ATTACH_THREADS) {
        pthread_t tid = pthread_self();
        int cpu = g_cpu_manager.reserve_cpu_for_thread(tid, suggested_cpu);
        if (cpu >= 0) {
            m_res_key.set_user_id_key(cpu);
            return &m_res_key;
        }
    }

    if (m_res_key.get_ring_alloc_logic() == RING_LOGIC_PER_IP) {
        m_source.m_ip = addr;
    }

    m_res_key.set_user_id_key(calc_res_key_by_logic());
    return &m_res_key;
}
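
/*
 * Illustrative call (comment only): a caller refreshes the key right before
 * looking the ring up in the rings map. dst_ip is a hypothetical in_addr_t:
 *
 *   resource_allocation_key *key = ral.create_new_key(dst_ip);
 *   // with RING_LOGIC_PER_CORE_ATTACH_THREADS the calling thread is now pinned
 *   // to the reserved cpu and the key carries that cpu number
 */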

/*
 * Return true if ring migration is recommended.
 */
bool ring_allocation_logic::should_migrate_ring()
{
    ral_logfuncall("currently accessed from thread=%lu, cpu=%d", pthread_self(), sched_getcpu());

    if (false == m_active) {
        return false;
    }

    int count_max = m_ring_migration_ratio;
    if (m_migration_candidate) {
        // a pending candidate must stay stable for CANDIDATE_STABILITY_ROUNDS
        // consecutive checks before we actually migrate
        count_max = CANDIDATE_STABILITY_ROUNDS;
        uint64_t new_id = calc_res_key_by_logic();
        if (m_migration_candidate != new_id) {
            m_migration_candidate = 0;
            m_migration_try_count = 0;
            return false;
        }
    }

    if (m_migration_try_count < count_max) {
        m_migration_try_count++;
        return false;
    } else {
        m_migration_try_count = 0;
    }
    if (!m_migration_candidate) {
        // save the currently used allocation key
        // (no need to save the profile or the allocation logic)
        uint64_t curr_id = m_res_key.get_user_id_key();
        // calculate the new key
        uint64_t new_id = calc_res_key_by_logic();
        // never pick the internal thread's key as a migration target
        if (new_id == curr_id || g_n_internal_thread_id == new_id) {
            return false;
        }
        m_migration_candidate = new_id;
        return false;
    }

    ral_logdbg("migrating from ring of id=%s to ring of id=%lu",
               m_res_key.to_str(), m_migration_candidate);
    m_migration_candidate = 0;
    return true;
}
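
/*
 * Sketch of the intended polling pattern (comment only): the ring's owner
 * checks on its fast path and only rebinds once the same candidate key has
 * been seen for CANDIDATE_STABILITY_ROUNDS consecutive checks. migrate_to()
 * and dst_ip are hypothetical stand-ins for the caller's actual rebind step:
 *
 *   if (ral.should_migrate_ring()) {
 *       migrate_to(*ral.create_new_key(dst_ip));  // detach old ring, attach new one
 *   }
 */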

cpu_manager g_cpu_manager;
__thread int g_n_thread_cpu_core = NO_CPU;

cpu_manager::cpu_manager()
{
    reset();
}

void cpu_manager::reset()
{
    memset(m_cpu_thread_count, 0, sizeof(m_cpu_thread_count));
}

int cpu_manager::reserve_cpu_for_thread(pthread_t tid, int suggested_cpu /* = NO_CPU */)
{
    lock();
    int cpu = g_n_thread_cpu_core;
    if (cpu != NO_CPU) { // already reserved
        unlock();
        return cpu;
    }

    cpu_set_t cpu_set;
    CPU_ZERO(&cpu_set);
    int ret = pthread_getaffinity_np(tid, sizeof(cpu_set_t), &cpu_set);
    if (ret) {
        unlock();
        __log_err("pthread_getaffinity_np failed for tid=%lu, ret=%d (errno=%d %m)", tid, ret, errno);
        return -1;
    }

    int avail_cpus = CPU_COUNT(&cpu_set);
    if (avail_cpus == 0) {
        unlock();
        __log_err("no cpu available for tid=%lu", tid);
        return -1;
    }

    if (avail_cpus == 1) { // already attached to a single cpu
        for (cpu = 0; cpu < MAX_CPU && !CPU_ISSET(cpu, &cpu_set); cpu++) {}
    } else { // need to choose one cpu to attach to
        // pick the allowed cpu currently serving the fewest reserved threads
        int min_cpu_count = -1;
        for (int i = 0, j = 0; i < MAX_CPU && j < avail_cpus; i++) {
            if (!CPU_ISSET(i, &cpu_set))
                continue;
            j++;
            if (min_cpu_count < 0 || m_cpu_thread_count[i] < min_cpu_count) {
                min_cpu_count = m_cpu_thread_count[i];
                cpu = i;
            }
        }
        // honor the caller's suggestion if it is allowed and not noticeably more loaded
        if (suggested_cpu >= 0
            && CPU_ISSET(suggested_cpu, &cpu_set)
            && m_cpu_thread_count[suggested_cpu] <= min_cpu_count + 1) {
            cpu = suggested_cpu;
        }
        CPU_ZERO(&cpu_set);
        CPU_SET(cpu, &cpu_set);
        __log_dbg("attach tid=%lu running on cpu=%d to cpu=%d", tid, sched_getcpu(), cpu);
        ret = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpu_set);
        if (ret) {
            unlock();
            __log_err("pthread_setaffinity_np failed for tid=%lu to cpu=%d, ret=%d (errno=%d %m)", tid, cpu, ret, errno);
            return -1;
        }
    }

    g_n_thread_cpu_core = cpu;
    if (cpu > NO_CPU && cpu < MAX_CPU)
        m_cpu_thread_count[cpu]++;
    unlock();
    return cpu;
}
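
/*
 * Illustrative usage (comment only): RING_LOGIC_PER_CORE_ATTACH_THREADS calls
 * this from create_new_key() above. Note the side effect: when the thread was
 * allowed more than one cpu, its affinity is narrowed to the single chosen cpu.
 *
 *   int cpu = g_cpu_manager.reserve_cpu_for_thread(pthread_self());
 *   if (cpu >= 0) {
 *       // this thread is now accounted for in m_cpu_thread_count[cpu]
 *   }
 */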