/*
* BSD LICENSE
*
* Copyright(c) 2020 Intel Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
/**
* @brief Implementation of HW PQoS monitoring API.
*
* CPUID and MSR operations are done on the 'local' system.
*
*/
#include <stdlib.h>
#include <string.h>
#include "cap.h"
#include "cpu_registers.h"
#include "log.h"
#include "hw_monitoring.h"
#include "machine.h"
#include "monitoring.h"
#include "perf_monitoring.h"
/**
* ---------------------------------------
* Local macros
* ---------------------------------------
*/
/**
* Special RMID - after reset all cores are associated with it.
*
* The assumption is that if a core is not assigned to RMID0
* then it is subject to monitoring activity by a different process.
*/
#define RMID0 (0)
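/*
* Illustration: a core counts as free for monitoring when its current
* association reads back as RMID0. A minimal sketch using the internal
* helper declared further below:
*
* @code
* pqos_rmid_t rmid;
* int core_is_free = (mon_assoc_get(lcore, &rmid) == PQOS_RETVAL_OK &&
*                     rmid == RMID0);
* @endcode
*/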
/**
* ---------------------------------------
* Local data types
* ---------------------------------------
*/
/**
* ---------------------------------------
* Local data structures
* ---------------------------------------
*/
static unsigned m_rmid_max = 0; /**< max RMID */
#ifdef PQOS_RMID_CUSTOM
/* clang-format off */
/** Custom RMID configuration */
static struct pqos_rmid_config rmid_cfg = {PQOS_RMID_TYPE_DEFAULT,
{0, NULL, NULL} };
/* clang-format on */
#endif
/** List of non-virtual perf events */
static const enum pqos_mon_event perf_event[] = {
PQOS_PERF_EVENT_LLC_MISS, (enum pqos_mon_event)PQOS_PERF_EVENT_CYCLES,
(enum pqos_mon_event)PQOS_PERF_EVENT_INSTRUCTIONS};
/**
* ---------------------------------------
* Local Functions
* ---------------------------------------
*/
static int mon_assoc_set(const unsigned lcore, const pqos_rmid_t rmid);
static int mon_assoc_get(const unsigned lcore, pqos_rmid_t *rmid);
static int mon_read(const unsigned lcore,
const pqos_rmid_t rmid,
const unsigned event,
uint64_t *value);
static unsigned get_event_id(const enum pqos_mon_event event);
static uint64_t scale_event(const enum pqos_mon_event event,
const uint64_t val);
/*
* =======================================
* =======================================
*
* initialize and shutdown
*
* =======================================
* =======================================
*/
int
hw_mon_init(const struct pqos_cpuinfo *cpu,
const struct pqos_cap *cap,
const struct pqos_config *cfg)
{
int ret;
const struct pqos_capability *item = NULL;
UNUSED_PARAM(cpu);
ret = pqos_cap_get_type(cap, PQOS_CAP_TYPE_MON, &item);
if (ret != PQOS_RETVAL_OK)
return PQOS_RETVAL_RESOURCE;
m_rmid_max = item->u.mon->max_rmid;
if (m_rmid_max == 0) {
ret = PQOS_RETVAL_PARAM;
goto hw_mon_init_exit;
}
LOG_DEBUG("Max RMID per monitoring cluster is %u\n", m_rmid_max);
#ifdef __linux__
ret = perf_mon_init(cpu, cap);
/* perf MBM is not supported */
if (ret == PQOS_RETVAL_RESOURCE)
ret = PQOS_RETVAL_OK;
else if (ret != PQOS_RETVAL_OK)
goto hw_mon_init_exit;
#endif
#ifdef PQOS_RMID_CUSTOM
rmid_cfg.type = cfg->rmid_cfg.type;
if (cfg->rmid_cfg.type == PQOS_RMID_TYPE_MAP) {
const unsigned num = cfg->rmid_cfg.map.num;
unsigned i;
if (cfg->rmid_cfg.map.core == NULL ||
cfg->rmid_cfg.map.rmid == NULL) {
ret = PQOS_RETVAL_PARAM;
goto hw_mon_init_exit;
}
rmid_cfg.map.num = num;
rmid_cfg.map.core = (unsigned *)malloc(sizeof(unsigned) * num);
rmid_cfg.map.rmid =
(pqos_rmid_t *)malloc(sizeof(pqos_rmid_t) * num);
if (rmid_cfg.map.core == NULL || rmid_cfg.map.rmid == NULL) {
ret = PQOS_RETVAL_RESOURCE;
goto hw_mon_init_exit;
}
for (i = 0; i < num; i++) {
rmid_cfg.map.core[i] = cfg->rmid_cfg.map.core[i];
rmid_cfg.map.rmid[i] = cfg->rmid_cfg.map.rmid[i];
}
}
#else
UNUSED_PARAM(cfg);
#endif
hw_mon_init_exit:
if (ret != PQOS_RETVAL_OK)
hw_mon_fini();
return ret;
}
int
hw_mon_fini(void)
{
m_rmid_max = 0;
#ifdef __linux__
perf_mon_fini();
#endif
#ifdef PQOS_RMID_CUSTOM
if (rmid_cfg.map.core != NULL) {
free(rmid_cfg.map.core);
rmid_cfg.map.core = NULL;
}
if (rmid_cfg.map.rmid != NULL) {
free(rmid_cfg.map.rmid);
rmid_cfg.map.rmid = NULL;
}
#endif
return PQOS_RETVAL_OK;
}
/*
* =======================================
* =======================================
*
* RMID allocation
*
* =======================================
* =======================================
*/
/**
* @brief Gets max RMID number for given \a event
*
* @param [out] rmid place to store the max RMID supported by \a event
* @param [in] event Monitoring event type
*
* @return Operations status
*/
static int
rmid_get_event_max(pqos_rmid_t *rmid, const enum pqos_mon_event event)
{
pqos_rmid_t max_rmid = m_rmid_max;
const struct pqos_capability *item = NULL;
const struct pqos_cap_mon *mon = NULL;
unsigned mask_found = 0;
unsigned i;
int ret;
if (rmid == NULL)
return PQOS_RETVAL_PARAM;
/**
* This is not as straightforward as it appears.
* We first have to figure out the max RMID
* for the given event type. In order to do so we need to:
* - go through the capabilities structure
* - find the monitoring capability
* - look for the \a event in the event list
* - find the max RMID matching the \a event
*/
ret = _pqos_cap_get_type(PQOS_CAP_TYPE_MON, &item);
if (ret != PQOS_RETVAL_OK)
return ret;
ASSERT(item != NULL);
mon = item->u.mon;
/* Find which events are supported vs requested */
for (i = 0; i < mon->num_events; i++)
if (event & mon->events[i].type) {
mask_found |= mon->events[i].type;
max_rmid = (max_rmid > mon->events[i].max_rmid)
? mon->events[i].max_rmid
: max_rmid;
}
/**
* Check if all of the events are supported
*/
if (event != mask_found || max_rmid == 0)
return PQOS_RETVAL_ERROR;
ASSERT(m_rmid_max >= max_rmid);
*rmid = max_rmid;
return PQOS_RETVAL_OK;
}
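/*
* Example (illustrative RMID counts): if LLC occupancy supports
* 224 RMIDs and local memory bandwidth supports 128, a request for
* both events yields max_rmid = 128, i.e. the minimum across the
* requested events.
*/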
/**
* @brief Allocates an unused RMID on ctx->cluster for \a event
*
* Reads current core associations in the cluster to find a free RMID.
*
* @param [inout] ctx poll context
* @param [in] event Monitoring event type
*
* @return Operations status
*/
static int
rmid_alloc(struct pqos_mon_poll_ctx *ctx, const enum pqos_mon_event event)
{
const struct pqos_cpuinfo *cpu;
int ret = PQOS_RETVAL_OK;
unsigned max_rmid = 0;
unsigned *core_list = NULL;
unsigned i, core_count;
pqos_rmid_t *rmid_list = NULL;
ASSERT(ctx != NULL);
_pqos_cap_get(NULL, &cpu);
/* Getting max RMID for given event */
ret = rmid_get_event_max(&max_rmid, event);
if (ret != PQOS_RETVAL_OK)
return ret;
/**
* Check for free RMID in the cluster by reading current associations.
*/
core_list = pqos_cpu_get_cores_l3id(cpu, ctx->cluster, &core_count);
if (core_list == NULL)
return PQOS_RETVAL_ERROR;
ASSERT(core_count > 0);
rmid_list = (pqos_rmid_t *)malloc(sizeof(rmid_list[0]) * core_count);
if (rmid_list == NULL) {
ret = PQOS_RETVAL_RESOURCE;
goto rmid_alloc_error;
}
for (i = 0; i < core_count; i++) {
ret = mon_assoc_get(core_list[i], &rmid_list[i]);
if (ret != PQOS_RETVAL_OK)
goto rmid_alloc_error;
}
ret = PQOS_RETVAL_ERROR;
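/*
* Linear scan for a free RMID: try RMIDs 1 .. max_rmid-1 and pick
* the first one that no core in the cluster is associated with,
* e.g. if the cores currently hold RMIDs {0, 0, 2}, the scan
* settles on RMID 1.
*/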
for (i = 1; i < max_rmid; i++) {
unsigned j = 0;
for (j = 0; j < core_count; j++)
if (i == rmid_list[j])
break;
if (j >= core_count) {
ret = PQOS_RETVAL_OK;
ctx->rmid = i;
break;
}
}
rmid_alloc_error:
if (rmid_list != NULL)
free(rmid_list);
if (core_list != NULL)
free(core_list);
return ret;
}
#ifdef PQOS_RMID_CUSTOM
/**
* @brief Gets RMID value based on information stored in rmid_cfg
*
* @param [inout] ctx poll context
* @param [in] event Monitoring event type
* @param [in] rmid_cfg rmid configuration parameters
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
*/
static int
rmid_alloc_custom(struct pqos_mon_poll_ctx *ctx,
const enum pqos_mon_event event,
const struct pqos_rmid_config *rmid_cfg)
{
if (ctx == NULL)
return PQOS_RETVAL_PARAM;
if (rmid_cfg == NULL || rmid_cfg->type == PQOS_RMID_TYPE_DEFAULT) {
return rmid_alloc(ctx, event);
} else if (rmid_cfg->type == PQOS_RMID_TYPE_MAP) {
unsigned i;
for (i = 0; i < rmid_cfg->map.num; i++) {
if (ctx->lcore == rmid_cfg->map.core[i]) {
ctx->rmid = rmid_cfg->map.rmid[i];
return PQOS_RETVAL_OK;
}
}
LOG_ERROR("RMID Custom: No mapping for core %u\n", ctx->lcore);
} else {
LOG_ERROR("RMID Custom: Unsupported rmid type: %u\n",
rmid_cfg->type);
return PQOS_RETVAL_PARAM;
}
return PQOS_RETVAL_ERROR;
}
#endif
/*
* =======================================
* =======================================
*
* Monitoring
*
* =======================================
* =======================================
*/
/**
* @brief Scale event values to bytes
*
* Retrieve event scale factor and scale value to bytes
*
* @param [in] event monitoring event to retrieve the scale factor for
* @param [in] val value to be scaled
*
* @return scaled value
* @retval value in bytes
*/
static uint64_t
scale_event(const enum pqos_mon_event event, const uint64_t val)
{
const struct pqos_cap *cap;
const struct pqos_monitor *pmon;
int ret;
_pqos_cap_get(&cap, NULL);
ASSERT(cap != NULL);
ret = pqos_cap_get_event(cap, event, &pmon);
ASSERT(ret == PQOS_RETVAL_OK);
if (ret != PQOS_RETVAL_OK)
return val;
else
return val * pmon->scale_factor;
}
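/*
* Example (illustrative scale factor): with a CPUID-reported
* upscaling factor of 65536, a raw LLC occupancy count of 100
* scales to 100 * 65536 = 6553600 bytes.
*/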
/**
* @brief Associates core with RMID at register level
*
* This function doesn't acquire API lock
* and can be used internally when lock is already taken.
*
* @param lcore logical core id
* @param rmid resource monitoring ID
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
*/
static int
mon_assoc_set(const unsigned lcore, const pqos_rmid_t rmid)
{
int ret = 0;
uint32_t reg = 0;
uint64_t val = 0;
reg = PQOS_MSR_ASSOC;
ret = msr_read(lcore, reg, &val);
if (ret != MACHINE_RETVAL_OK)
return PQOS_RETVAL_ERROR;
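/*
* Preserve the class of service (QE COS) field of IA32_PQR_ASSOC
* and replace only the RMID field in the low bits.
*/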
val &= PQOS_MSR_ASSOC_QECOS_MASK;
val |= (uint64_t)(rmid & PQOS_MSR_ASSOC_RMID_MASK);
ret = msr_write(lcore, reg, val);
if (ret != MACHINE_RETVAL_OK)
return PQOS_RETVAL_ERROR;
return PQOS_RETVAL_OK;
}
/**
* @brief Reads \a lcore to RMID association
*
* @param lcore logical core id
* @param rmid place to store RMID \a lcore is assigned to
*
* @return Operation status
* @retval PQOS_RETVAL_OK success
* @retval PQOS_RETVAL_ERROR on error
*/
static int
mon_assoc_get(const unsigned lcore, pqos_rmid_t *rmid)
{
int ret = 0;
uint32_t reg = PQOS_MSR_ASSOC;
uint64_t val = 0;
ASSERT(rmid != NULL);
ret = msr_read(lcore, reg, &val);
if (ret != MACHINE_RETVAL_OK)
return PQOS_RETVAL_ERROR;
val &= PQOS_MSR_ASSOC_RMID_MASK;
*rmid = (pqos_rmid_t)val;
return PQOS_RETVAL_OK;
}
int
hw_mon_assoc_get(const unsigned lcore, pqos_rmid_t *rmid)
{
int ret = PQOS_RETVAL_OK;
const struct pqos_cpuinfo *cpu;
if (rmid == NULL)
return PQOS_RETVAL_PARAM;
_pqos_cap_get(NULL, &cpu);
ASSERT(cpu != NULL);
ret = pqos_cpu_check_core(cpu, lcore);
if (ret != PQOS_RETVAL_OK)
return PQOS_RETVAL_PARAM;
ret = mon_assoc_get(lcore, rmid);
return ret;
}
int
hw_mon_reset(void)
{
int ret = PQOS_RETVAL_OK;
unsigned i;
const struct pqos_cpuinfo *cpu;
_pqos_cap_get(NULL, &cpu);
for (i = 0; i < cpu->num_cores; i++) {
int retval = mon_assoc_set(cpu->cores[i].lcore, RMID0);
if (retval != PQOS_RETVAL_OK)
ret = retval;
}
return ret;
}
/**
* @brief Reads monitoring event data from given core
*
* This function doesn't acquire API lock.
*
* @param lcore logical core id
* @param rmid RMID to be read
* @param event monitoring event
* @param value place to store read value
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
*/
static int
mon_read(const unsigned lcore,
const pqos_rmid_t rmid,
const unsigned event,
uint64_t *value)
{
int retries = 0, retval = PQOS_RETVAL_ERROR;
uint64_t val = 0;
uint64_t val_evtsel = 0;
int flag_wrt = 1;
/**
* Set event selection register (RMID + event id)
*/
val_evtsel = ((uint64_t)rmid) & PQOS_MSR_MON_EVTSEL_RMID_MASK;
val_evtsel <<= PQOS_MSR_MON_EVTSEL_RMID_SHIFT;
val_evtsel |= ((uint64_t)event) & PQOS_MSR_MON_EVTSEL_EVTID_MASK;
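/*
* IA32_QM_EVTSEL layout: event ID in bits [7:0], RMID in
* bits [41:32]; the mask and shift macros above reflect this.
*/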
for (retries = 0; retries < 4; retries++) {
if (flag_wrt) {
if (msr_write(lcore, PQOS_MSR_MON_EVTSEL, val_evtsel) !=
MACHINE_RETVAL_OK)
break;
}
if (msr_read(lcore, PQOS_MSR_MON_QMC, &val) !=
MACHINE_RETVAL_OK)
break;
if ((val & PQOS_MSR_MON_QMC_ERROR) != 0ULL) {
/* Read back IA32_QM_EVTSEL register
* to check for content change.
*/
if (msr_read(lcore, PQOS_MSR_MON_EVTSEL, &val) !=
MACHINE_RETVAL_OK)
break;
if (val != val_evtsel) {
flag_wrt = 1;
continue;
}
/**
* EVTSEL content is intact yet the error bit is
* still set - the RMID/event combination is
* reported as invalid, so give up instead of
* returning the EVTSEL contents as data.
*/
break;
}
if ((val & PQOS_MSR_MON_QMC_UNAVAILABLE) != 0ULL) {
/**
* Waiting for monitoring data
*/
flag_wrt = 0;
continue;
}
retval = PQOS_RETVAL_OK;
break;
}
/**
* Store event value
*/
if (retval == PQOS_RETVAL_OK)
*value = (val & PQOS_MSR_MON_QMC_DATA_MASK);
else
LOG_WARN("Error reading event %u on core %u (RMID%u)!\n", event,
lcore, (unsigned)rmid);
return retval;
}
/**
* @brief Gives the difference between two counter reads, accounting for
* possible overflow given the counter length
*
* @param event event counter length to retrieve
* @param old_value previous value
* @param new_value current value
*
* @return difference between the two values
*/
static uint64_t
get_delta(const enum pqos_mon_event event,
const uint64_t old_value,
const uint64_t new_value)
{
const struct pqos_cap *cap;
const struct pqos_monitor *pmon;
int ret;
uint64_t max_value = 1LLU << 24;
_pqos_cap_get(&cap, NULL);
ret = pqos_cap_get_event(cap, event, &pmon);
if (ret == PQOS_RETVAL_OK)
max_value = 1LLU << pmon->counter_length;
if (old_value > new_value)
return (max_value - old_value) + new_value;
else
return new_value - old_value;
}
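/*
* Example: for a 24-bit counter (max_value = 2^24 = 16777216),
* old_value = 16777200 and new_value = 100 imply a single wrap, so
* the returned delta is (16777216 - 16777200) + 100 = 116.
*/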
/**
* @brief Sets up IA32 performance counters for IPC and LLC miss ratio events
*
* @param group monitoring data
* @param event mask of selected monitoring events
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
*/
static int
ia32_perf_counter_start(const struct pqos_mon_data *group,
const enum pqos_mon_event event)
{
uint64_t global_ctrl_mask = 0;
unsigned i;
const unsigned *cores = group->cores;
const unsigned num_cores = group->num_cores;
ASSERT(cores != NULL && num_cores > 0);
if (!(event & (PQOS_PERF_EVENT_LLC_MISS | PQOS_PERF_EVENT_IPC)))
return PQOS_RETVAL_OK;
if (event & PQOS_PERF_EVENT_IPC)
global_ctrl_mask |= (0x3ULL << 32); /**< fixed counters 0&1 */
if (event & PQOS_PERF_EVENT_LLC_MISS)
global_ctrl_mask |= 0x1ULL; /**< programmable counter 0 */
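/*
* IA32_PERF_GLOBAL_CTRL layout: bit N enables programmable counter
* PMCN and bit 32+M enables fixed counter M, so the mask built above
* enables fixed counters 0 and 1 for IPC and PMC0 for LLC misses.
*/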
/**
* Fixed counters are used for IPC calculations.
* Programmable counters are used for LLC miss calculations.
* Let's check if they are in use.
*/
for (i = 0; i < num_cores; i++) {
uint64_t global_inuse = 0;
int ret;
ret = msr_read(cores[i], IA32_MSR_PERF_GLOBAL_CTRL,
&global_inuse);
if (ret != MACHINE_RETVAL_OK)
return PQOS_RETVAL_ERROR;
if (global_inuse & global_ctrl_mask)
LOG_WARN("Hijacking performance counters on core %u\n",
cores[i]);
}
/**
* - Disable counters in global control and
* reset counter values to 0.
* - Program counters for desired events
* - Enable counters in global control
*/
for (i = 0; i < num_cores; i++) {
const uint64_t fixed_ctrl = 0x33ULL; /**< track usr + os */
int ret;
ret = msr_write(cores[i], IA32_MSR_PERF_GLOBAL_CTRL, 0);
if (ret != MACHINE_RETVAL_OK)
break;
if (event & PQOS_PERF_EVENT_IPC) {
ret = msr_write(cores[i], IA32_MSR_INST_RETIRED_ANY, 0);
if (ret != MACHINE_RETVAL_OK)
break;
ret = msr_write(cores[i], IA32_MSR_CPU_UNHALTED_THREAD,
0);
if (ret != MACHINE_RETVAL_OK)
break;
ret = msr_write(cores[i], IA32_MSR_FIXED_CTR_CTRL,
fixed_ctrl);
if (ret != MACHINE_RETVAL_OK)
break;
}
if (event & PQOS_PERF_EVENT_LLC_MISS) {
const uint64_t evtsel0_miss =
IA32_EVENT_LLC_MISS_MASK |
(IA32_EVENT_LLC_MISS_UMASK << 8) | (1ULL << 16) |
(1ULL << 17) | (1ULL << 22);
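/* Besides the event select and umask, the value above sets
* bit 16 (USR) and bit 17 (OS) to count in both privilege
* levels and bit 22 (EN) to enable the counter.
*/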
ret = msr_write(cores[i], IA32_MSR_PMC0, 0);
if (ret != MACHINE_RETVAL_OK)
break;
ret = msr_write(cores[i], IA32_MSR_PERFEVTSEL0,
evtsel0_miss);
if (ret != MACHINE_RETVAL_OK)
break;
}
ret = msr_write(cores[i], IA32_MSR_PERF_GLOBAL_CTRL,
global_ctrl_mask);
if (ret != MACHINE_RETVAL_OK)
break;
}
if (i < num_cores)
return PQOS_RETVAL_ERROR;
return PQOS_RETVAL_OK;
}
/**
* @brief Disables IA32 performance counters
*
* @param num_cores number of cores in \a cores table
* @param cores table with core id's
* @param event mask of selected monitoring events
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
*/
static int
ia32_perf_counter_stop(const unsigned num_cores,
const unsigned *cores,
const enum pqos_mon_event event)
{
int retval = PQOS_RETVAL_OK;
unsigned i;
ASSERT(cores != NULL && num_cores > 0);
if (!(event & (PQOS_PERF_EVENT_LLC_MISS | PQOS_PERF_EVENT_IPC)))
return retval;
for (i = 0; i < num_cores; i++) {
int ret = msr_write(cores[i], IA32_MSR_PERF_GLOBAL_CTRL, 0);
if (ret != MACHINE_RETVAL_OK)
retval = PQOS_RETVAL_ERROR;
}
return retval;
}
/**
* @brief Start perf monitoring counters
*
* @param group monitoring structure
* @param event PQoS event type
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
*/
static int
hw_mon_start_perf(struct pqos_mon_data *group, enum pqos_mon_event event)
{
int ret = PQOS_RETVAL_OK;
unsigned i;
enum pqos_mon_event hw_event = (enum pqos_mon_event)0;
group->intl->perf.ctx =
malloc(sizeof(group->intl->perf.ctx[0]) * group->num_cores);
if (group->intl->perf.ctx == NULL) {
LOG_ERROR("Memory allocation failed\n");
return PQOS_RETVAL_ERROR;
}
for (i = 0; i < DIM(perf_event); i++) {
enum pqos_mon_event evt = perf_event[i];
if (event & evt) {
#ifdef __linux__
if (perf_mon_is_event_supported(evt)) {
ret = perf_mon_start(group, evt);
if (ret != PQOS_RETVAL_OK)
return ret;
group->intl->perf.event |= evt;
continue;
}
#endif
hw_event |= evt;
}
}
if (!group->intl->perf.event) {
free(group->intl->perf.ctx);
group->intl->perf.ctx = NULL;
}
/* Start IA32 performance counters */
if (hw_event) {
ret = ia32_perf_counter_start(group, hw_event);
if (ret == PQOS_RETVAL_OK)
group->intl->hw.event |= hw_event;
}
return ret;
}
/**
* @brief Stop perf monitoring counters
*
* @param group monitoring structure
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
*/
static int
hw_mon_stop_perf(struct pqos_mon_data *group)
{
int ret = PQOS_RETVAL_OK;
unsigned i;
enum pqos_mon_event hw_event = (enum pqos_mon_event)0;
for (i = 0; i < DIM(perf_event); i++) {
enum pqos_mon_event evt = perf_event[i];
#ifdef __linux__
/* Stop perf event */
if (group->intl->perf.event & evt) {
ret = perf_mon_stop(group, evt);
if (ret != PQOS_RETVAL_OK)
return ret;
continue;
}
#endif
if (group->intl->hw.event & evt)
hw_event |= evt;
}
/* Stop IA32 performance counters */
if (hw_event) {
ret = ia32_perf_counter_stop(group->num_cores, group->cores,
group->event);
if (ret != PQOS_RETVAL_OK)
return PQOS_RETVAL_RESOURCE;
}
if (group->intl->perf.ctx != NULL) {
free(group->intl->perf.ctx);
group->intl->perf.ctx = NULL;
}
return ret;
}
/**
* @brief Start HW monitoring counters
*
* @param group monitoring structure
* @param event PQoS event type
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
*/
static int
hw_mon_start_counter(struct pqos_mon_data *group, enum pqos_mon_event event)
{
const unsigned num_cores = group->num_cores;
const struct pqos_cpuinfo *cpu;
unsigned core2cluster[num_cores];
struct pqos_mon_poll_ctx ctxs[num_cores];
unsigned num_ctxs = 0;
unsigned i;
int ret = PQOS_RETVAL_OK;
enum pqos_mon_event ctx_event = (enum pqos_mon_event)(
event & (PQOS_MON_EVENT_L3_OCCUP | PQOS_MON_EVENT_LMEM_BW |
PQOS_MON_EVENT_TMEM_BW | PQOS_MON_EVENT_RMEM_BW));
_pqos_cap_get(NULL, &cpu);
memset(ctxs, 0, sizeof(ctxs));
/*
* Initialize poll context table:
* - get core cluster
* - allocate RMID
*/
for (i = 0; i < group->num_cores; i++) {
const unsigned lcore = group->cores[i];
unsigned j;
unsigned cluster = 0;
ret = pqos_cpu_get_clusterid(cpu, lcore, &cluster);
if (ret != PQOS_RETVAL_OK)
return PQOS_RETVAL_PARAM;
core2cluster[i] = cluster;
for (j = 0; j < num_ctxs; j++)
if (ctxs[j].lcore == lcore ||
ctxs[j].cluster == cluster)
break;
if (j >= num_ctxs) {
/**
* New cluster is found
* - save cluster id in the table
* - allocate RMID for the cluster
*/
ctxs[num_ctxs].lcore = lcore;
ctxs[num_ctxs].cluster = cluster;
#ifdef PQOS_RMID_CUSTOM
ret = rmid_alloc_custom(&ctxs[num_ctxs], ctx_event,
&rmid_cfg);
#else
ret = rmid_alloc(&ctxs[num_ctxs], ctx_event);
#endif
if (ret != PQOS_RETVAL_OK)
return ret;
num_ctxs++;
}
}
group->intl->hw.ctx = (struct pqos_mon_poll_ctx *)malloc(
sizeof(group->intl->hw.ctx[0]) * num_ctxs);
if (group->intl->hw.ctx == NULL)
return PQOS_RETVAL_RESOURCE;
/**
* Associate requested cores with
* the allocated RMID
*/
group->num_cores = num_cores;
for (i = 0; i < num_cores; i++) {
unsigned cluster, j;
pqos_rmid_t rmid;
cluster = core2cluster[i];
for (j = 0; j < num_ctxs; j++)
if (ctxs[j].cluster == cluster)
break;
if (j >= num_ctxs) {
ret = PQOS_RETVAL_ERROR;
goto hw_mon_start_counter_exit;
}
rmid = ctxs[j].rmid;
ret = mon_assoc_set(group->cores[i], rmid);
if (ret != PQOS_RETVAL_OK)
goto hw_mon_start_counter_exit;
}
group->intl->hw.num_ctx = num_ctxs;
for (i = 0; i < num_ctxs; i++)
group->intl->hw.ctx[i] = ctxs[i];
group->intl->hw.event |= ctx_event;
hw_mon_start_counter_exit:
if (ret != PQOS_RETVAL_OK) {
for (i = 0; i < num_cores; i++)
(void)mon_assoc_set(group->cores[i], RMID0);
if (group->intl->hw.ctx != NULL) {
free(group->intl->hw.ctx);
group->intl->hw.ctx = NULL;
}
}
return ret;
}
int
hw_mon_start(const unsigned num_cores,
const unsigned *cores,
const enum pqos_mon_event event,
void *context,
struct pqos_mon_data *group)
{
unsigned i;
int ret = PQOS_RETVAL_OK;
int retval = PQOS_RETVAL_OK;
const struct pqos_cap *cap;
const struct pqos_cpuinfo *cpu;
enum pqos_mon_event req_events;
enum pqos_mon_event started_evts = (enum pqos_mon_event)0;
ASSERT(group != NULL);
ASSERT(cores != NULL);
ASSERT(num_cores > 0);
ASSERT(event > 0);
_pqos_cap_get(&cap, &cpu);
req_events = event;
if (req_events & PQOS_MON_EVENT_RMEM_BW)
req_events |= (enum pqos_mon_event)(PQOS_MON_EVENT_LMEM_BW |
PQOS_MON_EVENT_TMEM_BW);
if (req_events & PQOS_PERF_EVENT_IPC)
req_events |= (enum pqos_mon_event)(
PQOS_PERF_EVENT_CYCLES | PQOS_PERF_EVENT_INSTRUCTIONS);
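/*
* RMEM BW and IPC are derived events: remote bandwidth is computed
* later as total minus local bandwidth, and IPC as retired
* instructions divided by unhalted cycles, hence the expansion of
* the requested event set above.
*/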
/**
* Validate if event is listed in capabilities
*/
for (i = 0; i < (sizeof(event) * 8); i++) {
const enum pqos_mon_event evt_mask =
(enum pqos_mon_event)(1U << i);
const struct pqos_monitor *ptr = NULL;
if (!(evt_mask & event))
continue;
retval = pqos_cap_get_event(cap, evt_mask, &ptr);
if (retval != PQOS_RETVAL_OK || ptr == NULL)
return PQOS_RETVAL_PARAM;
}
/**
* Check if all requested cores are valid
* and not used by other monitoring processes.
*
* Check if any of the requested cores is already subject to monitoring
* within this process.
*/
for (i = 0; i < num_cores; i++) {
const unsigned lcore = cores[i];
pqos_rmid_t rmid = RMID0;
ret = pqos_cpu_check_core(cpu, lcore);
if (ret != PQOS_RETVAL_OK)
return PQOS_RETVAL_PARAM;
ret = mon_assoc_get(lcore, &rmid);
if (ret != PQOS_RETVAL_OK)
return PQOS_RETVAL_PARAM;
if (rmid != RMID0) {
/* If not RMID0 then it is already monitored */
LOG_INFO("Core %u is already monitored with "
"RMID%u.\n",
lcore, rmid);
return PQOS_RETVAL_RESOURCE;
}
}
/**
* Fill in the monitoring group structure
*/
group->event = event;
group->context = context;
group->num_cores = num_cores;
group->cores = (unsigned *)malloc(sizeof(group->cores[0]) * num_cores);
if (group->cores == NULL)
return PQOS_RETVAL_RESOURCE;
for (i = 0; i < group->num_cores; i++)
group->cores[i] = cores[i];
/* start perf events */
retval = hw_mon_start_perf(group, req_events);
if (retval != PQOS_RETVAL_OK)
goto pqos_mon_start_error;
/* start MBM/CMT events */
retval = hw_mon_start_counter(group, req_events);
if (retval != PQOS_RETVAL_OK)
goto pqos_mon_start_error;
started_evts |= group->intl->perf.event;
started_evts |= group->intl->hw.event;
/**
* All events required by RMEM BW have been started
*/
if ((started_evts & PQOS_MON_EVENT_LMEM_BW) &&
(started_evts & PQOS_MON_EVENT_TMEM_BW)) {
group->values.mbm_remote = 0;
started_evts |= (enum pqos_mon_event)PQOS_MON_EVENT_RMEM_BW;
}
/**
* All events required by IPC have been started
*/
if ((started_evts & PQOS_PERF_EVENT_CYCLES) &&
(started_evts & PQOS_PERF_EVENT_INSTRUCTIONS)) {
group->values.ipc = 0;
started_evts |= (enum pqos_mon_event)PQOS_PERF_EVENT_IPC;
}
/* Check if all selected events were started */
if ((group->event & started_evts) != group->event) {
LOG_ERROR("Failed to start all selected "
"HW monitoring events\n");
retval = PQOS_RETVAL_ERROR;
}
pqos_mon_start_error:
if (retval != PQOS_RETVAL_OK) {
hw_mon_stop_perf(group);
if (group->cores != NULL)
free(group->cores);
}
return retval;
}
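/*
* A minimal usage sketch (illustrative only - callers normally reach
* these internal hw_* entry points through the public pqos_mon_start(),
* pqos_mon_poll() and pqos_mon_stop() API, which also allocates and
* zeroes the group and its internal data; alloc_mon_group() below is a
* hypothetical helper standing in for that setup):
*
* @code
* unsigned cores[] = {0, 1};
* struct pqos_mon_data *grp = alloc_mon_group(); // hypothetical helper
*
* if (hw_mon_start(2, cores, PQOS_MON_EVENT_L3_OCCUP, NULL, grp) ==
*     PQOS_RETVAL_OK) {
*         hw_mon_poll(grp, PQOS_MON_EVENT_L3_OCCUP);
*         // grp->values.llc now holds LLC occupancy in bytes
*         hw_mon_stop(grp);
* }
* @endcode
*/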
int
hw_mon_stop(struct pqos_mon_data *group)
{
int ret = PQOS_RETVAL_OK;
int retval = PQOS_RETVAL_OK;
unsigned i = 0;
const struct pqos_cpuinfo *cpu;
ASSERT(group != NULL);
if (group->num_cores == 0 || group->cores == NULL ||
group->intl->hw.num_ctx == 0 || group->intl->hw.ctx == NULL) {
return PQOS_RETVAL_PARAM;
}
_pqos_cap_get(NULL, &cpu);
for (i = 0; i < group->intl->hw.num_ctx; i++) {
/**
* Validate that the core in each poll context is valid and
* still associated with the RMID assigned at start time
*/
const unsigned lcore = group->intl->hw.ctx[i].lcore;
pqos_rmid_t rmid = RMID0;
ret = pqos_cpu_check_core(cpu, lcore);
if (ret != PQOS_RETVAL_OK)
return PQOS_RETVAL_PARAM;
ret = mon_assoc_get(lcore, &rmid);
if (ret != PQOS_RETVAL_OK)
return PQOS_RETVAL_PARAM;
if (rmid != group->intl->hw.ctx[i].rmid)
LOG_WARN("Core %u RMID association changed from %u "
"to %u! The core has been hijacked!\n",
lcore, group->intl->hw.ctx[i].rmid, rmid);
}
for (i = 0; i < group->num_cores; i++) {
/**
* Associate cores from the group back with RMID0
*/
ret = mon_assoc_set(group->cores[i], RMID0);
if (ret != PQOS_RETVAL_OK)
retval = PQOS_RETVAL_RESOURCE;
}
/* stop perf counters */
ret = hw_mon_stop_perf(group);
if (ret != PQOS_RETVAL_OK)
retval = ret;
/**
* Free poll contexts, core list and clear the group structure
*/
free(group->cores);
free(group->intl->hw.ctx);
memset(group, 0, sizeof(*group));
return retval;
}
/**
* @brief Read HW counter
*
* Reads the \a event counter across all poll contexts and stores the
* accumulated value
*
* @param group monitoring structure
* @param event PQoS event
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
* @retval PQOS_RETVAL_ERROR if error occurs
*/
static int
hw_mon_read_counter(struct pqos_mon_data *group,
const enum pqos_mon_event event)
{
struct pqos_event_values *pv = &group->values;
uint64_t value = 0;
uint64_t max_value = 1LLU << 24;
const struct pqos_cap *cap;
const struct pqos_monitor *pmon;
unsigned i;
int ret;
ASSERT(event == PQOS_MON_EVENT_L3_OCCUP ||
event == PQOS_MON_EVENT_LMEM_BW ||
event == PQOS_MON_EVENT_TMEM_BW);
_pqos_cap_get(&cap, NULL);
ret = pqos_cap_get_event(cap, event, &pmon);
if (ret == PQOS_RETVAL_OK)
max_value = 1LLU << pmon->counter_length;
for (i = 0; i < group->intl->hw.num_ctx; i++) {
uint64_t tmp = 0;
const unsigned lcore = group->intl->hw.ctx[i].lcore;
const pqos_rmid_t rmid = group->intl->hw.ctx[i].rmid;
int retval;
retval = mon_read(lcore, rmid, get_event_id(event), &tmp);
if (retval != PQOS_RETVAL_OK)
return PQOS_RETVAL_ERROR;
value += tmp;
if (value >= max_value)
value -= max_value;
}
switch (event) {
case PQOS_MON_EVENT_L3_OCCUP:
pv->llc = scale_event(PQOS_MON_EVENT_L3_OCCUP, value);
break;
case PQOS_MON_EVENT_LMEM_BW:
if (group->intl->valid_mbm_read) {
pv->mbm_local_delta =
get_delta(event, pv->mbm_local, value);
pv->mbm_local_delta =
scale_event(event, pv->mbm_local_delta);
} else
/* Report zero memory bandwidth with first read */
pv->mbm_local_delta = 0;
pv->mbm_local = value;
break;
case PQOS_MON_EVENT_TMEM_BW:
if (group->intl->valid_mbm_read) {
pv->mbm_total_delta =
get_delta(event, pv->mbm_total, value);
pv->mbm_total_delta =
scale_event(event, pv->mbm_total_delta);
} else
/* Report zero memory bandwidth with first read */
pv->mbm_total_delta = 0;
pv->mbm_total = value;
break;
default:
return PQOS_RETVAL_PARAM;
}
return PQOS_RETVAL_OK;
}
/**
* @brief Read HW perf counter
*
* @param group monitoring structure
* @param event PQoS event
*
* @return Operation status
* @retval PQOS_RETVAL_OK on success
* @retval PQOS_RETVAL_ERROR if error occurs
*/
static int
hw_mon_read_perf(struct pqos_mon_data *group, const enum pqos_mon_event event)
{
struct pqos_event_values *pv = &group->values;
uint64_t val = 0;
unsigned n;
uint64_t reg;
uint64_t *value;
uint64_t *delta;
switch (event) {
case (enum pqos_mon_event)PQOS_PERF_EVENT_INSTRUCTIONS:
reg = IA32_MSR_INST_RETIRED_ANY;
value = &pv->ipc_retired;
delta = &pv->ipc_retired_delta;
break;
case (enum pqos_mon_event)PQOS_PERF_EVENT_CYCLES:
reg = IA32_MSR_CPU_UNHALTED_THREAD;
value = &pv->ipc_unhalted;
delta = &pv->ipc_unhalted_delta;
break;
case PQOS_PERF_EVENT_LLC_MISS:
reg = IA32_MSR_PMC0;
value = &pv->llc_misses;
delta = &pv->llc_misses_delta;
break;
default:
return PQOS_RETVAL_PARAM;
}
/**
* If multiple cores are monitored in one group,
* then the values of all cores are accumulated for the group.
*/
for (n = 0; n < group->num_cores; n++) {
uint64_t tmp = 0;
int ret = msr_read(group->cores[n], reg, &tmp);
if (ret != MACHINE_RETVAL_OK)
return PQOS_RETVAL_ERROR;
val += tmp;
}
*delta = val - *value;
*value = val;
return PQOS_RETVAL_OK;
}
int
hw_mon_poll(struct pqos_mon_data *group, const enum pqos_mon_event event)
{
int ret = PQOS_RETVAL_OK;
switch (event) {
case PQOS_MON_EVENT_L3_OCCUP:
case PQOS_MON_EVENT_LMEM_BW:
case PQOS_MON_EVENT_TMEM_BW:
ret = hw_mon_read_counter(group, event);
if (ret != PQOS_RETVAL_OK)
goto pqos_core_poll__exit;
break;
case (enum pqos_mon_event)PQOS_PERF_EVENT_CYCLES:
case (enum pqos_mon_event)PQOS_PERF_EVENT_INSTRUCTIONS:
case PQOS_PERF_EVENT_LLC_MISS:
ret = hw_mon_read_perf(group, event);
if (ret != PQOS_RETVAL_OK)
goto pqos_core_poll__exit;
break;
default:
ret = PQOS_RETVAL_PARAM;
}
pqos_core_poll__exit:
return ret;
}
/*
* =======================================
* =======================================
*
* Small utils
*
* =======================================
* =======================================
*/
/**
* @brief Maps PQoS API event onto an MSR event id
*
* @param [in] event PQoS API event id
*
* @return MSR event id
* @retval 0 if not successful
*/
static unsigned
get_event_id(const enum pqos_mon_event event)
{
switch (event) {
case PQOS_MON_EVENT_L3_OCCUP:
return 1;
case PQOS_MON_EVENT_LMEM_BW:
return 3;
case PQOS_MON_EVENT_TMEM_BW:
return 2;
case PQOS_MON_EVENT_RMEM_BW:
default:
ASSERT(0); /* this means a bug */
break;
}
return 0;
}