/*
* Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved.
* Copyright (c) 2010-2015 Mellanox Technologies LTD. All rights reserved.
* Copyright (c) 2009 HNR Consulting. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
/*
* Abstract:
* Implementation of OpenSM QoS infrastructure primitives
*/
#if HAVE_CONFIG_H
# include <config.h>
#endif /* HAVE_CONFIG_H */
#include <stdlib.h>
#include <string.h>
#include <iba/ib_types.h>
#include <complib/cl_qmap.h>
#include <complib/cl_debug.h>
#include <opensm/osm_file_ids.h>
#define FILE_ID OSM_FILE_QOS_C
#include <opensm/osm_opensm.h>
#include <opensm/osm_subnet.h>
#include <opensm/osm_qos_policy.h>
struct qos_config {
uint8_t max_vls;
uint8_t vl_high_limit;
ib_vl_arb_table_t vlarb_high[2];
ib_vl_arb_table_t vlarb_low[2];
ib_slvl_table_t sl2vl;
};
typedef struct qos_mad_item {
cl_list_item_t list_item;
osm_madw_t *p_madw;
} qos_mad_item_t;
typedef struct qos_mad_list {
cl_list_item_t list_item;
cl_qlist_t port_mad_list;
} qos_mad_list_t;
static void qos_build_config(struct qos_config *cfg,
osm_qos_options_t * opt,
osm_qos_options_t * dflt);
/*
* QoS primitives
*/
static qos_mad_item_t *osm_qos_mad_create(IN osm_sm_t * sm,
IN osm_physp_t * p,
IN uint32_t data_size,
IN uint8_t * p_data,
IN ib_net16_t attr_id,
IN uint32_t attr_mod)
{
qos_mad_item_t *p_mad;
osm_madw_context_t context;
osm_madw_t *p_madw;
osm_node_t *p_node;
osm_physp_t *physp0;
ib_net64_t m_key;
uint32_t timeout = 0;
p_node = osm_physp_get_node_ptr(p);
if (osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH &&
osm_physp_get_port_num(p) != 0) {
physp0 = osm_node_get_physp_ptr(p_node, 0);
m_key = ib_port_info_get_m_key(&physp0->port_info);
} else
m_key = ib_port_info_get_m_key(&p->port_info);
switch (attr_id){
case IB_MAD_ATTR_SLVL_TABLE:
context.slvl_context.node_guid = osm_node_get_node_guid(p_node);
context.slvl_context.port_guid = osm_physp_get_port_guid(p);
context.slvl_context.set_method = TRUE;
if ((attr_mod & 0x30000) != 0) /* optimized ? */
timeout = sm->p_subn->opt.long_transaction_timeout;
break;
case IB_MAD_ATTR_VL_ARBITRATION:
context.vla_context.node_guid = osm_node_get_node_guid(p_node);
context.vla_context.port_guid = osm_physp_get_port_guid(p);
context.vla_context.set_method = TRUE;
break;
default:
return NULL;
}
p_mad = (qos_mad_item_t *) malloc(sizeof(*p_mad));
if (!p_mad)
return NULL;
memset(p_mad, 0, sizeof(*p_mad));
p_madw = osm_prepare_req_set(sm, osm_physp_get_dr_path_ptr(p),
p_data, data_size,
attr_id, cl_hton32(attr_mod),
FALSE, m_key,
timeout, CL_DISP_MSGID_NONE, &context);
if (p_madw == NULL) {
free(p_mad);
return NULL;
}
p_mad->p_madw = p_madw;
return p_mad;
}
static void osm_qos_mad_delete(qos_mad_item_t ** p_item)
{
free(*p_item);
*p_item = NULL;
}
static ib_api_status_t vlarb_update_table_block(osm_sm_t * sm,
osm_physp_t * p,
uint8_t port_num,
unsigned force_update,
const ib_vl_arb_table_t *
table_block,
unsigned block_length,
unsigned block_num,
cl_qlist_t *mad_list)
{
struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used;
ib_vl_arb_table_t block;
uint32_t attr_mod;
unsigned vl_mask, i;
qos_mad_item_t *p_mad;
vl_mask = (1 << (ib_port_info_get_op_vls(&p->port_info) - 1)) - 1;
memset(&block, 0, sizeof(block));
memcpy(&block, table_block, block_length * sizeof(block.vl_entry[0]));
if (re && re->update_vlarb)
re->update_vlarb(re->context, p, port_num, &block,
block_length, block_num);
for (i = 0; i < block_length; i++)
block.vl_entry[i].vl &= vl_mask;
if (!force_update &&
!memcmp(&p->vl_arb[block_num], &block,
block_length * sizeof(block.vl_entry[0])))
return IB_SUCCESS;
attr_mod = ((block_num + 1) << 16) | port_num;
p_mad = osm_qos_mad_create(sm,p,sizeof(block),(uint8_t *) & block,
IB_MAD_ATTR_VL_ARBITRATION, attr_mod);
if (!p_mad)
return IB_INSUFFICIENT_MEMORY;
/*
* Zero the stored VL Arbitration block, so in case the MAD will
* end up with error, we will resend it in the next sweep.
*/
memset(&p->vl_arb[block_num], 0,
block_length * sizeof(block.vl_entry[0]));
cl_qlist_insert_tail(mad_list, &p_mad->list_item);
return IB_SUCCESS;
}
static ib_api_status_t vlarb_update(osm_sm_t * sm, osm_physp_t * p,
uint8_t port_num, unsigned force_update,
const struct qos_config *qcfg,
cl_qlist_t *mad_list)
{
ib_api_status_t status = IB_SUCCESS;
ib_port_info_t *p_pi = &p->port_info;
unsigned len;
if (ib_port_info_get_vl_cap(p_pi) == 1) { /* no VLArb if 1 data VL */
p->vl_high_limit = 0;
return status;
}
if (p_pi->vl_arb_low_cap > 0) {
len = p_pi->vl_arb_low_cap < IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK ?
p_pi->vl_arb_low_cap : IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK;
if ((status = vlarb_update_table_block(sm, p, port_num,
force_update,
&qcfg->vlarb_low[0],
len, 0,
mad_list)) != IB_SUCCESS)
return status;
}
if (p_pi->vl_arb_low_cap > IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK) {
len = p_pi->vl_arb_low_cap % IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK;
if ((status = vlarb_update_table_block(sm, p, port_num,
force_update,
&qcfg->vlarb_low[1],
len, 1,
mad_list)) != IB_SUCCESS)
return status;
}
if (p_pi->vl_arb_high_cap > 0) {
len = p_pi->vl_arb_high_cap < IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK ?
p_pi->vl_arb_high_cap : IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK;
if ((status = vlarb_update_table_block(sm, p, port_num,
force_update,
&qcfg->vlarb_high[0],
len, 2,
mad_list)) != IB_SUCCESS)
return status;
}
if (p_pi->vl_arb_high_cap > IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK) {
len = p_pi->vl_arb_high_cap % IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK;
if ((status = vlarb_update_table_block(sm, p, port_num,
force_update,
&qcfg->vlarb_high[1],
len, 3,
mad_list)) != IB_SUCCESS)
return status;
}
return status;
}
static ib_api_status_t sl2vl_update_table(osm_sm_t * sm, osm_physp_t * p,
uint8_t in_port, uint32_t attr_mod,
unsigned force_update,
const ib_slvl_table_t * sl2vl_table,
cl_qlist_t *mad_list)
{
ib_slvl_table_t tbl, *p_tbl;
unsigned vl_mask;
uint8_t vl1, vl2;
int i;
qos_mad_item_t *p_mad;
vl_mask = (1 << (ib_port_info_get_op_vls(&p->port_info) - 1)) - 1;
for (i = 0; i < IB_MAX_NUM_VLS / 2; i++) {
vl1 = sl2vl_table->raw_vl_by_sl[i] >> 4;
vl2 = sl2vl_table->raw_vl_by_sl[i] & 0xf;
if (vl1 != 15)
vl1 &= vl_mask;
if (vl2 != 15)
vl2 &= vl_mask;
tbl.raw_vl_by_sl[i] = (vl1 << 4) | vl2;
}
p_tbl = osm_physp_get_slvl_tbl(p, in_port);
if (!force_update && !memcmp(p_tbl, &tbl, sizeof(tbl)))
return IB_SUCCESS;
p_mad = osm_qos_mad_create(sm, p, sizeof(tbl), (uint8_t *) & tbl,
IB_MAD_ATTR_SLVL_TABLE, attr_mod);
if (!p_mad)
return IB_INSUFFICIENT_MEMORY;
/*
* Zero the stored SL2VL block, so in case the MAD will
* end up with error, we will resend it in the next sweep.
*/
memset(p_tbl, 0, sizeof(tbl));
cl_qlist_insert_tail(mad_list, &p_mad->list_item);
return IB_SUCCESS;
}
static int qos_extports_setup(osm_sm_t * sm, osm_node_t *node,
const struct qos_config *qcfg,
cl_qlist_t *port_mad_list)
{
osm_physp_t *p0, *p;
unsigned force_update;
unsigned num_ports = osm_node_get_num_physp(node);
struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used;
int ret = 0;
unsigned in, out;
uint8_t op_vl, common_op_vl = 0, max_num = 0;
uint8_t op_vl_arr[15];
/*
* Do nothing unless the most recent routing attempt was successful.
*/
if (!re)
return ret;
for (out = 1; out < num_ports; out++) {
p = osm_node_get_physp_ptr(node, out);
if (!p)
continue;
if (ib_port_info_get_port_state(&p->port_info) == IB_LINK_DOWN)
continue;
force_update = p->need_update || sm->p_subn->need_update;
p->vl_high_limit = qcfg->vl_high_limit;
if (vlarb_update(sm, p, p->port_num, force_update, qcfg,
port_mad_list))
ret = -1;
}
p0 = osm_node_get_physp_ptr(node, 0);
if (!(p0->port_info.capability_mask & IB_PORT_CAP_HAS_SL_MAP))
return ret;
if (ib_switch_info_get_opt_sl2vlmapping(&node->sw->switch_info) &&
sm->p_subn->opt.use_optimized_slvl && !re->update_sl2vl) {
/* we should find the op_vl that is used by majority of ports */
memset(&op_vl_arr[0], 0, sizeof(op_vl_arr));
p0 = osm_node_get_physp_ptr(node, 1);
for (out = 1; out < num_ports; out++) {
p = osm_node_get_physp_ptr(node, out);
if (!p)
continue;
if (ib_port_info_get_port_state(&p->port_info) ==
IB_LINK_DOWN)
continue;
op_vl = ib_port_info_get_op_vls(&p->port_info);
op_vl_arr[op_vl]++;
if (op_vl_arr[op_vl] > max_num){
max_num = op_vl_arr[op_vl];
common_op_vl = op_vl;
/* remember the port with most common op_vl */
p0 = p;
}
}
if (!p0)
return -1;
force_update = node->sw->need_update || sm->p_subn->need_update;
if (sl2vl_update_table(sm, p0, p0->port_num, 0x30000, force_update,
&qcfg->sl2vl, port_mad_list))
ret = -1;
/*
* Overwrite default ALL configuration if port's
* op_vl is different.
*/
for (out = 1; out < num_ports; out++) {
p = osm_node_get_physp_ptr(node, out);
if (!p)
continue;
if (ib_port_info_get_port_state(&p->port_info) ==
IB_LINK_DOWN)
continue;
force_update = p->need_update || force_update;
if (ib_port_info_get_op_vls(&p->port_info) !=
common_op_vl &&
sl2vl_update_table(sm, p, p->port_num, 0x20000 | out,
force_update, &qcfg->sl2vl,
port_mad_list))
ret = -1;
}
return ret;
}
/* non optimized sl2vl configuration */
out = ib_switch_info_is_enhanced_port0(&node->sw->switch_info) ? 0 : 1;
for (; out < num_ports; out++) {
p = osm_node_get_physp_ptr(node, out);
if (!p)
continue;
if (ib_port_info_get_port_state(&p->port_info) == IB_LINK_DOWN)
continue;
force_update = p->need_update || sm->p_subn->need_update;
/* go over all in ports */
for (in = 0; in < num_ports; in++) {
const ib_slvl_table_t *port_sl2vl = &qcfg->sl2vl;
ib_slvl_table_t routing_sl2vl;
if (re->update_sl2vl) {
routing_sl2vl = *port_sl2vl;
re->update_sl2vl(re->context,
p, in, out, &routing_sl2vl);
port_sl2vl = &routing_sl2vl;
}
if (sl2vl_update_table(sm, p, in, in << 8 | out,
force_update, port_sl2vl,
port_mad_list))
ret = -1;
}
}
return ret;
}
static int qos_endport_setup(osm_sm_t * sm, osm_physp_t * p,
const struct qos_config *qcfg, int vlarb_only,
cl_qlist_t *port_mad_list)
{
unsigned force_update = p->need_update || sm->p_subn->need_update;
struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used;
const ib_slvl_table_t *port_sl2vl = &qcfg->sl2vl;
ib_slvl_table_t routing_sl2vl;
p->vl_high_limit = qcfg->vl_high_limit;
if (vlarb_update(sm, p, 0, force_update, qcfg, port_mad_list))
return -1;
if (vlarb_only)
return 0;
if (!(p->port_info.capability_mask & IB_PORT_CAP_HAS_SL_MAP))
return 0;
if (re && re->update_sl2vl) {
routing_sl2vl = *port_sl2vl;
re->update_sl2vl(re->context, p, 0, 0, &routing_sl2vl);
port_sl2vl = &routing_sl2vl;
}
if (sl2vl_update_table(sm, p, 0, 0, force_update, port_sl2vl,
port_mad_list))
return -1;
return 0;
}
int osm_qos_setup(osm_opensm_t * p_osm)
{
struct qos_config ca_config, sw0_config, swe_config, rtr_config;
struct qos_config *cfg;
cl_qmap_t *p_tbl;
cl_map_item_t *p_next;
osm_port_t *p_port;
osm_node_t *p_node;
int ret = 0;
int vlarb_only;
qos_mad_list_t *p_list, *p_list_next;
qos_mad_item_t *p_port_mad;
cl_qlist_t qos_mad_list;
if (!p_osm->subn.opt.qos)
return 0;
OSM_LOG_ENTER(&p_osm->log);
qos_build_config(&ca_config, &p_osm->subn.opt.qos_ca_options,
&p_osm->subn.opt.qos_options);
qos_build_config(&sw0_config, &p_osm->subn.opt.qos_sw0_options,
&p_osm->subn.opt.qos_options);
qos_build_config(&swe_config, &p_osm->subn.opt.qos_swe_options,
&p_osm->subn.opt.qos_options);
qos_build_config(&rtr_config, &p_osm->subn.opt.qos_rtr_options,
&p_osm->subn.opt.qos_options);
cl_qlist_init(&qos_mad_list);
cl_plock_excl_acquire(&p_osm->lock);
/* read QoS policy config file */
osm_qos_parse_policy_file(&p_osm->subn);
p_tbl = &p_osm->subn.port_guid_tbl;
p_next = cl_qmap_head(p_tbl);
while (p_next != cl_qmap_end(p_tbl)) {
vlarb_only = 0;
p_port = (osm_port_t *) p_next;
p_next = cl_qmap_next(p_next);
p_list = (qos_mad_list_t *) malloc(sizeof(*p_list));
if (!p_list) {
cl_plock_release(&p_osm->lock);
return -1;
}
memset(p_list, 0, sizeof(*p_list));
cl_qlist_init(&p_list->port_mad_list);
p_node = p_port->p_node;
if (p_node->sw) {
if (qos_extports_setup(&p_osm->sm, p_node, &swe_config,
&p_list->port_mad_list)) {
cl_plock_release(&p_osm->lock);
ret = -1;
}
/* skip base port 0 */
if (!ib_switch_info_is_enhanced_port0
(&p_node->sw->switch_info))
goto Continue;
if (ib_switch_info_get_opt_sl2vlmapping(&p_node->sw->switch_info) &&
p_osm->sm.p_subn->opt.use_optimized_slvl &&
!memcmp(&swe_config.sl2vl, &sw0_config.sl2vl,
sizeof(swe_config.sl2vl)))
vlarb_only = 1;
cfg = &sw0_config;
} else if (osm_node_get_type(p_node) == IB_NODE_TYPE_ROUTER)
cfg = &rtr_config;
else
cfg = &ca_config;
if (qos_endport_setup(&p_osm->sm, p_port->p_physp, cfg,
vlarb_only, &p_list->port_mad_list)) {
cl_plock_release(&p_osm->lock);
ret = -1;
}
Continue:
/* if MAD list is not empty, add it to the global MAD list */
if (cl_qlist_count(&p_list->port_mad_list)) {
cl_qlist_insert_tail(&qos_mad_list, &p_list->list_item);
} else {
free(p_list);
}
}
while (cl_qlist_count(&qos_mad_list)) {
p_list_next = (qos_mad_list_t *) cl_qlist_head(&qos_mad_list);
while (p_list_next !=
(qos_mad_list_t *) cl_qlist_end(&qos_mad_list)) {
p_list = p_list_next;
p_list_next = (qos_mad_list_t *)
cl_qlist_next(&p_list->list_item);
/* next MAD to send*/
p_port_mad = (qos_mad_item_t *)
cl_qlist_remove_head(&p_list->port_mad_list);
osm_send_req_mad(&p_osm->sm, p_port_mad->p_madw);
osm_qos_mad_delete(&p_port_mad);
/* remove the QoS MAD from global MAD list */
if (cl_qlist_count(&p_list->port_mad_list) == 0) {
cl_qlist_remove_item(&qos_mad_list, &p_list->list_item);
free(p_list);
}
}
}
cl_plock_release(&p_osm->lock);
OSM_LOG_EXIT(&p_osm->log);
return ret;
}
/*
* QoS config stuff
*/
static int parse_one_unsigned(const char *str, char delim, unsigned *val)
{
char *end;
*val = strtoul(str, &end, 0);
if (*end)
end++;
return (int)(end - str);
}
static int parse_vlarb_entry(const char *str, ib_vl_arb_element_t * e)
{
unsigned val;
const char *p = str;
p += parse_one_unsigned(p, ':', &val);
e->vl = val % 15;
p += parse_one_unsigned(p, ',', &val);
e->weight = (uint8_t) val;
return (int)(p - str);
}
static int parse_sl2vl_entry(const char *str, uint8_t * raw)
{
unsigned val1, val2;
const char *p = str;
p += parse_one_unsigned(p, ',', &val1);
p += parse_one_unsigned(p, ',', &val2);
*raw = (val1 << 4) | (val2 & 0xf);
return (int)(p - str);
}
static void qos_build_config(struct qos_config *cfg, osm_qos_options_t * opt,
osm_qos_options_t * dflt)
{
int i;
const char *p;
memset(cfg, 0, sizeof(*cfg));
if (opt->max_vls > 0)
cfg->max_vls = opt->max_vls;
else {
if (dflt->max_vls > 0)
cfg->max_vls = dflt->max_vls;
else
cfg->max_vls = OSM_DEFAULT_QOS_MAX_VLS;
}
if (opt->high_limit >= 0)
cfg->vl_high_limit = (uint8_t) opt->high_limit;
else {
if (dflt->high_limit >= 0)
cfg->vl_high_limit = (uint8_t) dflt->high_limit;
else
cfg->vl_high_limit = (uint8_t) OSM_DEFAULT_QOS_HIGH_LIMIT;
}
if (opt->vlarb_high)
p = opt->vlarb_high;
else {
if (dflt->vlarb_high)
p = dflt->vlarb_high;
else
p = OSM_DEFAULT_QOS_VLARB_HIGH;
}
for (i = 0; i < 2 * IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK; i++) {
p += parse_vlarb_entry(p,
&cfg->vlarb_high[i /
IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK].
vl_entry[i %
IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK]);
}
if (opt->vlarb_low)
p = opt->vlarb_low;
else {
if (dflt->vlarb_low)
p = dflt->vlarb_low;
else
p = OSM_DEFAULT_QOS_VLARB_LOW;
}
for (i = 0; i < 2 * IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK; i++) {
p += parse_vlarb_entry(p,
&cfg->vlarb_low[i /
IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK].
vl_entry[i %
IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK]);
}
p = opt->sl2vl ? opt->sl2vl : dflt->sl2vl;
if (opt->sl2vl)
p = opt->sl2vl;
else {
if (dflt->sl2vl)
p = dflt->sl2vl;
else
p = OSM_DEFAULT_QOS_SL2VL;
}
for (i = 0; i < IB_MAX_NUM_VLS / 2; i++)
p += parse_sl2vl_entry(p, &cfg->sl2vl.raw_vl_by_sl[i]);
}