/*
 * Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2010-2015 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2009 HNR Consulting. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

/*
 * Abstract:
 *    Implementation of OpenSM QoS infrastructure primitives
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif				/* HAVE_CONFIG_H */

#include <stdlib.h>
#include <string.h>
#include <iba/ib_types.h>
#include <complib/cl_qmap.h>
#include <complib/cl_debug.h>
#include <opensm/osm_file_ids.h>
#define FILE_ID OSM_FILE_QOS_C
#include <opensm/osm_opensm.h>
#include <opensm/osm_subnet.h>
#include <opensm/osm_qos_policy.h>

struct qos_config {
	uint8_t max_vls;
	uint8_t vl_high_limit;
	ib_vl_arb_table_t vlarb_high[2];
	ib_vl_arb_table_t vlarb_low[2];
	ib_slvl_table_t sl2vl;
};

typedef struct qos_mad_item {
	cl_list_item_t list_item;
	osm_madw_t *p_madw;
} qos_mad_item_t;

typedef struct qos_mad_list {
	cl_list_item_t list_item;
	cl_qlist_t port_mad_list;
} qos_mad_list_t;

static void qos_build_config(struct qos_config *cfg, osm_qos_options_t * opt,
			     osm_qos_options_t * dflt);

/*
 * QoS primitives
 */
static qos_mad_item_t *osm_qos_mad_create(IN osm_sm_t * sm,
					  IN osm_physp_t * p,
					  IN uint32_t data_size,
					  IN uint8_t * p_data,
					  IN ib_net16_t attr_id,
					  IN uint32_t attr_mod)
{
	qos_mad_item_t *p_mad;
	osm_madw_context_t context;
	osm_madw_t *p_madw;
	osm_node_t *p_node;
	osm_physp_t *physp0;
	ib_net64_t m_key;
	uint32_t timeout = 0;

	p_node = osm_physp_get_node_ptr(p);
	if (osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH &&
	    osm_physp_get_port_num(p) != 0) {
		physp0 = osm_node_get_physp_ptr(p_node, 0);
		m_key = ib_port_info_get_m_key(&physp0->port_info);
	} else
		m_key = ib_port_info_get_m_key(&p->port_info);

	switch (attr_id) {
	case IB_MAD_ATTR_SLVL_TABLE:
		context.slvl_context.node_guid = osm_node_get_node_guid(p_node);
		context.slvl_context.port_guid = osm_physp_get_port_guid(p);
		context.slvl_context.set_method = TRUE;
		if ((attr_mod & 0x30000) != 0)	/* optimized ? */
			timeout = sm->p_subn->opt.long_transaction_timeout;
		break;
	case IB_MAD_ATTR_VL_ARBITRATION:
		context.vla_context.node_guid = osm_node_get_node_guid(p_node);
		context.vla_context.port_guid = osm_physp_get_port_guid(p);
		context.vla_context.set_method = TRUE;
		break;
	default:
		return NULL;
	}

	p_mad = (qos_mad_item_t *) malloc(sizeof(*p_mad));
	if (!p_mad)
		return NULL;
	memset(p_mad, 0, sizeof(*p_mad));

	p_madw = osm_prepare_req_set(sm, osm_physp_get_dr_path_ptr(p),
				     p_data, data_size, attr_id,
				     cl_hton32(attr_mod), FALSE, m_key,
				     timeout, CL_DISP_MSGID_NONE, &context);
	if (p_madw == NULL) {
		free(p_mad);
		return NULL;
	}
	p_mad->p_madw = p_madw;
	return p_mad;
}

static void osm_qos_mad_delete(qos_mad_item_t ** p_item)
{
	free(*p_item);
	*p_item = NULL;
}

static ib_api_status_t vlarb_update_table_block(osm_sm_t * sm, osm_physp_t * p,
						uint8_t port_num,
						unsigned force_update,
						const ib_vl_arb_table_t *
						table_block,
						unsigned block_length,
						unsigned block_num,
						cl_qlist_t *mad_list)
{
	struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used;
	ib_vl_arb_table_t block;
	uint32_t attr_mod;
	unsigned vl_mask, i;
	qos_mad_item_t *p_mad;

	vl_mask = (1 << (ib_port_info_get_op_vls(&p->port_info) - 1)) - 1;

	memset(&block, 0, sizeof(block));
	memcpy(&block, table_block, block_length * sizeof(block.vl_entry[0]));
	if (re && re->update_vlarb)
		re->update_vlarb(re->context, p, port_num, &block,
				 block_length, block_num);

	for (i = 0; i < block_length; i++)
		block.vl_entry[i].vl &= vl_mask;

	if (!force_update &&
	    !memcmp(&p->vl_arb[block_num], &block,
		    block_length * sizeof(block.vl_entry[0])))
		return IB_SUCCESS;

	attr_mod = ((block_num + 1) << 16) | port_num;

	p_mad = osm_qos_mad_create(sm, p, sizeof(block), (uint8_t *) &block,
				   IB_MAD_ATTR_VL_ARBITRATION, attr_mod);
	if (!p_mad)
		return IB_INSUFFICIENT_MEMORY;

	/*
	 * Zero the stored VL Arbitration block, so in case the MAD will
	 * end up with error, we will resend it in the next sweep.
	 */
	memset(&p->vl_arb[block_num], 0,
	       block_length * sizeof(block.vl_entry[0]));

	cl_qlist_insert_tail(mad_list, &p_mad->list_item);

	return IB_SUCCESS;
}

static ib_api_status_t vlarb_update(osm_sm_t * sm, osm_physp_t * p,
				    uint8_t port_num, unsigned force_update,
				    const struct qos_config *qcfg,
				    cl_qlist_t *mad_list)
{
	ib_api_status_t status = IB_SUCCESS;
	ib_port_info_t *p_pi = &p->port_info;
	unsigned len;

	if (ib_port_info_get_vl_cap(p_pi) == 1) {
		/* no VLArb if 1 data VL */
		p->vl_high_limit = 0;
		return status;
	}

	if (p_pi->vl_arb_low_cap > 0) {
		len = p_pi->vl_arb_low_cap < IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK ?
		    p_pi->vl_arb_low_cap : IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK;
		if ((status = vlarb_update_table_block(sm, p, port_num,
						       force_update,
						       &qcfg->vlarb_low[0],
						       len, 0,
						       mad_list)) != IB_SUCCESS)
			return status;
	}
	if (p_pi->vl_arb_low_cap > IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK) {
		len = p_pi->vl_arb_low_cap % IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK;
		if ((status = vlarb_update_table_block(sm, p, port_num,
						       force_update,
						       &qcfg->vlarb_low[1],
						       len, 1,
						       mad_list)) != IB_SUCCESS)
			return status;
	}
	if (p_pi->vl_arb_high_cap > 0) {
		len = p_pi->vl_arb_high_cap < IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK ?
		    p_pi->vl_arb_high_cap : IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK;
		if ((status = vlarb_update_table_block(sm, p, port_num,
						       force_update,
						       &qcfg->vlarb_high[0],
						       len, 2,
						       mad_list)) != IB_SUCCESS)
			return status;
	}
	if (p_pi->vl_arb_high_cap > IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK) {
		len = p_pi->vl_arb_high_cap % IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK;
		if ((status = vlarb_update_table_block(sm, p, port_num,
						       force_update,
						       &qcfg->vlarb_high[1],
						       len, 3,
						       mad_list)) != IB_SUCCESS)
			return status;
	}

	return status;
}

static ib_api_status_t sl2vl_update_table(osm_sm_t * sm, osm_physp_t * p,
					  uint8_t in_port, uint32_t attr_mod,
					  unsigned force_update,
					  const ib_slvl_table_t * sl2vl_table,
					  cl_qlist_t *mad_list)
{
	ib_slvl_table_t tbl, *p_tbl;
	unsigned vl_mask;
	uint8_t vl1, vl2;
	int i;
	qos_mad_item_t *p_mad;

	vl_mask = (1 << (ib_port_info_get_op_vls(&p->port_info) - 1)) - 1;

	for (i = 0; i < IB_MAX_NUM_VLS / 2; i++) {
		vl1 = sl2vl_table->raw_vl_by_sl[i] >> 4;
		vl2 = sl2vl_table->raw_vl_by_sl[i] & 0xf;
		if (vl1 != 15)
			vl1 &= vl_mask;
		if (vl2 != 15)
			vl2 &= vl_mask;
		tbl.raw_vl_by_sl[i] = (vl1 << 4) | vl2;
	}

	p_tbl = osm_physp_get_slvl_tbl(p, in_port);

	if (!force_update && !memcmp(p_tbl, &tbl, sizeof(tbl)))
		return IB_SUCCESS;

	p_mad = osm_qos_mad_create(sm, p, sizeof(tbl), (uint8_t *) &tbl,
				   IB_MAD_ATTR_SLVL_TABLE, attr_mod);
	if (!p_mad)
		return IB_INSUFFICIENT_MEMORY;

	/*
	 * Zero the stored SL2VL block, so in case the MAD will
	 * end up with error, we will resend it in the next sweep.
	 */
	memset(p_tbl, 0, sizeof(tbl));

	cl_qlist_insert_tail(mad_list, &p_mad->list_item);

	return IB_SUCCESS;
}

static int qos_extports_setup(osm_sm_t * sm, osm_node_t *node,
			      const struct qos_config *qcfg,
			      cl_qlist_t *port_mad_list)
{
	osm_physp_t *p0, *p;
	unsigned force_update;
	unsigned num_ports = osm_node_get_num_physp(node);
	struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used;
	int ret = 0;
	unsigned in, out;
	uint8_t op_vl, common_op_vl = 0, max_num = 0;
	uint8_t op_vl_arr[15];

	/*
	 * Do nothing unless the most recent routing attempt was successful.
	 */
	if (!re)
		return ret;

	for (out = 1; out < num_ports; out++) {
		p = osm_node_get_physp_ptr(node, out);
		if (!p)
			continue;
		if (ib_port_info_get_port_state(&p->port_info) == IB_LINK_DOWN)
			continue;
		force_update = p->need_update || sm->p_subn->need_update;
		p->vl_high_limit = qcfg->vl_high_limit;
		if (vlarb_update(sm, p, p->port_num, force_update, qcfg,
				 port_mad_list))
			ret = -1;
	}

	p0 = osm_node_get_physp_ptr(node, 0);
	if (!(p0->port_info.capability_mask & IB_PORT_CAP_HAS_SL_MAP))
		return ret;

	if (ib_switch_info_get_opt_sl2vlmapping(&node->sw->switch_info) &&
	    sm->p_subn->opt.use_optimized_slvl && !re->update_sl2vl) {

		/* we should find the op_vl that is used by majority of ports */
		memset(&op_vl_arr[0], 0, sizeof(op_vl_arr));
		p0 = osm_node_get_physp_ptr(node, 1);

		for (out = 1; out < num_ports; out++) {
			p = osm_node_get_physp_ptr(node, out);
			if (!p)
				continue;
			if (ib_port_info_get_port_state(&p->port_info) ==
			    IB_LINK_DOWN)
				continue;
			op_vl = ib_port_info_get_op_vls(&p->port_info);
			op_vl_arr[op_vl]++;
			if (op_vl_arr[op_vl] > max_num) {
				max_num = op_vl_arr[op_vl];
				common_op_vl = op_vl;
				/* remember the port with most common op_vl */
				p0 = p;
			}
		}
		if (!p0)
			return -1;
		force_update = node->sw->need_update ||
		    sm->p_subn->need_update;
		if (sl2vl_update_table(sm, p0, p0->port_num, 0x30000,
				       force_update, &qcfg->sl2vl,
				       port_mad_list))
			ret = -1;
		/*
		 * Overwrite default ALL configuration if port's
		 * op_vl is different.
		 */
		for (out = 1; out < num_ports; out++) {
			p = osm_node_get_physp_ptr(node, out);
			if (!p)
				continue;
			if (ib_port_info_get_port_state(&p->port_info) ==
			    IB_LINK_DOWN)
				continue;
			force_update = p->need_update || force_update;
			if (ib_port_info_get_op_vls(&p->port_info) !=
			    common_op_vl &&
			    sl2vl_update_table(sm, p, p->port_num,
					       0x20000 | out, force_update,
					       &qcfg->sl2vl, port_mad_list))
				ret = -1;
		}
		return ret;
	}

	/* non optimized sl2vl configuration */
	out = ib_switch_info_is_enhanced_port0(&node->sw->switch_info) ? 0 : 1;
	for (; out < num_ports; out++) {
		p = osm_node_get_physp_ptr(node, out);
		if (!p)
			continue;
		if (ib_port_info_get_port_state(&p->port_info) == IB_LINK_DOWN)
			continue;
		force_update = p->need_update || sm->p_subn->need_update;
		/* go over all in ports */
		for (in = 0; in < num_ports; in++) {
			const ib_slvl_table_t *port_sl2vl = &qcfg->sl2vl;
			ib_slvl_table_t routing_sl2vl;

			if (re->update_sl2vl) {
				routing_sl2vl = *port_sl2vl;
				re->update_sl2vl(re->context, p, in, out,
						 &routing_sl2vl);
				port_sl2vl = &routing_sl2vl;
			}
			if (sl2vl_update_table(sm, p, in, in << 8 | out,
					       force_update, port_sl2vl,
					       port_mad_list))
				ret = -1;
		}
	}

	return ret;
}

static int qos_endport_setup(osm_sm_t * sm, osm_physp_t * p,
			     const struct qos_config *qcfg, int vlarb_only,
			     cl_qlist_t *port_mad_list)
{
	unsigned force_update = p->need_update || sm->p_subn->need_update;
	struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used;
	const ib_slvl_table_t *port_sl2vl = &qcfg->sl2vl;
	ib_slvl_table_t routing_sl2vl;

	p->vl_high_limit = qcfg->vl_high_limit;
	if (vlarb_update(sm, p, 0, force_update, qcfg, port_mad_list))
		return -1;
	if (vlarb_only)
		return 0;

	if (!(p->port_info.capability_mask & IB_PORT_CAP_HAS_SL_MAP))
		return 0;

	if (re && re->update_sl2vl) {
		routing_sl2vl = *port_sl2vl;
		re->update_sl2vl(re->context, p, 0, 0, &routing_sl2vl);
		port_sl2vl = &routing_sl2vl;
	}
	if (sl2vl_update_table(sm, p, 0, 0, force_update, port_sl2vl,
			       port_mad_list))
		return -1;

	return 0;
}

int osm_qos_setup(osm_opensm_t * p_osm)
{
	struct qos_config ca_config, sw0_config, swe_config, rtr_config;
	struct qos_config *cfg;
	cl_qmap_t *p_tbl;
	cl_map_item_t *p_next;
	osm_port_t *p_port;
	osm_node_t *p_node;
	int ret = 0;
	int vlarb_only;
	qos_mad_list_t *p_list, *p_list_next;
	qos_mad_item_t *p_port_mad;
	cl_qlist_t qos_mad_list;

	if (!p_osm->subn.opt.qos)
		return 0;

	OSM_LOG_ENTER(&p_osm->log);

	qos_build_config(&ca_config, &p_osm->subn.opt.qos_ca_options,
			 &p_osm->subn.opt.qos_options);
	qos_build_config(&sw0_config, &p_osm->subn.opt.qos_sw0_options,
			 &p_osm->subn.opt.qos_options);
	qos_build_config(&swe_config, &p_osm->subn.opt.qos_swe_options,
			 &p_osm->subn.opt.qos_options);
	qos_build_config(&rtr_config, &p_osm->subn.opt.qos_rtr_options,
			 &p_osm->subn.opt.qos_options);

	cl_qlist_init(&qos_mad_list);

	cl_plock_excl_acquire(&p_osm->lock);

	/* read QoS policy config file */
	osm_qos_parse_policy_file(&p_osm->subn);

	p_tbl = &p_osm->subn.port_guid_tbl;
	p_next = cl_qmap_head(p_tbl);
	while (p_next != cl_qmap_end(p_tbl)) {
		vlarb_only = 0;
		p_port = (osm_port_t *) p_next;
		p_next = cl_qmap_next(p_next);

		p_list = (qos_mad_list_t *) malloc(sizeof(*p_list));
		if (!p_list) {
			cl_plock_release(&p_osm->lock);
			return -1;
		}
		memset(p_list, 0, sizeof(*p_list));
		cl_qlist_init(&p_list->port_mad_list);

		p_node = p_port->p_node;
		if (p_node->sw) {
			if (qos_extports_setup(&p_osm->sm, p_node, &swe_config,
					       &p_list->port_mad_list)) {
				cl_plock_release(&p_osm->lock);
				ret = -1;
			}

			/* skip base port 0 */
			if (!ib_switch_info_is_enhanced_port0
			    (&p_node->sw->switch_info))
				goto Continue;
			if (ib_switch_info_get_opt_sl2vlmapping
			    (&p_node->sw->switch_info) &&
			    p_osm->sm.p_subn->opt.use_optimized_slvl &&
			    !memcmp(&swe_config.sl2vl, &sw0_config.sl2vl,
				    sizeof(swe_config.sl2vl)))
				vlarb_only = 1;

			cfg = &sw0_config;
		} else if (osm_node_get_type(p_node) == IB_NODE_TYPE_ROUTER)
			cfg = &rtr_config;
		else
			cfg = &ca_config;

		if (qos_endport_setup(&p_osm->sm, p_port->p_physp, cfg,
				      vlarb_only, &p_list->port_mad_list)) {
			cl_plock_release(&p_osm->lock);
			ret = -1;
		}
Continue:
		/* if MAD list is not empty, add it to the global MAD list */
		if (cl_qlist_count(&p_list->port_mad_list)) {
			cl_qlist_insert_tail(&qos_mad_list,
					     &p_list->list_item);
		} else {
			free(p_list);
		}
	}

	while (cl_qlist_count(&qos_mad_list)) {
		p_list_next = (qos_mad_list_t *) cl_qlist_head(&qos_mad_list);
		while (p_list_next !=
		       (qos_mad_list_t *) cl_qlist_end(&qos_mad_list)) {
			p_list = p_list_next;
			p_list_next = (qos_mad_list_t *)
			    cl_qlist_next(&p_list->list_item);
			/* next MAD to send */
			p_port_mad = (qos_mad_item_t *)
			    cl_qlist_remove_head(&p_list->port_mad_list);
			osm_send_req_mad(&p_osm->sm, p_port_mad->p_madw);
			osm_qos_mad_delete(&p_port_mad);
			/* remove the QoS MAD from global MAD list */
			if (cl_qlist_count(&p_list->port_mad_list) == 0) {
				cl_qlist_remove_item(&qos_mad_list,
						     &p_list->list_item);
				free(p_list);
			}
		}
	}

	cl_plock_release(&p_osm->lock);

	OSM_LOG_EXIT(&p_osm->log);

	return ret;
}

/*
 * QoS config stuff
 */
static int parse_one_unsigned(const char *str, char delim, unsigned *val)
{
	char *end;
	*val = strtoul(str, &end, 0);
	if (*end)
		end++;
	return (int)(end - str);
}

static int parse_vlarb_entry(const char *str, ib_vl_arb_element_t * e)
{
	unsigned val;
	const char *p = str;
	p += parse_one_unsigned(p, ':', &val);
	e->vl = val % 15;
	p += parse_one_unsigned(p, ',', &val);
	e->weight = (uint8_t) val;
	return (int)(p - str);
}

static int parse_sl2vl_entry(const char *str, uint8_t * raw)
{
	unsigned val1, val2;
	const char *p = str;
	p += parse_one_unsigned(p, ',', &val1);
	p += parse_one_unsigned(p, ',', &val2);
	*raw = (val1 << 4) | (val2 & 0xf);
	return (int)(p - str);
}

static void qos_build_config(struct qos_config *cfg, osm_qos_options_t * opt,
			     osm_qos_options_t * dflt)
{
	int i;
	const char *p;

	memset(cfg, 0, sizeof(*cfg));

	if (opt->max_vls > 0)
		cfg->max_vls = opt->max_vls;
	else {
		if (dflt->max_vls > 0)
			cfg->max_vls = dflt->max_vls;
		else
			cfg->max_vls = OSM_DEFAULT_QOS_MAX_VLS;
	}

	if (opt->high_limit >= 0)
		cfg->vl_high_limit = (uint8_t) opt->high_limit;
	else {
		if (dflt->high_limit >= 0)
			cfg->vl_high_limit = (uint8_t) dflt->high_limit;
		else
			cfg->vl_high_limit =
			    (uint8_t) OSM_DEFAULT_QOS_HIGH_LIMIT;
	}

	if (opt->vlarb_high)
		p = opt->vlarb_high;
	else {
		if (dflt->vlarb_high)
			p = dflt->vlarb_high;
		else
			p = OSM_DEFAULT_QOS_VLARB_HIGH;
	}
	for (i = 0; i < 2 * IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK; i++) {
		p += parse_vlarb_entry(p,
				       &cfg->vlarb_high[i / IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK].
				       vl_entry[i % IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK]);
	}

	if (opt->vlarb_low)
		p = opt->vlarb_low;
	else {
		if (dflt->vlarb_low)
			p = dflt->vlarb_low;
		else
			p = OSM_DEFAULT_QOS_VLARB_LOW;
	}
	for (i = 0; i < 2 * IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK; i++) {
		p += parse_vlarb_entry(p,
				       &cfg->vlarb_low[i / IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK].
				       vl_entry[i % IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK]);
	}

	if (opt->sl2vl)
		p = opt->sl2vl;
	else {
		if (dflt->sl2vl)
			p = dflt->sl2vl;
		else
			p = OSM_DEFAULT_QOS_SL2VL;
	}
	for (i = 0; i < IB_MAX_NUM_VLS / 2; i++)
		p += parse_sl2vl_entry(p, &cfg->sl2vl.raw_vl_by_sl[i]);
}
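
/*
 * For illustration only: the option strings consumed by qos_build_config()
 * follow the format implied by parse_vlarb_entry() and parse_sl2vl_entry()
 * above.  The sample values below are assumed examples, not taken from any
 * particular opensm.conf:
 *
 *   qos_vlarb_high / qos_vlarb_low - comma-separated "VL:weight" pairs,
 *       e.g. "0:4,1:0,2:0,3:0,4:0,5:0,6:0,7:0"
 *       (each pair fills one ib_vl_arb_element_t via parse_vlarb_entry());
 *
 *   qos_sl2vl - 16 comma-separated VL values, one per SL,
 *       e.g. "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15"
 *       (parse_sl2vl_entry() packs two SL entries per raw_vl_by_sl byte).
 */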