/* * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. * Copyright (c) 2007 Simula Research Laboratory. All rights reserved. * Copyright (c) 2007 Silicon Graphics Inc. All rights reserved. * Copyright (c) 2008,2009 System Fabric Works, Inc. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. * Copyright (c) 2009-2011 ZIH, TU Dresden, Federal Republic of Germany. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ /* * Abstract: * Implementation of LASH algorithm Calculation functions */ #if HAVE_CONFIG_H # include #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #define FILE_ID OSM_FILE_UCAST_LASH_C #include #include #include #include #include typedef struct _reachable_dest { int switch_id; struct _reachable_dest *next; } reachable_dest_t; static void connect_switches(lash_t * p_lash, int sw1, int sw2, int phy_port_1) { osm_log_t *p_log = &p_lash->p_osm->log; unsigned num = p_lash->switches[sw1]->node->num_links; switch_t *s1 = p_lash->switches[sw1]; mesh_node_t *node = s1->node; switch_t *s2; link_t *l; unsigned int i; /* * if doing mesh analysis: * - do not consider connections to self * - collapse multiple connections between * pair of switches to a single locical link */ if (p_lash->p_osm->subn.opt.do_mesh_analysis) { if (sw1 == sw2) return; /* see if we are already linked to sw2 */ for (i = 0; i < num; i++) { l = node->links[i]; if (node->links[i]->switch_id == sw2) { l->ports[l->num_ports++] = phy_port_1; return; } } } l = node->links[num]; l->switch_id = sw2; l->link_id = -1; l->ports[l->num_ports++] = phy_port_1; s2 = p_lash->switches[sw2]; for (i = 0; i < s2->node->num_links; i++) { if (s2->node->links[i]->switch_id == sw1) { s2->node->links[i]->link_id = num; l->link_id = i; break; } } node->num_links++; OSM_LOG(p_log, OSM_LOG_VERBOSE, "LASH connect: %d, %d, %d\n", sw1, sw2, phy_port_1); } static osm_switch_t *get_osm_switch_from_port(const osm_port_t * port) { osm_physp_t *p = port->p_physp; if (p->p_node->sw) return p->p_node->sw; else if (p->p_remote_physp && p->p_remote_physp->p_node->sw) return p->p_remote_physp->p_node->sw; return NULL; } static int cycle_exists(cdg_vertex_t * start, cdg_vertex_t * current, cdg_vertex_t * prev, int visit_num) { int i, new_visit_num; int cycle_found = 0; if (current != NULL && current->visiting_number > 0) { if (visit_num > current->visiting_number && current->seen == 0) { cycle_found = 1; } } else { if (current == NULL) { current = start; CL_ASSERT(prev == NULL); } current->visiting_number = visit_num; if (prev != NULL) { prev->next = current; CL_ASSERT(prev->to == current->from); CL_ASSERT(prev->visiting_number > 0); } new_visit_num = visit_num + 1; for (i = 0; i < current->num_deps; i++) { cycle_found = cycle_exists(start, current->deps[i].v, current, new_visit_num); if (cycle_found == 1) i = current->num_deps; } current->seen = 1; if (prev != NULL) prev->next = NULL; } return cycle_found; } static inline int get_next_switch(lash_t *p_lash, int sw, int link) { return p_lash->switches[sw]->node->links[link]->switch_id; } static void remove_semipermanent_depend_for_sp(lash_t * p_lash, int sw, int dest_switch, int lane) { switch_t **switches = p_lash->switches; cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix; int i_next_switch, output_link, i, next_link, i_next_next_switch, depend = 0; cdg_vertex_t *v; int __attribute__((unused)) found; output_link = switches[sw]->routing_table[dest_switch].out_link; i_next_switch = get_next_switch(p_lash, sw, output_link); while (sw != dest_switch) { v = cdg_vertex_matrix[lane][sw][i_next_switch]; CL_ASSERT(v != NULL); if (v->num_using_vertex == 1) { cdg_vertex_matrix[lane][sw][i_next_switch] = NULL; free(v); } else { v->num_using_vertex--; if (i_next_switch != dest_switch) { next_link = switches[i_next_switch]->routing_table[dest_switch].out_link; i_next_next_switch = get_next_switch(p_lash, i_next_switch, next_link); found = 0; for (i = 0; i < v->num_deps; i++) if (v->deps[i].v == cdg_vertex_matrix[lane][i_next_switch] [i_next_next_switch]) { found = 1; depend = i; } CL_ASSERT(found); if (v->deps[depend].num_used == 1) { for (i = depend; i < v->num_deps - 1; i++) { v->deps[i].v = v->deps[i + 1].v; v->deps[i].num_used = v->deps[i + 1].num_used; } v->num_deps--; } else v->deps[depend].num_used--; } } sw = i_next_switch; output_link = switches[sw]->routing_table[dest_switch].out_link; if (sw != dest_switch) i_next_switch = get_next_switch(p_lash, sw, output_link); } } inline static void enqueue(cl_list_t * bfsq, switch_t * sw) { CL_ASSERT(sw->q_state == UNQUEUED); sw->q_state = Q_MEMBER; cl_list_insert_tail(bfsq, sw); } inline static void dequeue(cl_list_t * bfsq, switch_t ** sw) { *sw = (switch_t *) cl_list_remove_head(bfsq); CL_ASSERT((*sw)->q_state == Q_MEMBER); (*sw)->q_state = MST_MEMBER; } static int get_phys_connection(switch_t *sw, int switch_to) { unsigned int i; for (i = 0; i < sw->node->num_links; i++) if (sw->node->links[i]->switch_id == switch_to) return i; return i; } static void shortest_path(lash_t * p_lash, int ir) { switch_t **switches = p_lash->switches, *sw, *swi; unsigned int i; cl_list_t bfsq; cl_list_construct(&bfsq); cl_list_init(&bfsq, 20); enqueue(&bfsq, switches[ir]); while (!cl_is_list_empty(&bfsq)) { dequeue(&bfsq, &sw); for (i = 0; i < sw->node->num_links; i++) { swi = switches[sw->node->links[i]->switch_id]; if (swi->q_state == UNQUEUED) { enqueue(&bfsq, swi); sw->dij_channels[sw->used_channels++] = swi->id; } } } cl_list_destroy(&bfsq); } static int generate_routing_func_for_mst(lash_t * p_lash, int sw_id, reachable_dest_t ** destinations) { int i, next_switch; switch_t *sw = p_lash->switches[sw_id]; int num_channels = sw->used_channels; reachable_dest_t *dest, *i_dest, *concat_dest = NULL, *prev; for (i = 0; i < num_channels; i++) { next_switch = sw->dij_channels[i]; if (generate_routing_func_for_mst(p_lash, next_switch, &dest)) return -1; i_dest = dest; prev = i_dest; while (i_dest != NULL) { if (sw->routing_table[i_dest->switch_id].out_link == NONE) sw->routing_table[i_dest->switch_id].out_link = get_phys_connection(sw, next_switch); prev = i_dest; i_dest = i_dest->next; } CL_ASSERT(prev->next == NULL); prev->next = concat_dest; concat_dest = dest; } i_dest = (reachable_dest_t *) malloc(sizeof(reachable_dest_t)); if (!i_dest) return -1; i_dest->switch_id = sw->id; i_dest->next = concat_dest; *destinations = i_dest; return 0; } static int generate_cdg_for_sp(lash_t * p_lash, int sw, int dest_switch, int lane) { unsigned num_switches = p_lash->num_switches; switch_t **switches = p_lash->switches; cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix; int next_switch, output_link, j, exists; cdg_vertex_t *v, *prev = NULL; output_link = switches[sw]->routing_table[dest_switch].out_link; next_switch = get_next_switch(p_lash, sw, output_link); while (sw != dest_switch) { if (cdg_vertex_matrix[lane][sw][next_switch] == NULL) { v = calloc(1, sizeof(*v) + (num_switches - 1) * sizeof(v->deps[0])); if (!v) return -1; v->from = sw; v->to = next_switch; v->temp = 1; cdg_vertex_matrix[lane][sw][next_switch] = v; } else v = cdg_vertex_matrix[lane][sw][next_switch]; v->num_using_vertex++; if (prev != NULL) { exists = 0; for (j = 0; j < prev->num_deps; j++) if (prev->deps[j].v == v) { exists = 1; prev->deps[j].num_used++; } if (exists == 0) { prev->deps[prev->num_deps].v = v; prev->deps[prev->num_deps].num_used++; prev->num_deps++; CL_ASSERT(prev->num_deps < (int)num_switches); if (prev->temp == 0) prev->num_temp_depend++; } } sw = next_switch; output_link = switches[sw]->routing_table[dest_switch].out_link; if (sw != dest_switch) { CL_ASSERT(output_link != NONE); next_switch = get_next_switch(p_lash, sw, output_link); } prev = v; } return 0; } static void set_temp_depend_to_permanent_for_sp(lash_t * p_lash, int sw, int dest_switch, int lane) { switch_t **switches = p_lash->switches; cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix; int next_switch, output_link; cdg_vertex_t *v; output_link = switches[sw]->routing_table[dest_switch].out_link; next_switch = get_next_switch(p_lash, sw, output_link); while (sw != dest_switch) { v = cdg_vertex_matrix[lane][sw][next_switch]; CL_ASSERT(v != NULL); if (v->temp == 1) v->temp = 0; else v->num_temp_depend = 0; sw = next_switch; output_link = switches[sw]->routing_table[dest_switch].out_link; if (sw != dest_switch) next_switch = get_next_switch(p_lash, sw, output_link); } } static void remove_temp_depend_for_sp(lash_t * p_lash, int sw, int dest_switch, int lane) { switch_t **switches = p_lash->switches; cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix; int next_switch, output_link, i; cdg_vertex_t *v; output_link = switches[sw]->routing_table[dest_switch].out_link; next_switch = get_next_switch(p_lash, sw, output_link); while (sw != dest_switch) { v = cdg_vertex_matrix[lane][sw][next_switch]; CL_ASSERT(v != NULL); if (v->temp == 1) { cdg_vertex_matrix[lane][sw][next_switch] = NULL; free(v); } else { CL_ASSERT(v->num_temp_depend <= v->num_deps); v->num_deps = v->num_deps - v->num_temp_depend; v->num_temp_depend = 0; v->num_using_vertex--; for (i = v->num_deps; i < p_lash->num_switches - 1; i++) v->deps[i].num_used = 0; } sw = next_switch; output_link = switches[sw]->routing_table[dest_switch].out_link; if (sw != dest_switch) next_switch = get_next_switch(p_lash, sw, output_link); } } static int balance_virtual_lanes(lash_t * p_lash, unsigned lanes_needed) { unsigned num_switches = p_lash->num_switches; cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix; int *num_mst_in_lane = p_lash->num_mst_in_lane; int ***virtual_location = p_lash->virtual_location; int min_filled_lane, max_filled_lane, trials; int old_min_filled_lane, old_max_filled_lane, new_num_min_lane, new_num_max_lane; unsigned int i, j; int src, dest, start, next_switch, output_link; int next_switch2, output_link2; int stop = 0, cycle_found; int cycle_found2; unsigned start_vl = p_lash->p_osm->subn.opt.lash_start_vl; max_filled_lane = 0; min_filled_lane = lanes_needed - 1; trials = num_mst_in_lane[max_filled_lane]; if (lanes_needed == 1) stop = 1; while (stop == 0) { src = abs(rand()) % (num_switches); dest = abs(rand()) % (num_switches); while (virtual_location[src][dest][max_filled_lane] != 1) { start = dest; if (dest == num_switches - 1) dest = 0; else dest++; while (dest != start && virtual_location[src][dest][max_filled_lane] != 1) { if (dest == num_switches - 1) dest = 0; else dest++; } if (virtual_location[src][dest][max_filled_lane] != 1) { if (src == num_switches - 1) src = 0; else src++; } } if (generate_cdg_for_sp(p_lash, src, dest, min_filled_lane) || generate_cdg_for_sp(p_lash, dest, src, min_filled_lane)) return -1; output_link = p_lash->switches[src]->routing_table[dest].out_link; next_switch = get_next_switch(p_lash, src, output_link); output_link2 = p_lash->switches[dest]->routing_table[src].out_link; next_switch2 = get_next_switch(p_lash, dest, output_link2); CL_ASSERT(cdg_vertex_matrix[min_filled_lane][src][next_switch] != NULL); CL_ASSERT(cdg_vertex_matrix[min_filled_lane][dest][next_switch2] != NULL); cycle_found = cycle_exists(cdg_vertex_matrix[min_filled_lane][src][next_switch], NULL, NULL, 1); cycle_found2 = cycle_exists(cdg_vertex_matrix[min_filled_lane][dest][next_switch2], NULL, NULL, 1); for (i = 0; i < num_switches; i++) for (j = 0; j < num_switches; j++) if (cdg_vertex_matrix[min_filled_lane][i][j] != NULL) { cdg_vertex_matrix[min_filled_lane][i][j]->visiting_number = 0; cdg_vertex_matrix[min_filled_lane][i][j]->seen = 0; } if (cycle_found == 1 || cycle_found2 == 1) { remove_temp_depend_for_sp(p_lash, src, dest, min_filled_lane); remove_temp_depend_for_sp(p_lash, dest, src, min_filled_lane); virtual_location[src][dest][max_filled_lane] = 2; virtual_location[dest][src][max_filled_lane] = 2; trials--; trials--; } else { set_temp_depend_to_permanent_for_sp(p_lash, src, dest, min_filled_lane); set_temp_depend_to_permanent_for_sp(p_lash, dest, src, min_filled_lane); num_mst_in_lane[max_filled_lane]--; num_mst_in_lane[max_filled_lane]--; num_mst_in_lane[min_filled_lane]++; num_mst_in_lane[min_filled_lane]++; remove_semipermanent_depend_for_sp(p_lash, src, dest, max_filled_lane); remove_semipermanent_depend_for_sp(p_lash, dest, src, max_filled_lane); virtual_location[src][dest][max_filled_lane] = 0; virtual_location[dest][src][max_filled_lane] = 0; virtual_location[src][dest][min_filled_lane] = 1; virtual_location[dest][src][min_filled_lane] = 1; p_lash->switches[src]->routing_table[dest].lane = min_filled_lane + start_vl; p_lash->switches[dest]->routing_table[src].lane = min_filled_lane + start_vl; } if (trials == 0) stop = 1; else { if (num_mst_in_lane[max_filled_lane] - num_mst_in_lane[min_filled_lane] < p_lash->balance_limit) stop = 1; } old_min_filled_lane = min_filled_lane; old_max_filled_lane = max_filled_lane; new_num_min_lane = MAX_INT; new_num_max_lane = 0; for (i = 0; i < lanes_needed; i++) { if (num_mst_in_lane[i] < new_num_min_lane) { new_num_min_lane = num_mst_in_lane[i]; min_filled_lane = i; } if (num_mst_in_lane[i] > new_num_max_lane) { new_num_max_lane = num_mst_in_lane[i]; max_filled_lane = i; } } if (old_min_filled_lane != min_filled_lane) { trials = num_mst_in_lane[max_filled_lane]; for (i = 0; i < num_switches; i++) for (j = 0; j < num_switches; j++) if (virtual_location[i][j][max_filled_lane] == 2) virtual_location[i][j][max_filled_lane] = 1; } if (old_max_filled_lane != max_filled_lane) { trials = num_mst_in_lane[max_filled_lane]; for (i = 0; i < num_switches; i++) for (j = 0; j < num_switches; j++) if (virtual_location[i][j][old_max_filled_lane] == 2) virtual_location[i][j][old_max_filled_lane] = 1; } } return 0; } static switch_t *switch_create(lash_t * p_lash, unsigned id, osm_switch_t * p_sw) { unsigned num_switches = p_lash->num_switches; unsigned num_ports = p_sw->num_ports; switch_t *sw; unsigned int i; sw = malloc(sizeof(*sw) + num_switches * sizeof(sw->routing_table[0])); if (!sw) return NULL; memset(sw, 0, sizeof(*sw)); for (i = 0; i < num_switches; i++) { sw->routing_table[i].out_link = NONE; sw->routing_table[i].lane = NONE; } sw->id = id; sw->dij_channels = malloc(num_ports * sizeof(int)); if (!sw->dij_channels) { free(sw); return NULL; } sw->p_sw = p_sw; p_sw->priv = sw; if (osm_mesh_node_create(p_lash, sw)) { free(sw->dij_channels); free(sw); return NULL; } return sw; } static void switch_delete(lash_t *p_lash, switch_t * sw) { if (sw->dij_channels) free(sw->dij_channels); free(sw); } static void delete_mesh_switches(lash_t *p_lash) { if (p_lash->switches) { unsigned id; for (id = 0; ((int)id) < p_lash->num_switches; id++) if (p_lash->switches[id]) osm_mesh_node_delete(p_lash, p_lash->switches[id]); } } static void free_lash_structures(lash_t * p_lash) { unsigned int i, j, k; unsigned num_switches = p_lash->num_switches; osm_log_t *p_log = &p_lash->p_osm->log; OSM_LOG_ENTER(p_log); delete_mesh_switches(p_lash); /* free cdg_vertex_matrix */ for (i = 0; i < p_lash->vl_min; i++) { for (j = 0; j < num_switches; j++) { for (k = 0; k < num_switches; k++) if (p_lash->cdg_vertex_matrix[i][j][k]) free(p_lash->cdg_vertex_matrix[i][j][k]); if (p_lash->cdg_vertex_matrix[i][j]) free(p_lash->cdg_vertex_matrix[i][j]); } if (p_lash->cdg_vertex_matrix[i]) free(p_lash->cdg_vertex_matrix[i]); } if (p_lash->cdg_vertex_matrix) free(p_lash->cdg_vertex_matrix); /* free virtual_location */ for (i = 0; i < num_switches; i++) { for (j = 0; j < num_switches; j++) { if (p_lash->virtual_location[i][j]) free(p_lash->virtual_location[i][j]); } if (p_lash->virtual_location[i]) free(p_lash->virtual_location[i]); } if (p_lash->virtual_location) free(p_lash->virtual_location); OSM_LOG_EXIT(p_log); } static int init_lash_structures(lash_t * p_lash) { unsigned vl_min = p_lash->vl_min; unsigned num_switches = p_lash->num_switches; osm_log_t *p_log = &p_lash->p_osm->log; int status = 0; unsigned int i, j, k; OSM_LOG_ENTER(p_log); /* initialise cdg_vertex_matrix[num_layers][num_switches][num_switches] */ p_lash->cdg_vertex_matrix = (cdg_vertex_t ****) malloc(vl_min * sizeof(cdg_vertex_t ***)); if (p_lash->cdg_vertex_matrix == NULL) goto Exit_Mem_Error; for (i = 0; i < vl_min; i++) { p_lash->cdg_vertex_matrix[i] = (cdg_vertex_t ***) malloc(num_switches * sizeof(cdg_vertex_t **)); if (p_lash->cdg_vertex_matrix[i] == NULL) goto Exit_Mem_Error; } for (i = 0; i < vl_min; i++) { for (j = 0; j < num_switches; j++) { p_lash->cdg_vertex_matrix[i][j] = (cdg_vertex_t **) malloc(num_switches * sizeof(cdg_vertex_t *)); if (p_lash->cdg_vertex_matrix[i][j] == NULL) goto Exit_Mem_Error; for (k = 0; k < num_switches; k++) p_lash->cdg_vertex_matrix[i][j][k] = NULL; } } /* * initialise virtual_location[num_switches][num_switches][num_layers], * default value = 0 */ p_lash->virtual_location = (int ***)malloc(num_switches * sizeof(int ***)); if (p_lash->virtual_location == NULL) goto Exit_Mem_Error; for (i = 0; i < num_switches; i++) { p_lash->virtual_location[i] = (int **)malloc(num_switches * sizeof(int **)); if (p_lash->virtual_location[i] == NULL) goto Exit_Mem_Error; } for (i = 0; i < num_switches; i++) { for (j = 0; j < num_switches; j++) { p_lash->virtual_location[i][j] = (int *)malloc(vl_min * sizeof(int *)); if (p_lash->virtual_location[i][j] == NULL) goto Exit_Mem_Error; for (k = 0; k < vl_min; k++) p_lash->virtual_location[i][j][k] = 0; } } /* initialise num_mst_in_lane[num_switches], default 0 */ memset(p_lash->num_mst_in_lane, 0, IB_MAX_NUM_VLS * sizeof(p_lash->num_mst_in_lane[0])); goto Exit; Exit_Mem_Error: status = -1; OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D01: " "Could not allocate required memory for LASH errno %d, errno %d for lack of memory\n", errno, ENOMEM); Exit: OSM_LOG_EXIT(p_log); return status; } static int lash_core(lash_t * p_lash) { osm_log_t *p_log = &p_lash->p_osm->log; unsigned num_switches = p_lash->num_switches; switch_t **switches = p_lash->switches; unsigned lanes_needed = 1; unsigned int i, j, k, dest_switch = 0; reachable_dest_t *dests, *idest; int cycle_found = 0; unsigned v_lane; int stop = 0, output_link, i_next_switch; int output_link2, i_next_switch2; int cycle_found2 = 0; int status = -1; int *switch_bitmap = NULL; /* Bitmap to check if we have processed this pair */ unsigned start_vl = p_lash->p_osm->subn.opt.lash_start_vl; OSM_LOG_ENTER(p_log); if (p_lash->p_osm->subn.opt.do_mesh_analysis && osm_do_mesh_analysis(p_lash)) { OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D05: Mesh analysis failed\n"); goto Exit; } for (i = 0; i < num_switches; i++) { shortest_path(p_lash, i); if (generate_routing_func_for_mst(p_lash, i, &dests)) { OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D06: " "generate_routing_func_for_mst failed\n"); goto Exit; } idest = dests; while (idest != NULL) { dests = dests->next; free(idest); idest = dests; } for (j = 0; j < num_switches; j++) { switches[j]->used_channels = 0; switches[j]->q_state = UNQUEUED; } } switch_bitmap = calloc(num_switches * num_switches, sizeof(int)); if (!switch_bitmap) { OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D04: " "Failed allocating switch_bitmap - out of memory\n"); goto Exit; } for (i = 0; i < num_switches; i++) { for (dest_switch = 0; dest_switch < num_switches; dest_switch++) if (dest_switch != i && switch_bitmap[i * num_switches + dest_switch] == 0) { v_lane = 0; stop = 0; while (v_lane < lanes_needed && stop == 0) { if (generate_cdg_for_sp(p_lash, i, dest_switch, v_lane) || generate_cdg_for_sp(p_lash, dest_switch, i, v_lane)) { OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D07: generate_cdg_for_sp failed\n"); goto Exit; } output_link = switches[i]->routing_table[dest_switch].out_link; output_link2 = switches[dest_switch]->routing_table[i].out_link; i_next_switch = get_next_switch(p_lash, i, output_link); i_next_switch2 = get_next_switch(p_lash, dest_switch, output_link2); CL_ASSERT(p_lash-> cdg_vertex_matrix[v_lane][i][i_next_switch] != NULL); CL_ASSERT(p_lash-> cdg_vertex_matrix[v_lane][dest_switch] [i_next_switch2] != NULL); cycle_found = cycle_exists(p_lash-> cdg_vertex_matrix[v_lane][i] [i_next_switch], NULL, NULL, 1); cycle_found2 = cycle_exists(p_lash-> cdg_vertex_matrix[v_lane][dest_switch] [i_next_switch2], NULL, NULL, 1); for (j = 0; j < num_switches; j++) for (k = 0; k < num_switches; k++) if (p_lash-> cdg_vertex_matrix[v_lane][j][k] != NULL) { p_lash-> cdg_vertex_matrix[v_lane][j] [k]->visiting_number = 0; p_lash-> cdg_vertex_matrix[v_lane][j] [k]->seen = 0; } if (cycle_found == 1 || cycle_found2 == 1) { remove_temp_depend_for_sp(p_lash, i, dest_switch, v_lane); remove_temp_depend_for_sp(p_lash, dest_switch, i, v_lane); v_lane++; } else { set_temp_depend_to_permanent_for_sp(p_lash, i, dest_switch, v_lane); set_temp_depend_to_permanent_for_sp(p_lash, dest_switch, i, v_lane); stop = 1; p_lash->num_mst_in_lane[v_lane]++; p_lash->num_mst_in_lane[v_lane]++; } } switches[i]->routing_table[dest_switch].lane = v_lane + start_vl; switches[dest_switch]->routing_table[i].lane = v_lane + start_vl; if (cycle_found == 1 || cycle_found2 == 1) { if (++lanes_needed > p_lash->vl_min) goto Error_Not_Enough_Lanes; if (generate_cdg_for_sp(p_lash, i, dest_switch, v_lane) || generate_cdg_for_sp(p_lash, dest_switch, i, v_lane)) { OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D08: generate_cdg_for_sp failed\n"); goto Exit; } set_temp_depend_to_permanent_for_sp(p_lash, i, dest_switch, v_lane); set_temp_depend_to_permanent_for_sp(p_lash, dest_switch, i, v_lane); p_lash->num_mst_in_lane[v_lane]++; p_lash->num_mst_in_lane[v_lane]++; } p_lash->virtual_location[i][dest_switch][v_lane] = 1; p_lash->virtual_location[dest_switch][i][v_lane] = 1; switch_bitmap[i * num_switches + dest_switch] = 1; switch_bitmap[dest_switch * num_switches + i] = 1; } } for (i = 0; i < lanes_needed; i++) OSM_LOG(p_log, OSM_LOG_INFO, "Lanes in layer %d: %d\n", i, p_lash->num_mst_in_lane[i]); OSM_LOG(p_log, OSM_LOG_INFO, "Lanes needed: %d, Balancing\n", lanes_needed); if (balance_virtual_lanes(p_lash, lanes_needed)) { OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D09: Balancing failed\n"); goto Exit; } for (i = 0; i < lanes_needed; i++) OSM_LOG(p_log, OSM_LOG_INFO, "Lanes in layer %d: %d\n", i, p_lash->num_mst_in_lane[i]); status = 0; goto Exit; Error_Not_Enough_Lanes: OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D02: " "Lane requirements (%d) exceed available lanes (%d)" " with starting lane (%d)\n", lanes_needed, p_lash->vl_min, start_vl); Exit: if (switch_bitmap) free(switch_bitmap); OSM_LOG_EXIT(p_log); return status; } static unsigned get_lash_id(osm_switch_t * p_sw) { return ((switch_t *) p_sw->priv)->id; } static int get_next_port(switch_t *sw, int link) { link_t *l = sw->node->links[link]; int port = l->next_port++; /* * note if not doing mesh analysis * then num_ports is always 1 */ if (l->next_port >= l->num_ports) l->next_port = 0; return l->ports[port]; } static void populate_fwd_tbls(lash_t * p_lash) { osm_log_t *p_log = &p_lash->p_osm->log; osm_subn_t *p_subn = &p_lash->p_osm->subn; osm_switch_t *p_sw, *p_next_sw, *p_dst_sw; osm_port_t *port; uint16_t max_lid_ho, lid; OSM_LOG_ENTER(p_log); p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl); /* Go through each switch individually */ while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) { uint64_t current_guid; switch_t *sw; p_sw = p_next_sw; p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); max_lid_ho = p_sw->max_lid_ho; current_guid = p_sw->p_node->node_info.port_guid; sw = p_sw->priv; memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size); for (lid = 1; lid <= max_lid_ho; lid++) { port = osm_get_port_by_lid_ho(p_subn, lid); if (!port) continue; p_dst_sw = get_osm_switch_from_port(port); if (p_dst_sw == p_sw) { uint8_t egress_port = port->p_node->sw ? 0 : port->p_physp->p_remote_physp->port_num; p_sw->new_lft[lid] = egress_port; OSM_LOG(p_log, OSM_LOG_VERBOSE, "LASH fwd MY SRC SRC GUID 0x%016" PRIx64 " src lash id (%d), src lid no (%u) src lash port (%d) " "DST GUID 0x%016" PRIx64 " src lash id (%d), src lash port (%d)\n", cl_ntoh64(current_guid), -1, lid, egress_port, cl_ntoh64(current_guid), -1, egress_port); } else if (p_dst_sw) { unsigned dst_lash_switch_id = get_lash_id(p_dst_sw); uint8_t lash_egress_port = (uint8_t) sw-> routing_table[dst_lash_switch_id].out_link; uint8_t physical_egress_port = get_next_port(sw, lash_egress_port); p_sw->new_lft[lid] = physical_egress_port; OSM_LOG(p_log, OSM_LOG_VERBOSE, "LASH fwd SRC GUID 0x%016" PRIx64 " src lash id (%d), " "src lid no (%u) src lash port (%d) " "DST GUID 0x%016" PRIx64 " src lash id (%d), src lash port (%d)\n", cl_ntoh64(current_guid), sw->id, lid, lash_egress_port, cl_ntoh64(p_dst_sw->p_node->node_info. port_guid), dst_lash_switch_id, physical_egress_port); } } /* for */ } OSM_LOG_EXIT(p_log); } static void osm_lash_process_switch(lash_t * p_lash, osm_switch_t * p_sw) { osm_log_t *p_log = &p_lash->p_osm->log; int i, port_count; osm_physp_t *p_current_physp, *p_remote_physp; unsigned switch_a_lash_id, switch_b_lash_id; OSM_LOG_ENTER(p_log); switch_a_lash_id = get_lash_id(p_sw); port_count = osm_node_get_num_physp(p_sw->p_node); /* starting at port 1, ignoring management port on switch */ for (i = 1; i < port_count; i++) { p_current_physp = osm_node_get_physp_ptr(p_sw->p_node, i); if (p_current_physp) { p_remote_physp = p_current_physp->p_remote_physp; if (p_remote_physp && p_remote_physp->p_node->sw) { int physical_port_a_num = osm_physp_get_port_num(p_current_physp); int physical_port_b_num = osm_physp_get_port_num(p_remote_physp); switch_b_lash_id = get_lash_id(p_remote_physp->p_node->sw); connect_switches(p_lash, switch_a_lash_id, switch_b_lash_id, physical_port_a_num); OSM_LOG(p_log, OSM_LOG_VERBOSE, "LASH SUCCESS connected G 0x%016" PRIx64 " , lash_id(%u), P(%u) " " to G 0x%016" PRIx64 " , lash_id(%u) , P(%u)\n", cl_ntoh64(osm_physp_get_port_guid (p_current_physp)), switch_a_lash_id, physical_port_a_num, cl_ntoh64(osm_physp_get_port_guid (p_remote_physp)), switch_b_lash_id, physical_port_b_num); } } } OSM_LOG_EXIT(p_log); } static void lash_cleanup(lash_t * p_lash) { osm_subn_t *p_subn = &p_lash->p_osm->subn; osm_switch_t *p_next_sw, *p_sw; /* drop any existing references to old lash switches */ p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl); while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) { p_sw = p_next_sw; p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); p_sw->priv = NULL; } if (p_lash->switches) { unsigned id; for (id = 0; ((int)id) < p_lash->num_switches; id++) if (p_lash->switches[id]) switch_delete(p_lash, p_lash->switches[id]); free(p_lash->switches); } p_lash->switches = NULL; } /* static int discover_network_properties() Traverse the topology of the network in order to determine - the maximum number of switches, - the minimum number of virtual layers */ static int discover_network_properties(lash_t * p_lash) { int i, id = 0; uint8_t vl_min; osm_subn_t *p_subn = &p_lash->p_osm->subn; osm_switch_t *p_next_sw, *p_sw; osm_log_t *p_log = &p_lash->p_osm->log; p_lash->num_switches = cl_qmap_count(&p_subn->sw_guid_tbl); p_lash->switches = calloc(p_lash->num_switches, sizeof(switch_t *)); if (!p_lash->switches) return -1; vl_min = 5; /* set to a high value */ p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl); while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) { uint16_t port_count; p_sw = p_next_sw; p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); p_lash->switches[id] = switch_create(p_lash, id, p_sw); if (!p_lash->switches[id]) return -1; id++; port_count = osm_node_get_num_physp(p_sw->p_node); /* Note, ignoring port 0. management port */ for (i = 1; i < port_count; i++) { osm_physp_t *p_current_physp = osm_node_get_physp_ptr(p_sw->p_node, i); if (p_current_physp && p_current_physp->p_remote_physp) { ib_port_info_t *p_port_info = &p_current_physp->port_info; uint8_t port_vl_min = ib_port_info_get_op_vls(p_port_info); if (port_vl_min && port_vl_min < vl_min) vl_min = port_vl_min; } } /* for */ } /* while */ vl_min = 1 << (vl_min - 1); if (vl_min > 15) vl_min = 15; if (p_lash->p_osm->subn.opt.lash_start_vl >= vl_min) { OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D03: " "Start VL(%d) too high for min operational vl(%d)\n", p_lash->p_osm->subn.opt.lash_start_vl, vl_min); return -1; } p_lash->vl_min = vl_min - p_lash->p_osm->subn.opt.lash_start_vl; OSM_LOG(p_log, OSM_LOG_INFO, "min operational vl(%d) start vl(%d) max_switches(%d)\n", p_lash->vl_min, p_lash->p_osm->subn.opt.lash_start_vl, p_lash->num_switches); return 0; } static void process_switches(lash_t * p_lash) { osm_switch_t *p_sw, *p_next_sw; osm_subn_t *p_subn = &p_lash->p_osm->subn; /* Go through each switch and process it. i.e build the connection structure required by LASH */ p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl); while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) { p_sw = p_next_sw; p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item); osm_lash_process_switch(p_lash, p_sw); } } static int lash_process(void *context) { lash_t *p_lash = context; osm_log_t *p_log = &p_lash->p_osm->log; int status = 0; OSM_LOG_ENTER(p_log); p_lash->balance_limit = 6; /* everything starts here */ lash_cleanup(p_lash); status = discover_network_properties(p_lash); if (status) goto Exit; status = init_lash_structures(p_lash); if (status) goto Exit; process_switches(p_lash); status = lash_core(p_lash); if (status) goto Exit; populate_fwd_tbls(p_lash); Exit: if (p_lash->vl_min) free_lash_structures(p_lash); OSM_LOG_EXIT(p_log); return status; } static lash_t *lash_create(osm_opensm_t * p_osm) { lash_t *p_lash; p_lash = calloc(1, sizeof(lash_t)); if (!p_lash) return NULL; p_lash->p_osm = p_osm; return p_lash; } static void lash_delete(void *context) { lash_t *p_lash = context; if (p_lash->switches) { unsigned id; for (id = 0; ((int)id) < p_lash->num_switches; id++) if (p_lash->switches[id]) switch_delete(p_lash, p_lash->switches[id]); free(p_lash->switches); } free(p_lash); } static uint8_t get_lash_sl(void *context, uint8_t path_sl_hint, const ib_net16_t slid, const ib_net16_t dlid) { unsigned dst_id; unsigned src_id; osm_port_t *p_src_port, *p_dst_port; osm_switch_t *p_sw; lash_t *p_lash = context; osm_opensm_t *p_osm = p_lash->p_osm; if (!(p_osm->routing_engine_used && p_osm->routing_engine_used->type == OSM_ROUTING_ENGINE_TYPE_LASH)) return OSM_DEFAULT_SL; p_src_port = osm_get_port_by_lid(&p_osm->subn, slid); if (!p_src_port) return OSM_DEFAULT_SL; p_dst_port = osm_get_port_by_lid(&p_osm->subn, dlid); if (!p_dst_port) return OSM_DEFAULT_SL; p_sw = get_osm_switch_from_port(p_dst_port); if (!p_sw || !p_sw->priv) return OSM_DEFAULT_SL; dst_id = get_lash_id(p_sw); p_sw = get_osm_switch_from_port(p_src_port); if (!p_sw || !p_sw->priv) return OSM_DEFAULT_SL; src_id = get_lash_id(p_sw); if (src_id == dst_id) return p_osm->subn.opt.lash_start_vl; return (uint8_t) ((switch_t *) p_sw->priv)->routing_table[dst_id].lane; } int osm_ucast_lash_setup(struct osm_routing_engine *r, osm_opensm_t *p_osm) { lash_t *p_lash = lash_create(p_osm); if (!p_lash) return -1; r->context = p_lash; r->ucast_build_fwd_tables = lash_process; r->path_sl = get_lash_sl; r->destroy = lash_delete; return 0; }