Blob Blame History Raw
/*
 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
 * Copyright (c) 2007      Simula Research Laboratory. All rights reserved.
 * Copyright (c) 2007      Silicon Graphics Inc. All rights reserved.
 * Copyright (c) 2008,2009 System Fabric Works, Inc. All rights reserved.
 * Copyright (c) 2009      HNR Consulting. All rights reserved.
 * Copyright (c) 2009-2011 ZIH, TU Dresden, Federal Republic of Germany. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

/*
 * Abstract:
 *      Implementation of LASH algorithm Calculation functions
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif				/* HAVE_CONFIG_H */

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <complib/cl_debug.h>
#include <complib/cl_qmap.h>
#include <opensm/osm_file_ids.h>
#define FILE_ID OSM_FILE_UCAST_LASH_C
#include <opensm/osm_switch.h>
#include <opensm/osm_opensm.h>
#include <opensm/osm_log.h>
#include <opensm/osm_mesh.h>
#include <opensm/osm_ucast_lash.h>

typedef struct _reachable_dest {
	int switch_id;
	struct _reachable_dest *next;
} reachable_dest_t;

static void connect_switches(lash_t * p_lash, int sw1, int sw2, int phy_port_1)
{
	osm_log_t *p_log = &p_lash->p_osm->log;
	unsigned num = p_lash->switches[sw1]->node->num_links;
	switch_t *s1 = p_lash->switches[sw1];
	mesh_node_t *node = s1->node;
	switch_t *s2;
	link_t *l;
	unsigned int i;

	/*
	 * if doing mesh analysis:
	 *  - do not consider connections to self
	 *  - collapse multiple connections between
	 *    pair of switches to a single locical link
	 */
	if (p_lash->p_osm->subn.opt.do_mesh_analysis) {
		if (sw1 == sw2)
			return;

		/* see if we are already linked to sw2 */
		for (i = 0; i < num; i++) {
			l = node->links[i];

			if (node->links[i]->switch_id == sw2) {
				l->ports[l->num_ports++] = phy_port_1;
				return;
			}
		}
	}

	l = node->links[num];
	l->switch_id = sw2;
	l->link_id = -1;
	l->ports[l->num_ports++] = phy_port_1;

	s2 = p_lash->switches[sw2];
	for (i = 0; i < s2->node->num_links; i++) {
		if (s2->node->links[i]->switch_id == sw1) {
			s2->node->links[i]->link_id = num;
			l->link_id = i;
			break;
		}
	}

	node->num_links++;

	OSM_LOG(p_log, OSM_LOG_VERBOSE,
		"LASH connect: %d, %d, %d\n", sw1, sw2, phy_port_1);
}

static osm_switch_t *get_osm_switch_from_port(const osm_port_t * port)
{
	osm_physp_t *p = port->p_physp;
	if (p->p_node->sw)
		return p->p_node->sw;
	else if (p->p_remote_physp && p->p_remote_physp->p_node->sw)
		return p->p_remote_physp->p_node->sw;
	return NULL;
}

static int cycle_exists(cdg_vertex_t * start, cdg_vertex_t * current,
			cdg_vertex_t * prev, int visit_num)
{
	int i, new_visit_num;
	int cycle_found = 0;

	if (current != NULL && current->visiting_number > 0) {
		if (visit_num > current->visiting_number && current->seen == 0) {
			cycle_found = 1;
		}
	} else {
		if (current == NULL) {
			current = start;
			CL_ASSERT(prev == NULL);
		}

		current->visiting_number = visit_num;

		if (prev != NULL) {
			prev->next = current;
			CL_ASSERT(prev->to == current->from);
			CL_ASSERT(prev->visiting_number > 0);
		}

		new_visit_num = visit_num + 1;

		for (i = 0; i < current->num_deps; i++) {
			cycle_found =
			    cycle_exists(start, current->deps[i].v, current,
					 new_visit_num);
			if (cycle_found == 1)
				i = current->num_deps;
		}

		current->seen = 1;
		if (prev != NULL)
			prev->next = NULL;
	}

	return cycle_found;
}

static inline int get_next_switch(lash_t *p_lash, int sw, int link)
{
	return p_lash->switches[sw]->node->links[link]->switch_id;
}

static void remove_semipermanent_depend_for_sp(lash_t * p_lash, int sw,
					       int dest_switch, int lane)
{
	switch_t **switches = p_lash->switches;
	cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix;
	int i_next_switch, output_link, i, next_link, i_next_next_switch,
	    depend = 0;
	cdg_vertex_t *v;
	int __attribute__((unused)) found;

	output_link = switches[sw]->routing_table[dest_switch].out_link;
	i_next_switch = get_next_switch(p_lash, sw, output_link);

	while (sw != dest_switch) {
		v = cdg_vertex_matrix[lane][sw][i_next_switch];
		CL_ASSERT(v != NULL);

		if (v->num_using_vertex == 1) {

			cdg_vertex_matrix[lane][sw][i_next_switch] = NULL;

			free(v);
		} else {
			v->num_using_vertex--;
			if (i_next_switch != dest_switch) {
				next_link =
				    switches[i_next_switch]->routing_table[dest_switch].out_link;
				i_next_next_switch = get_next_switch(p_lash, i_next_switch, next_link);
				found = 0;

				for (i = 0; i < v->num_deps; i++)
					if (v->deps[i].v ==
					    cdg_vertex_matrix[lane][i_next_switch]
					    [i_next_next_switch]) {
						found = 1;
						depend = i;
					}

				CL_ASSERT(found);

				if (v->deps[depend].num_used == 1) {
					for (i = depend;
					     i < v->num_deps - 1; i++) {
						v->deps[i].v = v->deps[i + 1].v;
						v->deps[i].num_used =
						    v->deps[i + 1].num_used;
					}

					v->num_deps--;
				} else
					v->deps[depend].num_used--;
			}
		}

		sw = i_next_switch;
		output_link = switches[sw]->routing_table[dest_switch].out_link;

		if (sw != dest_switch)
			i_next_switch = get_next_switch(p_lash, sw, output_link);
	}
}

inline static void enqueue(cl_list_t * bfsq, switch_t * sw)
{
	CL_ASSERT(sw->q_state == UNQUEUED);
	sw->q_state = Q_MEMBER;
	cl_list_insert_tail(bfsq, sw);
}

inline static void dequeue(cl_list_t * bfsq, switch_t ** sw)
{
	*sw = (switch_t *) cl_list_remove_head(bfsq);
	CL_ASSERT((*sw)->q_state == Q_MEMBER);
	(*sw)->q_state = MST_MEMBER;
}

static int get_phys_connection(switch_t *sw, int switch_to)
{
	unsigned int i;

	for (i = 0; i < sw->node->num_links; i++)
		if (sw->node->links[i]->switch_id == switch_to)
			return i;
	return i;
}

static void shortest_path(lash_t * p_lash, int ir)
{
	switch_t **switches = p_lash->switches, *sw, *swi;
	unsigned int i;
	cl_list_t bfsq;

	cl_list_construct(&bfsq);
	cl_list_init(&bfsq, 20);

	enqueue(&bfsq, switches[ir]);

	while (!cl_is_list_empty(&bfsq)) {
		dequeue(&bfsq, &sw);
		for (i = 0; i < sw->node->num_links; i++) {
			swi = switches[sw->node->links[i]->switch_id];
			if (swi->q_state == UNQUEUED) {
				enqueue(&bfsq, swi);
				sw->dij_channels[sw->used_channels++] = swi->id;
			}
		}
	}

	cl_list_destroy(&bfsq);
}

static int generate_routing_func_for_mst(lash_t * p_lash, int sw_id,
					 reachable_dest_t ** destinations)
{
	int i, next_switch;
	switch_t *sw = p_lash->switches[sw_id];
	int num_channels = sw->used_channels;
	reachable_dest_t *dest, *i_dest, *concat_dest = NULL, *prev;

	for (i = 0; i < num_channels; i++) {
		next_switch = sw->dij_channels[i];
		if (generate_routing_func_for_mst(p_lash, next_switch, &dest))
			return -1;

		i_dest = dest;
		prev = i_dest;

		while (i_dest != NULL) {
			if (sw->routing_table[i_dest->switch_id].out_link ==
			    NONE)
				sw->routing_table[i_dest->switch_id].out_link =
				    get_phys_connection(sw, next_switch);

			prev = i_dest;
			i_dest = i_dest->next;
		}

		CL_ASSERT(prev->next == NULL);
		prev->next = concat_dest;
		concat_dest = dest;
	}

	i_dest = (reachable_dest_t *) malloc(sizeof(reachable_dest_t));
	if (!i_dest)
		return -1;
	i_dest->switch_id = sw->id;
	i_dest->next = concat_dest;
	*destinations = i_dest;
	return 0;
}

static int generate_cdg_for_sp(lash_t * p_lash, int sw, int dest_switch,
			       int lane)
{
	unsigned num_switches = p_lash->num_switches;
	switch_t **switches = p_lash->switches;
	cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix;
	int next_switch, output_link, j, exists;
	cdg_vertex_t *v, *prev = NULL;

	output_link = switches[sw]->routing_table[dest_switch].out_link;
	next_switch = get_next_switch(p_lash, sw, output_link);

	while (sw != dest_switch) {

		if (cdg_vertex_matrix[lane][sw][next_switch] == NULL) {
			v = calloc(1, sizeof(*v) + (num_switches - 1) * sizeof(v->deps[0]));
			if (!v)
				return -1;
			v->from = sw;
			v->to = next_switch;
			v->temp = 1;
			cdg_vertex_matrix[lane][sw][next_switch] = v;
		} else
			v = cdg_vertex_matrix[lane][sw][next_switch];

		v->num_using_vertex++;

		if (prev != NULL) {
			exists = 0;

			for (j = 0; j < prev->num_deps; j++)
				if (prev->deps[j].v == v) {
					exists = 1;
					prev->deps[j].num_used++;
				}

			if (exists == 0) {
				prev->deps[prev->num_deps].v = v;
				prev->deps[prev->num_deps].num_used++;
				prev->num_deps++;

				CL_ASSERT(prev->num_deps < (int)num_switches);

				if (prev->temp == 0)
					prev->num_temp_depend++;

			}
		}

		sw = next_switch;
		output_link = switches[sw]->routing_table[dest_switch].out_link;

		if (sw != dest_switch) {
			CL_ASSERT(output_link != NONE);
			next_switch = get_next_switch(p_lash, sw, output_link);
		}

		prev = v;
	}
	return 0;
}

static void set_temp_depend_to_permanent_for_sp(lash_t * p_lash, int sw,
						int dest_switch, int lane)
{
	switch_t **switches = p_lash->switches;
	cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix;
	int next_switch, output_link;
	cdg_vertex_t *v;

	output_link = switches[sw]->routing_table[dest_switch].out_link;
	next_switch = get_next_switch(p_lash, sw, output_link);

	while (sw != dest_switch) {
		v = cdg_vertex_matrix[lane][sw][next_switch];
		CL_ASSERT(v != NULL);

		if (v->temp == 1)
			v->temp = 0;
		else
			v->num_temp_depend = 0;

		sw = next_switch;
		output_link = switches[sw]->routing_table[dest_switch].out_link;

		if (sw != dest_switch)
			next_switch = get_next_switch(p_lash, sw, output_link);
	}

}

static void remove_temp_depend_for_sp(lash_t * p_lash, int sw, int dest_switch,
				      int lane)
{
	switch_t **switches = p_lash->switches;
	cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix;
	int next_switch, output_link, i;
	cdg_vertex_t *v;

	output_link = switches[sw]->routing_table[dest_switch].out_link;
	next_switch = get_next_switch(p_lash, sw, output_link);

	while (sw != dest_switch) {
		v = cdg_vertex_matrix[lane][sw][next_switch];
		CL_ASSERT(v != NULL);

		if (v->temp == 1) {
			cdg_vertex_matrix[lane][sw][next_switch] = NULL;
			free(v);
		} else {
			CL_ASSERT(v->num_temp_depend <= v->num_deps);
			v->num_deps = v->num_deps - v->num_temp_depend;
			v->num_temp_depend = 0;
			v->num_using_vertex--;

			for (i = v->num_deps; i < p_lash->num_switches - 1; i++)
				v->deps[i].num_used = 0;
		}

		sw = next_switch;
		output_link = switches[sw]->routing_table[dest_switch].out_link;

		if (sw != dest_switch)
			next_switch = get_next_switch(p_lash, sw, output_link);

	}
}

static int balance_virtual_lanes(lash_t * p_lash, unsigned lanes_needed)
{
	unsigned num_switches = p_lash->num_switches;
	cdg_vertex_t ****cdg_vertex_matrix = p_lash->cdg_vertex_matrix;
	int *num_mst_in_lane = p_lash->num_mst_in_lane;
	int ***virtual_location = p_lash->virtual_location;
	int min_filled_lane, max_filled_lane, trials;
	int old_min_filled_lane, old_max_filled_lane, new_num_min_lane,
	    new_num_max_lane;
	unsigned int i, j;
	int src, dest, start, next_switch, output_link;
	int next_switch2, output_link2;
	int stop = 0, cycle_found;
	int cycle_found2;
	unsigned start_vl = p_lash->p_osm->subn.opt.lash_start_vl;

	max_filled_lane = 0;
	min_filled_lane = lanes_needed - 1;

	trials = num_mst_in_lane[max_filled_lane];
	if (lanes_needed == 1)
		stop = 1;

	while (stop == 0) {
		src = abs(rand()) % (num_switches);
		dest = abs(rand()) % (num_switches);

		while (virtual_location[src][dest][max_filled_lane] != 1) {
			start = dest;
			if (dest == num_switches - 1)
				dest = 0;
			else
				dest++;

			while (dest != start
			       && virtual_location[src][dest][max_filled_lane]
			       != 1) {
				if (dest == num_switches - 1)
					dest = 0;
				else
					dest++;
			}

			if (virtual_location[src][dest][max_filled_lane] != 1) {
				if (src == num_switches - 1)
					src = 0;
				else
					src++;
			}
		}

		if (generate_cdg_for_sp(p_lash, src, dest, min_filled_lane) ||
		    generate_cdg_for_sp(p_lash, dest, src, min_filled_lane))
			return -1;

		output_link = p_lash->switches[src]->routing_table[dest].out_link;
		next_switch = get_next_switch(p_lash, src, output_link);

		output_link2 = p_lash->switches[dest]->routing_table[src].out_link;
		next_switch2 = get_next_switch(p_lash, dest, output_link2);

		CL_ASSERT(cdg_vertex_matrix[min_filled_lane][src][next_switch] != NULL);
		CL_ASSERT(cdg_vertex_matrix[min_filled_lane][dest][next_switch2] != NULL);

		cycle_found =
		    cycle_exists(cdg_vertex_matrix[min_filled_lane][src][next_switch], NULL, NULL,
				 1);
		cycle_found2 =
		    cycle_exists(cdg_vertex_matrix[min_filled_lane][dest][next_switch2], NULL, NULL,
				 1);

		for (i = 0; i < num_switches; i++)
			for (j = 0; j < num_switches; j++)
				if (cdg_vertex_matrix[min_filled_lane][i][j] != NULL) {
					cdg_vertex_matrix[min_filled_lane][i][j]->visiting_number =
					    0;
					cdg_vertex_matrix[min_filled_lane][i][j]->seen = 0;
				}

		if (cycle_found == 1 || cycle_found2 == 1) {
			remove_temp_depend_for_sp(p_lash, src, dest, min_filled_lane);
			remove_temp_depend_for_sp(p_lash, dest, src, min_filled_lane);

			virtual_location[src][dest][max_filled_lane] = 2;
			virtual_location[dest][src][max_filled_lane] = 2;
			trials--;
			trials--;
		} else {
			set_temp_depend_to_permanent_for_sp(p_lash, src, dest, min_filled_lane);
			set_temp_depend_to_permanent_for_sp(p_lash, dest, src, min_filled_lane);

			num_mst_in_lane[max_filled_lane]--;
			num_mst_in_lane[max_filled_lane]--;
			num_mst_in_lane[min_filled_lane]++;
			num_mst_in_lane[min_filled_lane]++;

			remove_semipermanent_depend_for_sp(p_lash, src, dest, max_filled_lane);
			remove_semipermanent_depend_for_sp(p_lash, dest, src, max_filled_lane);
			virtual_location[src][dest][max_filled_lane] = 0;
			virtual_location[dest][src][max_filled_lane] = 0;
			virtual_location[src][dest][min_filled_lane] = 1;
			virtual_location[dest][src][min_filled_lane] = 1;
			p_lash->switches[src]->routing_table[dest].lane = min_filled_lane + start_vl;
			p_lash->switches[dest]->routing_table[src].lane = min_filled_lane + start_vl;
		}

		if (trials == 0)
			stop = 1;
		else {
			if (num_mst_in_lane[max_filled_lane] - num_mst_in_lane[min_filled_lane] <
			    p_lash->balance_limit)
				stop = 1;
		}

		old_min_filled_lane = min_filled_lane;
		old_max_filled_lane = max_filled_lane;

		new_num_min_lane = MAX_INT;
		new_num_max_lane = 0;

		for (i = 0; i < lanes_needed; i++) {

			if (num_mst_in_lane[i] < new_num_min_lane) {
				new_num_min_lane = num_mst_in_lane[i];
				min_filled_lane = i;
			}

			if (num_mst_in_lane[i] > new_num_max_lane) {
				new_num_max_lane = num_mst_in_lane[i];
				max_filled_lane = i;
			}
		}

		if (old_min_filled_lane != min_filled_lane) {
			trials = num_mst_in_lane[max_filled_lane];
			for (i = 0; i < num_switches; i++)
				for (j = 0; j < num_switches; j++)
					if (virtual_location[i][j][max_filled_lane] == 2)
						virtual_location[i][j][max_filled_lane] = 1;
		}

		if (old_max_filled_lane != max_filled_lane) {
			trials = num_mst_in_lane[max_filled_lane];
			for (i = 0; i < num_switches; i++)
				for (j = 0; j < num_switches; j++)
					if (virtual_location[i][j][old_max_filled_lane] == 2)
						virtual_location[i][j][old_max_filled_lane] = 1;
		}
	}
	return 0;
}

static switch_t *switch_create(lash_t * p_lash, unsigned id, osm_switch_t * p_sw)
{
	unsigned num_switches = p_lash->num_switches;
	unsigned num_ports = p_sw->num_ports;
	switch_t *sw;
	unsigned int i;

	sw = malloc(sizeof(*sw) + num_switches * sizeof(sw->routing_table[0]));
	if (!sw)
		return NULL;

	memset(sw, 0, sizeof(*sw));
	for (i = 0; i < num_switches; i++) {
		sw->routing_table[i].out_link = NONE;
		sw->routing_table[i].lane = NONE;
	}

	sw->id = id;
	sw->dij_channels = malloc(num_ports * sizeof(int));
	if (!sw->dij_channels) {
		free(sw);
		return NULL;
	}

	sw->p_sw = p_sw;
	p_sw->priv = sw;

	if (osm_mesh_node_create(p_lash, sw)) {
		free(sw->dij_channels);
		free(sw);
		return NULL;
	}

	return sw;
}

static void switch_delete(lash_t *p_lash, switch_t * sw)
{
	if (sw->dij_channels)
		free(sw->dij_channels);
	free(sw);
}

static void delete_mesh_switches(lash_t *p_lash)
{
	if (p_lash->switches) {
		unsigned id;
		for (id = 0; ((int)id) < p_lash->num_switches; id++)
			if (p_lash->switches[id])
				osm_mesh_node_delete(p_lash,
						     p_lash->switches[id]);
	}
}

static void free_lash_structures(lash_t * p_lash)
{
	unsigned int i, j, k;
	unsigned num_switches = p_lash->num_switches;
	osm_log_t *p_log = &p_lash->p_osm->log;

	OSM_LOG_ENTER(p_log);

	delete_mesh_switches(p_lash);

	/* free cdg_vertex_matrix */
	for (i = 0; i < p_lash->vl_min; i++) {
		for (j = 0; j < num_switches; j++) {
			for (k = 0; k < num_switches; k++)
				if (p_lash->cdg_vertex_matrix[i][j][k])
					free(p_lash->cdg_vertex_matrix[i][j][k]);
			if (p_lash->cdg_vertex_matrix[i][j])
				free(p_lash->cdg_vertex_matrix[i][j]);
		}
		if (p_lash->cdg_vertex_matrix[i])
			free(p_lash->cdg_vertex_matrix[i]);
	}

	if (p_lash->cdg_vertex_matrix)
		free(p_lash->cdg_vertex_matrix);

	/* free virtual_location */
	for (i = 0; i < num_switches; i++) {
		for (j = 0; j < num_switches; j++) {
			if (p_lash->virtual_location[i][j])
				free(p_lash->virtual_location[i][j]);
		}
		if (p_lash->virtual_location[i])
			free(p_lash->virtual_location[i]);
	}
	if (p_lash->virtual_location)
		free(p_lash->virtual_location);

	OSM_LOG_EXIT(p_log);
}

static int init_lash_structures(lash_t * p_lash)
{
	unsigned vl_min = p_lash->vl_min;
	unsigned num_switches = p_lash->num_switches;
	osm_log_t *p_log = &p_lash->p_osm->log;
	int status = 0;
	unsigned int i, j, k;

	OSM_LOG_ENTER(p_log);

	/* initialise cdg_vertex_matrix[num_layers][num_switches][num_switches] */
	p_lash->cdg_vertex_matrix =
	    (cdg_vertex_t ****) malloc(vl_min * sizeof(cdg_vertex_t ***));
	if (p_lash->cdg_vertex_matrix == NULL)
		goto Exit_Mem_Error;
	for (i = 0; i < vl_min; i++) {
		p_lash->cdg_vertex_matrix[i] =
		    (cdg_vertex_t ***) malloc(num_switches *
					      sizeof(cdg_vertex_t **));

		if (p_lash->cdg_vertex_matrix[i] == NULL)
			goto Exit_Mem_Error;
	}

	for (i = 0; i < vl_min; i++) {
		for (j = 0; j < num_switches; j++) {
			p_lash->cdg_vertex_matrix[i][j] =
			    (cdg_vertex_t **) malloc(num_switches *
						     sizeof(cdg_vertex_t *));
			if (p_lash->cdg_vertex_matrix[i][j] == NULL)
				goto Exit_Mem_Error;

			for (k = 0; k < num_switches; k++)
				p_lash->cdg_vertex_matrix[i][j][k] = NULL;
		}
	}

	/*
	 * initialise virtual_location[num_switches][num_switches][num_layers],
	 * default value = 0
	 */
	p_lash->virtual_location =
	    (int ***)malloc(num_switches * sizeof(int ***));
	if (p_lash->virtual_location == NULL)
		goto Exit_Mem_Error;

	for (i = 0; i < num_switches; i++) {
		p_lash->virtual_location[i] =
		    (int **)malloc(num_switches * sizeof(int **));
		if (p_lash->virtual_location[i] == NULL)
			goto Exit_Mem_Error;
	}

	for (i = 0; i < num_switches; i++) {
		for (j = 0; j < num_switches; j++) {
			p_lash->virtual_location[i][j] =
			    (int *)malloc(vl_min * sizeof(int *));
			if (p_lash->virtual_location[i][j] == NULL)
				goto Exit_Mem_Error;
			for (k = 0; k < vl_min; k++)
				p_lash->virtual_location[i][j][k] = 0;
		}
	}

	/* initialise num_mst_in_lane[num_switches], default 0 */
	memset(p_lash->num_mst_in_lane, 0,
	       IB_MAX_NUM_VLS * sizeof(p_lash->num_mst_in_lane[0]));

	goto Exit;

Exit_Mem_Error:
	status = -1;
	OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D01: "
		"Could not allocate required memory for LASH errno %d, errno %d for lack of memory\n",
		errno, ENOMEM);

Exit:
	OSM_LOG_EXIT(p_log);
	return status;
}

static int lash_core(lash_t * p_lash)
{
	osm_log_t *p_log = &p_lash->p_osm->log;
	unsigned num_switches = p_lash->num_switches;
	switch_t **switches = p_lash->switches;
	unsigned lanes_needed = 1;
	unsigned int i, j, k, dest_switch = 0;
	reachable_dest_t *dests, *idest;
	int cycle_found = 0;
	unsigned v_lane;
	int stop = 0, output_link, i_next_switch;
	int output_link2, i_next_switch2;
	int cycle_found2 = 0;
	int status = -1;
	int *switch_bitmap = NULL;	/* Bitmap to check if we have processed this pair */
	unsigned start_vl = p_lash->p_osm->subn.opt.lash_start_vl;

	OSM_LOG_ENTER(p_log);

	if (p_lash->p_osm->subn.opt.do_mesh_analysis && osm_do_mesh_analysis(p_lash)) {
		OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D05: Mesh analysis failed\n");
		goto Exit;
	}

	for (i = 0; i < num_switches; i++) {

		shortest_path(p_lash, i);
		if (generate_routing_func_for_mst(p_lash, i, &dests)) {
			OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D06: "
				"generate_routing_func_for_mst failed\n");
			goto Exit;
		}

		idest = dests;
		while (idest != NULL) {
			dests = dests->next;
			free(idest);
			idest = dests;
		}

		for (j = 0; j < num_switches; j++) {
			switches[j]->used_channels = 0;
			switches[j]->q_state = UNQUEUED;
		}
	}

	switch_bitmap = calloc(num_switches * num_switches, sizeof(int));
	if (!switch_bitmap) {
		OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D04: "
			"Failed allocating switch_bitmap - out of memory\n");
		goto Exit;
	}

	for (i = 0; i < num_switches; i++) {
		for (dest_switch = 0; dest_switch < num_switches; dest_switch++)
			if (dest_switch != i && switch_bitmap[i * num_switches + dest_switch] == 0) {
				v_lane = 0;
				stop = 0;
				while (v_lane < lanes_needed && stop == 0) {
					if (generate_cdg_for_sp(p_lash, i, dest_switch, v_lane) ||
					    generate_cdg_for_sp(p_lash, dest_switch, i, v_lane)) {
						OSM_LOG(p_log, OSM_LOG_ERROR,
							"ERR 4D07: generate_cdg_for_sp failed\n");
						goto Exit;
					}

					output_link =
					    switches[i]->routing_table[dest_switch].out_link;
					output_link2 =
					    switches[dest_switch]->routing_table[i].out_link;

					i_next_switch = get_next_switch(p_lash, i, output_link);
					i_next_switch2 = get_next_switch(p_lash, dest_switch, output_link2);

					CL_ASSERT(p_lash->
						  cdg_vertex_matrix[v_lane][i][i_next_switch] !=
						  NULL);
					CL_ASSERT(p_lash->
						  cdg_vertex_matrix[v_lane][dest_switch]
						  [i_next_switch2] != NULL);

					cycle_found =
					    cycle_exists(p_lash->
							 cdg_vertex_matrix[v_lane][i]
							 [i_next_switch], NULL, NULL, 1);
					cycle_found2 =
					    cycle_exists(p_lash->
							 cdg_vertex_matrix[v_lane][dest_switch]
							 [i_next_switch2], NULL, NULL, 1);

					for (j = 0; j < num_switches; j++)
						for (k = 0; k < num_switches; k++)
							if (p_lash->
							    cdg_vertex_matrix[v_lane][j][k] !=
							    NULL) {
								p_lash->
								    cdg_vertex_matrix[v_lane][j]
								    [k]->visiting_number = 0;
								p_lash->
								    cdg_vertex_matrix[v_lane][j]
								    [k]->seen = 0;
							}

					if (cycle_found == 1 || cycle_found2 == 1) {
						remove_temp_depend_for_sp(p_lash, i, dest_switch,
									  v_lane);
						remove_temp_depend_for_sp(p_lash, dest_switch, i,
									  v_lane);
						v_lane++;
					} else {
						set_temp_depend_to_permanent_for_sp(p_lash, i,
										    dest_switch,
										    v_lane);
						set_temp_depend_to_permanent_for_sp(p_lash,
										    dest_switch, i,
										    v_lane);
						stop = 1;
						p_lash->num_mst_in_lane[v_lane]++;
						p_lash->num_mst_in_lane[v_lane]++;
					}
				}

				switches[i]->routing_table[dest_switch].lane = v_lane + start_vl;
				switches[dest_switch]->routing_table[i].lane = v_lane + start_vl;

				if (cycle_found == 1 || cycle_found2 == 1) {
					if (++lanes_needed > p_lash->vl_min)
						goto Error_Not_Enough_Lanes;

					if (generate_cdg_for_sp(p_lash, i, dest_switch, v_lane) ||
					    generate_cdg_for_sp(p_lash, dest_switch, i, v_lane)) {
						OSM_LOG(p_log, OSM_LOG_ERROR,
							"ERR 4D08: generate_cdg_for_sp failed\n");
						goto Exit;
					}

					set_temp_depend_to_permanent_for_sp(p_lash, i, dest_switch,
									    v_lane);
					set_temp_depend_to_permanent_for_sp(p_lash, dest_switch, i,
									    v_lane);

					p_lash->num_mst_in_lane[v_lane]++;
					p_lash->num_mst_in_lane[v_lane]++;
				}
				p_lash->virtual_location[i][dest_switch][v_lane] = 1;
				p_lash->virtual_location[dest_switch][i][v_lane] = 1;

				switch_bitmap[i * num_switches + dest_switch] = 1;
				switch_bitmap[dest_switch * num_switches + i] = 1;
			}
	}

	for (i = 0; i < lanes_needed; i++)
		OSM_LOG(p_log, OSM_LOG_INFO, "Lanes in layer %d: %d\n",
			i, p_lash->num_mst_in_lane[i]);

	OSM_LOG(p_log, OSM_LOG_INFO,
		"Lanes needed: %d, Balancing\n", lanes_needed);

	if (balance_virtual_lanes(p_lash, lanes_needed)) {
		OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D09: Balancing failed\n");
		goto Exit;
	}

	for (i = 0; i < lanes_needed; i++)
		OSM_LOG(p_log, OSM_LOG_INFO, "Lanes in layer %d: %d\n",
			i, p_lash->num_mst_in_lane[i]);

	status = 0;
	goto Exit;

Error_Not_Enough_Lanes:
	OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D02: "
		"Lane requirements (%d) exceed available lanes (%d)"
		" with starting lane (%d)\n",
		lanes_needed, p_lash->vl_min, start_vl);
Exit:
	if (switch_bitmap)
		free(switch_bitmap);
	OSM_LOG_EXIT(p_log);
	return status;
}

static unsigned get_lash_id(osm_switch_t * p_sw)
{
	return ((switch_t *) p_sw->priv)->id;
}

static int get_next_port(switch_t *sw, int link)
{
	link_t *l = sw->node->links[link];
	int port = l->next_port++;

	/*
	 * note if not doing mesh analysis
	 * then num_ports is always 1
	 */
	if (l->next_port >= l->num_ports)
		l->next_port = 0;

	return l->ports[port];
}

static void populate_fwd_tbls(lash_t * p_lash)
{
	osm_log_t *p_log = &p_lash->p_osm->log;
	osm_subn_t *p_subn = &p_lash->p_osm->subn;
	osm_switch_t *p_sw, *p_next_sw, *p_dst_sw;
	osm_port_t *port;
	uint16_t max_lid_ho, lid;

	OSM_LOG_ENTER(p_log);

	p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl);

	/* Go through each switch individually */
	while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) {
		uint64_t current_guid;
		switch_t *sw;
		p_sw = p_next_sw;
		p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);

		max_lid_ho = p_sw->max_lid_ho;
		current_guid = p_sw->p_node->node_info.port_guid;
		sw = p_sw->priv;

		memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size);

		for (lid = 1; lid <= max_lid_ho; lid++) {
			port = osm_get_port_by_lid_ho(p_subn, lid);
			if (!port)
				continue;

			p_dst_sw = get_osm_switch_from_port(port);
			if (p_dst_sw == p_sw) {
				uint8_t egress_port = port->p_node->sw ? 0 :
					port->p_physp->p_remote_physp->port_num;
				p_sw->new_lft[lid] = egress_port;
				OSM_LOG(p_log, OSM_LOG_VERBOSE,
					"LASH fwd MY SRC SRC GUID 0x%016" PRIx64
					" src lash id (%d), src lid no (%u) src lash port (%d) "
					"DST GUID 0x%016" PRIx64
					" src lash id (%d), src lash port (%d)\n",
					cl_ntoh64(current_guid), -1, lid,
					egress_port, cl_ntoh64(current_guid),
					-1, egress_port);
			} else if (p_dst_sw) {
				unsigned dst_lash_switch_id =
				    get_lash_id(p_dst_sw);
				uint8_t lash_egress_port =
				    (uint8_t) sw->
				    routing_table[dst_lash_switch_id].out_link;
				uint8_t physical_egress_port =
					get_next_port(sw, lash_egress_port);

				p_sw->new_lft[lid] = physical_egress_port;
				OSM_LOG(p_log, OSM_LOG_VERBOSE,
					"LASH fwd SRC GUID 0x%016" PRIx64
					" src lash id (%d), "
					"src lid no (%u) src lash port (%d) "
					"DST GUID 0x%016" PRIx64
					" src lash id (%d), src lash port (%d)\n",
					cl_ntoh64(current_guid), sw->id, lid,
					lash_egress_port,
					cl_ntoh64(p_dst_sw->p_node->node_info.
						  port_guid),
					dst_lash_switch_id,
					physical_egress_port);
			}
		}		/* for */
	}
	OSM_LOG_EXIT(p_log);
}

static void osm_lash_process_switch(lash_t * p_lash, osm_switch_t * p_sw)
{
	osm_log_t *p_log = &p_lash->p_osm->log;
	int i, port_count;
	osm_physp_t *p_current_physp, *p_remote_physp;
	unsigned switch_a_lash_id, switch_b_lash_id;

	OSM_LOG_ENTER(p_log);

	switch_a_lash_id = get_lash_id(p_sw);
	port_count = osm_node_get_num_physp(p_sw->p_node);

	/* starting at port 1, ignoring management port on switch */
	for (i = 1; i < port_count; i++) {

		p_current_physp = osm_node_get_physp_ptr(p_sw->p_node, i);
		if (p_current_physp) {
			p_remote_physp = p_current_physp->p_remote_physp;
			if (p_remote_physp && p_remote_physp->p_node->sw) {
				int physical_port_a_num =
				    osm_physp_get_port_num(p_current_physp);
				int physical_port_b_num =
				    osm_physp_get_port_num(p_remote_physp);
				switch_b_lash_id =
				    get_lash_id(p_remote_physp->p_node->sw);

				connect_switches(p_lash, switch_a_lash_id,
						 switch_b_lash_id,
						 physical_port_a_num);
				OSM_LOG(p_log, OSM_LOG_VERBOSE,
					"LASH SUCCESS connected G 0x%016" PRIx64
					" , lash_id(%u), P(%u) " " to G 0x%016"
					PRIx64 " , lash_id(%u) , P(%u)\n",
					cl_ntoh64(osm_physp_get_port_guid
						  (p_current_physp)),
					switch_a_lash_id, physical_port_a_num,
					cl_ntoh64(osm_physp_get_port_guid
						  (p_remote_physp)),
					switch_b_lash_id, physical_port_b_num);
			}
		}
	}

	OSM_LOG_EXIT(p_log);
}

static void lash_cleanup(lash_t * p_lash)
{
	osm_subn_t *p_subn = &p_lash->p_osm->subn;
	osm_switch_t *p_next_sw, *p_sw;

	/* drop any existing references to old lash switches */
	p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl);
	while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) {
		p_sw = p_next_sw;
		p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
		p_sw->priv = NULL;
	}

	if (p_lash->switches) {
		unsigned id;
		for (id = 0; ((int)id) < p_lash->num_switches; id++)
			if (p_lash->switches[id])
				switch_delete(p_lash, p_lash->switches[id]);
		free(p_lash->switches);
	}
	p_lash->switches = NULL;
}

/*
  static int  discover_network_properties()
  Traverse the topology of the network in order to determine
   - the maximum number of switches,
   - the minimum number of virtual layers
*/

static int discover_network_properties(lash_t * p_lash)
{
	int i, id = 0;
	uint8_t vl_min;
	osm_subn_t *p_subn = &p_lash->p_osm->subn;
	osm_switch_t *p_next_sw, *p_sw;
	osm_log_t *p_log = &p_lash->p_osm->log;

	p_lash->num_switches = cl_qmap_count(&p_subn->sw_guid_tbl);

	p_lash->switches = calloc(p_lash->num_switches, sizeof(switch_t *));
	if (!p_lash->switches)
		return -1;

	vl_min = 5;		/* set to a high value */

	p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl);
	while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) {
		uint16_t port_count;
		p_sw = p_next_sw;
		p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);

		p_lash->switches[id] = switch_create(p_lash, id, p_sw);
		if (!p_lash->switches[id])
			return -1;
		id++;

		port_count = osm_node_get_num_physp(p_sw->p_node);

		/* Note, ignoring port 0. management port */
		for (i = 1; i < port_count; i++) {
			osm_physp_t *p_current_physp =
			    osm_node_get_physp_ptr(p_sw->p_node, i);

			if (p_current_physp
			    && p_current_physp->p_remote_physp) {

				ib_port_info_t *p_port_info =
				    &p_current_physp->port_info;
				uint8_t port_vl_min =
				    ib_port_info_get_op_vls(p_port_info);
				if (port_vl_min && port_vl_min < vl_min)
					vl_min = port_vl_min;
			}
		}		/* for */
	}			/* while */

	vl_min = 1 << (vl_min - 1);
	if (vl_min > 15)
		vl_min = 15;

	if (p_lash->p_osm->subn.opt.lash_start_vl >= vl_min) {
		OSM_LOG(p_log, OSM_LOG_ERROR, "ERR 4D03: "
			"Start VL(%d) too high for min operational vl(%d)\n",
			p_lash->p_osm->subn.opt.lash_start_vl, vl_min);
		return -1;
	}

	p_lash->vl_min = vl_min - p_lash->p_osm->subn.opt.lash_start_vl;

	OSM_LOG(p_log, OSM_LOG_INFO,
		"min operational vl(%d) start vl(%d) max_switches(%d)\n",
		p_lash->vl_min, p_lash->p_osm->subn.opt.lash_start_vl,
		p_lash->num_switches);
	return 0;
}

static void process_switches(lash_t * p_lash)
{
	osm_switch_t *p_sw, *p_next_sw;
	osm_subn_t *p_subn = &p_lash->p_osm->subn;

	/* Go through each switch and process it. i.e build the connection
	   structure required by LASH */
	p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl);
	while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) {
		p_sw = p_next_sw;
		p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);

		osm_lash_process_switch(p_lash, p_sw);
	}
}

static int lash_process(void *context)
{
	lash_t *p_lash = context;
	osm_log_t *p_log = &p_lash->p_osm->log;
	int status = 0;

	OSM_LOG_ENTER(p_log);

	p_lash->balance_limit = 6;

	/* everything starts here */
	lash_cleanup(p_lash);

	status = discover_network_properties(p_lash);
	if (status)
		goto Exit;

	status = init_lash_structures(p_lash);
	if (status)
		goto Exit;

	process_switches(p_lash);

	status = lash_core(p_lash);
	if (status)
		goto Exit;

	populate_fwd_tbls(p_lash);

Exit:
	if (p_lash->vl_min)
		free_lash_structures(p_lash);
	OSM_LOG_EXIT(p_log);

	return status;
}

static lash_t *lash_create(osm_opensm_t * p_osm)
{
	lash_t *p_lash;

	p_lash = calloc(1, sizeof(lash_t));
	if (!p_lash)
		return NULL;

	p_lash->p_osm = p_osm;

	return p_lash;
}

static void lash_delete(void *context)
{
	lash_t *p_lash = context;

	if (p_lash->switches) {
		unsigned id;
		for (id = 0; ((int)id) < p_lash->num_switches; id++)
			if (p_lash->switches[id])
				switch_delete(p_lash, p_lash->switches[id]);
		free(p_lash->switches);
	}

	free(p_lash);
}

static uint8_t get_lash_sl(void *context, uint8_t path_sl_hint,
			   const ib_net16_t slid, const ib_net16_t dlid)
{
	unsigned dst_id;
	unsigned src_id;
	osm_port_t *p_src_port, *p_dst_port;
	osm_switch_t *p_sw;
	lash_t *p_lash = context;
	osm_opensm_t *p_osm = p_lash->p_osm;

	if (!(p_osm->routing_engine_used &&
	      p_osm->routing_engine_used->type == OSM_ROUTING_ENGINE_TYPE_LASH))
		return OSM_DEFAULT_SL;

	p_src_port = osm_get_port_by_lid(&p_osm->subn, slid);
	if (!p_src_port)
		return OSM_DEFAULT_SL;

	p_dst_port = osm_get_port_by_lid(&p_osm->subn, dlid);
	if (!p_dst_port)
		return OSM_DEFAULT_SL;

	p_sw = get_osm_switch_from_port(p_dst_port);
	if (!p_sw || !p_sw->priv)
		return OSM_DEFAULT_SL;
	dst_id = get_lash_id(p_sw);

	p_sw = get_osm_switch_from_port(p_src_port);
	if (!p_sw || !p_sw->priv)
		return OSM_DEFAULT_SL;

	src_id = get_lash_id(p_sw);
	if (src_id == dst_id)
		return p_osm->subn.opt.lash_start_vl;

	return (uint8_t) ((switch_t *) p_sw->priv)->routing_table[dst_id].lane;
}

int osm_ucast_lash_setup(struct osm_routing_engine *r, osm_opensm_t *p_osm)
{
	lash_t *p_lash = lash_create(p_osm);
	if (!p_lash)
		return -1;

	r->context = p_lash;
	r->ucast_build_fwd_tables = lash_process;
	r->path_sl = get_lash_sl;
	r->destroy = lash_delete;

	return 0;
}