Hey Sasha,

I was originally going to submit this later on, but given the recent
"rebalancing switch connections" threads, I figured now was perhaps a
good time to post this patch.

When we turned on LMC > 0, we noticed that sometimes more of a port's
lids would be forwarded through one parent switch than through another.
For example, suppose LMC = 2 and we are trying to route lids (1,2,3,4).
The lids can be forwarded out of 8 ports, which go to two different
switches.  We would see something like this:

switch port 1 (to switch A): 1
switch port 2 (to switch A): 3
switch port 3 (to switch A): 4
switch port 4 (to switch A):
switch port 5 (to switch B): 2
switch port 6 (to switch B):
switch port 7 (to switch B):
switch port 8 (to switch B):

This occurs because the routing for LMC only favors those sys_guids and
node_guids that have not been seen before.  But it does not consider how
many times we have routed through a sys_guid/node_guid before.

The patch is fairly straightforward.  We just count how many times we
have forwarded to a sys_guid/node_guid before.  If a port has the same
number of paths as another port, but its remote sys_guid/node_guid has
not been forwarded to as often, we pick that port.  Most of the patch
is architectural changes.  I stuff the sys_guid, node_guid, and a
counter inside one struct and array, because we can't count properly
using the multiple uint64_t arrays from before.

Thanks,
Al

-- 
Albert Chu
[EMAIL PROTECTED]
925-422-5311
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory
From 662577241ee4083820a007ba3c54cc1b5c2cdd6a Mon Sep 17 00:00:00 2001
From: Albert L. Chu <[EMAIL PROTECTED]>
Date: Mon, 3 Mar 2008 10:39:43 -0800
Subject: [PATCH] support balanced multi-lid routing


Signed-off-by: Albert L. Chu <[EMAIL PROTECTED]>
---
 opensm/include/opensm/osm_switch.h |   55 ++++++++---
 opensm/opensm/osm_dump.c           |    2 +-
 opensm/opensm/osm_switch.c         |  190 +++++++++++++++++++++++++++---------
 opensm/opensm/osm_ucast_mgr.c      |   51 ++++------
 4 files changed, 205 insertions(+), 93 deletions(-)

diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
index e2fe86d..a4a6404 100644
--- a/opensm/include/opensm/osm_switch.h
+++ b/opensm/include/opensm/osm_switch.h
@@ -158,6 +158,33 @@ typedef struct _osm_switch {
 *	Switch object
 *********/
 
+/****s* OpenSM: Switch/osm_switch_guid_count_t
+* NAME
+*       osm_switch_guid_count_t
+*
+* DESCRIPTION
+*       Stores system and node guids and the number of
+*	times a switch has forwarded to it.
+*
+* SYNOPSIS
+*/
+typedef struct _osm_switch_guid_count {
+	uint64_t sys_guid;
+	uint64_t node_guid;
+	unsigned int forwarded_to;
+} osm_switch_guid_count_t;
+/*
+* FIELDS
+*       sys_guid 
+*               A system guid.
+*
+*       node_guid
+*               A node guid.
+*
+*       forwarded_to
+*               A count of lids forwarded to the sys_guid/node_guid.
+*********/
+
 /****f* OpenSM: Switch/osm_switch_delete
 * NAME
 *	osm_switch_delete
@@ -959,10 +986,9 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 			  IN const uint16_t lid_ho,
 			  IN const boolean_t ignore_existing,
 			  IN const boolean_t dor,
-			  IN OUT uint64_t * remote_sys_guids,
-			  IN OUT uint16_t * p_num_used_sys,
-			  IN OUT uint64_t * remote_node_guids,
-			  IN OUT uint16_t * p_num_used_nodes);
+			  IN OUT osm_switch_guid_count_t * remote_guids,
+			  IN OUT uint16_t * p_num_remote_guids,
+			  IN OUT osm_switch_guid_count_t ** p_remote_guid_count_used);
 /*
 * PARAMETERS
 *	p_sw
@@ -984,21 +1010,18 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 *	dor
 *		[in] If TRUE, Dimension Order Routing will be done.
 *
-*	remote_sys_guids
-*		[in out] The array of remote system guids already used to
-*		route the other lids of the same target port (if LMC > 0).
+*	remote_guids
+*		[in out] The array of remote guids already used to route
+*		the other lids of the same target port (if LMC > 0 and
+*		lid_balancing TRUE)
 *
-*	p_num_used_sys
-*		[in out] The number of remote systems used for routing to
+*	p_num_remote_guids
+*		[in out] The number of remote guids used for routing to
 *		the port.
 *
-*	remote_node_guids
-*		[in out] The array of remote node guids already used to route
-*		the other lids of the same target port (if LMC > 0).
-*
-*	p_num_used_nodes
-*		[in out] The number of remote nodes used for routing to
-*		the port.
+*	p_remote_guid_count_used
+*		[in out] The specific osm_switch_guid_count_t used
+*		in switch recommendations.
 *
 * RETURN VALUE
 *	Returns the recommended port on which to route this LID.
diff --git a/opensm/opensm/osm_dump.c b/opensm/opensm/osm_dump.c
index 9f638b3..b74dc67 100644
--- a/opensm/opensm/osm_dump.c
+++ b/opensm/opensm/osm_dump.c
@@ -236,7 +236,7 @@ static void dump_ucast_routes(cl_map_item_t * p_map_item, void *cxt)
 			best_port = osm_switch_recommend_path(p_sw, p_port,
 							      lid_ho, TRUE, dor,
 							      NULL, NULL,
-							      NULL, NULL);
+							      NULL);
 			fprintf(file, "No %u hop path possible via port %u!",
 				best_hops, best_port);
 		}
diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
index d74cb6c..d98341a 100644
--- a/opensm/opensm/osm_switch.c
+++ b/opensm/opensm/osm_switch.c
@@ -219,16 +219,104 @@ osm_switch_get_fwd_tbl_block(IN const osm_switch_t * const p_sw,
 
 /**********************************************************************
  **********************************************************************/
+static osm_switch_guid_count_t *
+osm_switch_find_guid_common(IN const osm_switch_t * const p_sw,
+			    IN osm_switch_guid_count_t * remote_guids,
+			    IN uint16_t * p_num_remote_guids,
+			    IN uint8_t port_num,
+			    IN int find_sys_guid,
+			    IN int find_node_guid)
+{	
+	osm_switch_guid_count_t *p_remote_guid = NULL;
+	osm_physp_t *p_physp;
+	osm_physp_t *p_rem_physp;
+	osm_node_t *p_rem_node;
+	uint64_t sys_guid;
+	uint64_t node_guid;
+	int i;
+	
+	CL_ASSERT(p_sw);
+	CL_ASSERT(remote_guids);
+	CL_ASSERT(p_num_remote_guids);
+
+	p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
+	p_rem_physp = osm_physp_get_remote(p_physp);
+	p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
+	sys_guid = p_rem_node->node_info.sys_guid;
+	node_guid = p_rem_node->node_info.node_guid;
+
+	for (i = 0; i < *p_num_remote_guids; i++) {
+		if ((!find_sys_guid 
+		     || remote_guids[i].sys_guid == sys_guid)
+		    && (!find_node_guid
+			|| remote_guids[i].node_guid == node_guid)) {
+			p_remote_guid = &remote_guids[i];
+			break;
+		}
+		if (remote_guids[i].sys_guid == sys_guid
+		    && remote_guids[i].node_guid == node_guid) {
+			p_remote_guid = &remote_guids[i];
+			break;
+		}
+	}
+
+	return p_remote_guid;
+}
+
+static osm_switch_guid_count_t *
+osm_switch_find_sys_guid_count(IN const osm_switch_t * const p_sw,
+			       IN osm_switch_guid_count_t * remote_guids,
+			       IN uint16_t * p_num_remote_guids,
+			       IN uint8_t port_num)
+{
+	return osm_switch_find_guid_common(p_sw,
+					   remote_guids,
+					   p_num_remote_guids,
+					   port_num,
+					   1,
+					   0);
+}
+
+static osm_switch_guid_count_t *
+osm_switch_find_node_guid_count(IN const osm_switch_t * const p_sw,
+				IN osm_switch_guid_count_t * remote_guids,
+				IN uint16_t * p_num_remote_guids,
+				IN uint8_t port_num)
+{
+	return osm_switch_find_guid_common(p_sw,
+					   remote_guids,
+					   p_num_remote_guids,
+					   port_num,
+					   0,
+					   1);
+}
+
+static osm_switch_guid_count_t *
+osm_switch_find_guid_count(IN const osm_switch_t * const p_sw,
+			   IN osm_switch_guid_count_t * remote_guids,
+			   IN uint16_t * p_num_remote_guids,
+			   IN uint8_t port_num)
+{
+	return osm_switch_find_guid_common(p_sw,
+					   remote_guids,
+					   p_num_remote_guids,
+					   port_num,
+					   1,
+					   1);
+}
+			   
+
+/**********************************************************************
+ **********************************************************************/
 uint8_t
 osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 			  IN osm_port_t * p_port,
 			  IN const uint16_t lid_ho,
 			  IN const boolean_t ignore_existing,
 			  IN const boolean_t dor,
-			  IN OUT uint64_t * remote_sys_guids,
-			  IN OUT uint16_t * p_num_used_sys,
-			  IN OUT uint64_t * remote_node_guids,
-			  IN OUT uint16_t * p_num_used_nodes)
+			  IN OUT osm_switch_guid_count_t * remote_guids,
+			  IN OUT uint16_t * p_num_remote_guids,
+			  IN OUT osm_switch_guid_count_t ** p_remote_guid_count_used)
 {
 	/*
 	   We support an enhanced LMC aware routing mode:
@@ -237,14 +325,12 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 	   and try and avoid routing again through the same
 	   system / node.
 
-	   If the procedure is provided with the tracking arrays
-	   and counters we can conduct this algorithm.
+	   If this procedure is provided with the tracking array
+	   and counter we can conduct this algorithm.
 	 */
-	boolean_t routing_for_lmc = remote_sys_guids && remote_node_guids &&
-	    p_num_used_sys && p_num_used_nodes;
-	boolean_t sys_used, node_used;
+	boolean_t routing_for_lmc = remote_guids && p_num_remote_guids
+		&& p_remote_guid_count_used;
 	uint16_t base_lid;
-	uint16_t i;
 	uint8_t hops;
 	uint8_t least_hops;
 	uint8_t port_num;
@@ -256,6 +342,7 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 	 */
 	uint32_t least_paths_other_sys = 0xFFFFFFFF;
 	uint32_t least_paths_other_nodes = 0xFFFFFFFF;
+	uint32_t least_forwarded_to = 0xFFFFFFFF;
 	uint32_t check_count;
 	uint8_t best_port = 0;
 	/*
@@ -269,6 +356,7 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 	osm_physp_t *p_rem_physp;
 	osm_node_t *p_rem_node;
 	osm_node_t *p_rem_node_first = NULL;
+	osm_switch_guid_count_t *p_remote_guid = NULL;
 
 	CL_ASSERT(lid_ho > 0);
 
@@ -381,50 +469,38 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 		   it.
 		 */
 		if (routing_for_lmc) {
-#if 0
-			printf("LID:0x%X SYS:%d NODE:%d\n", lid_ho,
-			       *p_num_used_sys, *p_num_used_nodes);
-#endif
-
-			/* Get the Remote Node */
-			p_rem_physp = osm_physp_get_remote(p_physp);
-			p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
-
 			/* Is the sys guid already used ? */
-			sys_used = FALSE;
-			for (i = 0; !sys_used && (i < *p_num_used_sys); i++)
-				if (!memcmp(&p_rem_node->node_info.sys_guid,
-					    &remote_sys_guids[i],
-					    sizeof(uint64_t)))
-					sys_used = TRUE;
+			p_remote_guid = osm_switch_find_sys_guid_count(p_sw,
+								       remote_guids,
+								       p_num_remote_guids,
+								       port_num);
 
 			/* If not update the least hops for this case */
-			if (!sys_used) {
+			if (!p_remote_guid) {
 				if (check_count < least_paths_other_sys) {
 					least_paths_other_sys = check_count;
 					best_port_other_sys = port_num;
+					least_forwarded_to = 0;
 				}
 			} else {	/* same sys found - try node */
 				/* Else is the node guid already used ? */
-				node_used = FALSE;
-				for (i = 0;
-				     !node_used && (i < *p_num_used_nodes); i++)
-					if (!memcmp
-					    (&p_rem_node->node_info.node_guid,
-					     &remote_node_guids[i],
-					     sizeof(uint64_t)))
-						node_used = TRUE;
+				p_remote_guid = osm_switch_find_node_guid_count(p_sw,
+										remote_guids,
+										p_num_remote_guids,
+										port_num);
 
 				/* If not update the least hops for this case */
-				if (!node_used
+				if (!p_remote_guid
 				    && check_count < least_paths_other_nodes) {
 					least_paths_other_nodes = check_count;
 					best_port_other_node = port_num;
+					least_forwarded_to = 0;
 				}
+				/* else prior sys and node guid already used */
 
 			}	/* same sys found */
 		}
-
+	
 		/* routing for LMC mode */
 		/*
 		   the count is min but also lower then the max subscribed
@@ -447,6 +523,17 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 			port_found = TRUE;
 			best_port = port_num;
 			least_paths = check_count;
+			if (routing_for_lmc 
+			    && p_remote_guid
+			    && p_remote_guid->forwarded_to < least_forwarded_to)
+				least_forwarded_to = p_remote_guid->forwarded_to;
+		}
+		else if (routing_for_lmc 
+			 && p_remote_guid
+			 && check_count == least_paths
+			 && p_remote_guid->forwarded_to < least_forwarded_to) {
+			least_forwarded_to = p_remote_guid->forwarded_to;
+			best_port = port_num;
 		}
 	}
 
@@ -465,17 +552,28 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
 			best_port = best_port_other_node;
 
 		/* track the remote node and system of the port used. */
-		p_physp = osm_node_get_physp_ptr(p_sw->p_node, best_port);
-		p_rem_physp = osm_physp_get_remote(p_physp);
-		p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
-		memcpy(&remote_node_guids[*p_num_used_nodes],
-		       &(p_rem_node->node_info.node_guid), sizeof(uint64_t));
-		(*p_num_used_nodes)++;
-		memcpy(&remote_sys_guids[*p_num_used_sys],
-		       &(p_rem_node->node_info.sys_guid), sizeof(uint64_t));
-		(*p_num_used_sys)++;
+		p_remote_guid = osm_switch_find_guid_count(p_sw,
+							   remote_guids,
+							   p_num_remote_guids,
+							   best_port);
+
+		if (!p_remote_guid) {
+			/* track the remote node and system of the port used. */
+			p_physp = osm_node_get_physp_ptr(p_sw->p_node, best_port);
+			p_rem_physp = osm_physp_get_remote(p_physp);
+			p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
+			memcpy(&(remote_guids[*p_num_remote_guids].sys_guid),
+			       &(p_rem_node->node_info.sys_guid),
+			       sizeof(uint64_t));
+			memcpy(&(remote_guids[*p_num_remote_guids].node_guid),
+				       &(p_rem_node->node_info.node_guid),
+			       sizeof(uint64_t));
+			remote_guids[*p_num_remote_guids].forwarded_to = 0;
+			(*p_num_remote_guids)++;
+		}
+		*p_remote_guid_count_used = p_remote_guid;
 	}
-
+	
 	return (best_port);
 }
 
diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
index 1aa5ea9..d7fc4d3 100644
--- a/opensm/opensm/osm_ucast_mgr.c
+++ b/opensm/opensm/osm_ucast_mgr.c
@@ -209,31 +209,21 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
 	   in providing better routing in LMC > 0 situations
 	 */
 	uint16_t lids_per_port = 1 << p_mgr->p_subn->opt.lmc;
-	uint64_t *remote_sys_guids = NULL;
-	uint64_t *remote_node_guids = NULL;
-	uint16_t num_used_sys = 0;
-	uint16_t num_used_nodes = 0;
+	osm_switch_guid_count_t *remote_guids = NULL;
+	uint16_t num_used_guids = 0;
+	osm_switch_guid_count_t *p_remote_guid_used = NULL;
 
 	OSM_LOG_ENTER(p_mgr->p_log);
 
 	if (lids_per_port > 1) {
-		remote_sys_guids = malloc(sizeof(uint64_t) * lids_per_port);
-		if (remote_sys_guids == NULL) {
-			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A09: "
+		remote_guids = malloc(sizeof(osm_switch_guid_count_t) * lids_per_port);
+		if (remote_guids == NULL) {
+			osm_log(p_mgr->p_log, OSM_LOG_ERROR,
+				"__osm_ucast_mgr_process_port: ERR 3A0B: "
 				"Cannot allocate array. Insufficient memory\n");
 			goto Exit;
 		}
-
-		memset(remote_sys_guids, 0, sizeof(uint64_t) * lids_per_port);
-
-		remote_node_guids = malloc(sizeof(uint64_t) * lids_per_port);
-		if (remote_node_guids == NULL) {
-			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A0A: "
-				"Cannot allocate array. Insufficient memory\n");
-			goto Exit;
-		}
-
-		memset(remote_node_guids, 0, sizeof(uint64_t) * lids_per_port);
+		memset(remote_guids, 0, sizeof(osm_switch_guid_count_t) * lids_per_port);
 	}
 
 	osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho);
@@ -272,22 +262,22 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
 	 */
 	for (lid_ho = min_lid_ho; lid_ho <= max_lid_ho; lid_ho++) {
 		/* Use the enhanced algorithm only for LMC > 0 */
-		if (lids_per_port > 1)
+		if (lids_per_port > 1) {
+			p_remote_guid_used = NULL;
 			port = osm_switch_recommend_path(p_sw, p_port, lid_ho,
 							 p_mgr->p_subn->
 							 ignore_existing_lfts,
 							 p_mgr->is_dor,
-							 remote_sys_guids,
-							 &num_used_sys,
-							 remote_node_guids,
-							 &num_used_nodes);
+							 remote_guids,
+							 &num_used_guids,
+							 &p_remote_guid_used);
+		}
 		else
 			port = osm_switch_recommend_path(p_sw, p_port, lid_ho,
 							 p_mgr->p_subn->
 							 ignore_existing_lfts,
 							 p_mgr->is_dor,
-							 NULL, NULL, NULL,
-							 NULL);
+							 NULL, NULL, NULL);
 
 		/*
 		   There might be no path to the target
@@ -341,15 +331,16 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
 		   Write it to the forwarding tables.
 		 */
 		p_mgr->lft_buf[lid_ho] = port;
-		if (!is_ignored_by_port_prof)
+		if (!is_ignored_by_port_prof) {
 			osm_switch_count_path(p_sw, port);
+			if (p_remote_guid_used)
+				p_remote_guid_used->forwarded_to++;
+		}
 	}
 
 Exit:
-	if (remote_sys_guids)
-		free(remote_sys_guids);
-	if (remote_node_guids)
-		free(remote_node_guids);
+	if (remote_guids)
+		free(remote_guids);
 	OSM_LOG_EXIT(p_mgr->p_log);
 }
 
-- 
1.5.1

_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to