Hey Sasha,
I was originally going to submit this later on, but given the recent
"rebalancing switch connections" threads, I figured now was perhaps a
good time to post this patch.
When we turned on LMC > 0, we noticed that sometimes more lids from a
port would be forwarded through one parent switch than another. For
example, suppose LMC = 2 and we are trying to route lids (1,2,3,4). The
lids can be forwarded out of 8 ports, which go to two different
switches. We would see something like this:
switch port 1 (to switch A): 1
switch port 2 (to switch A): 3
switch port 3 (to switch A): 4
switch port 4 (to switch A):
switch port 5 (to switch B): 2
switch port 6 (to switch B):
switch port 7 (to switch B):
switch port 8 (to switch B):
This occurs because the routing for LMC only favors those sys_guids and
node_guids that have not been seen before. But it does not consider how
many times we have routed through a sys_guid/node_guid before.
The patch is fairly straightforward. We just count how many times we
have forwarded to a sys_guid/node_guid before. If a port has the same
number of paths as another port, but its remote sys_guid/node_guid has
not been forwarded to as much, we pick that port. Most of the patch is architectural
changes. I stuff the sys_guid, node_guid, and a counter inside one
struct and array, because we can't count properly using the multiple
uint64_t arrays from before.
Thanks,
Al
--
Albert Chu
[EMAIL PROTECTED]
925-422-5311
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory
>From 662577241ee4083820a007ba3c54cc1b5c2cdd6a Mon Sep 17 00:00:00 2001
From: Albert L. Chu <[EMAIL PROTECTED]>
Date: Mon, 3 Mar 2008 10:39:43 -0800
Subject: [PATCH] support balanced multi-lid routing
Signed-off-by: Albert L. Chu <[EMAIL PROTECTED]>
---
opensm/include/opensm/osm_switch.h | 55 ++++++++---
opensm/opensm/osm_dump.c | 2 +-
opensm/opensm/osm_switch.c | 190 +++++++++++++++++++++++++++---------
opensm/opensm/osm_ucast_mgr.c | 51 ++++------
4 files changed, 205 insertions(+), 93 deletions(-)
diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
index e2fe86d..a4a6404 100644
--- a/opensm/include/opensm/osm_switch.h
+++ b/opensm/include/opensm/osm_switch.h
@@ -158,6 +158,33 @@ typedef struct _osm_switch {
* Switch object
*********/
+/****s* OpenSM: Switch/osm_switch_guid_count_t
+* NAME
+* osm_switch_guid_count_t
+*
+* DESCRIPTION
+* Stores system and node guids and the number of
+* times a switch has forwarded to it.
+*
+* SYNOPSIS
+*/
+typedef struct _osm_switch_guid_count {
+ uint64_t sys_guid;
+ uint64_t node_guid;
+ unsigned int forwarded_to;
+} osm_switch_guid_count_t;
+/*
+* FIELDS
+* sys_guid
+* A system guid.
+*
+* node_guid
+* A node guid.
+*
+* forwarded_to
+* A count of lids forwarded to the sys_guid/node_guid.
+*********/
+
/****f* OpenSM: Switch/osm_switch_delete
* NAME
* osm_switch_delete
@@ -959,10 +986,9 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
IN const uint16_t lid_ho,
IN const boolean_t ignore_existing,
IN const boolean_t dor,
- IN OUT uint64_t * remote_sys_guids,
- IN OUT uint16_t * p_num_used_sys,
- IN OUT uint64_t * remote_node_guids,
- IN OUT uint16_t * p_num_used_nodes);
+ IN OUT osm_switch_guid_count_t * remote_guids,
+ IN OUT uint16_t * p_num_remote_guids,
+ IN OUT osm_switch_guid_count_t ** p_remote_guid_count_used);
/*
* PARAMETERS
* p_sw
@@ -984,21 +1010,18 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
* dor
* [in] If TRUE, Dimension Order Routing will be done.
*
-* remote_sys_guids
-* [in out] The array of remote system guids already used to
-* route the other lids of the same target port (if LMC > 0).
+* remote_guids
+* [in out] The array of remote guids (with forward counts)
+* already used to route the other lids of the same target
+* port (if LMC > 0).
*
-* p_num_used_sys
-* [in out] The number of remote systems used for routing to
+* p_num_remote_guids
+* [in out] The number of remote guids used for routing to
* the port.
*
-* remote_node_guids
-* [in out] The array of remote node guids already used to route
-* the other lids of the same target port (if LMC > 0).
-*
-* p_num_used_nodes
-* [in out] The number of remote nodes used for routing to
-* the port.
+* p_remote_guid_count_used
+* [in out] The specific osm_switch_guid_count_t used
+* in switch recommendations.
*
* RETURN VALUE
* Returns the recommended port on which to route this LID.
diff --git a/opensm/opensm/osm_dump.c b/opensm/opensm/osm_dump.c
index 9f638b3..b74dc67 100644
--- a/opensm/opensm/osm_dump.c
+++ b/opensm/opensm/osm_dump.c
@@ -236,7 +236,7 @@ static void dump_ucast_routes(cl_map_item_t * p_map_item, void *cxt)
best_port = osm_switch_recommend_path(p_sw, p_port,
lid_ho, TRUE, dor,
NULL, NULL,
- NULL, NULL);
+ NULL);
fprintf(file, "No %u hop path possible via port %u!",
best_hops, best_port);
}
diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
index d74cb6c..d98341a 100644
--- a/opensm/opensm/osm_switch.c
+++ b/opensm/opensm/osm_switch.c
@@ -219,16 +219,104 @@ osm_switch_get_fwd_tbl_block(IN const osm_switch_t * const p_sw,
/**********************************************************************
**********************************************************************/
+static osm_switch_guid_count_t *
+osm_switch_find_guid_common(IN const osm_switch_t * const p_sw,
+ IN osm_switch_guid_count_t * remote_guids,
+ IN uint16_t * p_num_remote_guids,
+ IN uint8_t port_num,
+ IN int find_sys_guid,
+ IN int find_node_guid)
+{
+ osm_switch_guid_count_t *p_remote_guid = NULL;
+ osm_physp_t *p_physp;
+ osm_physp_t *p_rem_physp;
+ osm_node_t *p_rem_node;
+ uint64_t sys_guid;
+ uint64_t node_guid;
+ int i;
+
+ CL_ASSERT(p_sw);
+ CL_ASSERT(remote_guids);
+ CL_ASSERT(p_num_remote_guids);
+
+ p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
+ p_rem_physp = osm_physp_get_remote(p_physp);
+ p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
+ sys_guid = p_rem_node->node_info.sys_guid;
+ node_guid = p_rem_node->node_info.node_guid;
+
+ for (i = 0; i < *p_num_remote_guids; i++) {
+ if ((!find_sys_guid
+ || remote_guids[i].sys_guid == sys_guid)
+ && (!find_node_guid
+ || remote_guids[i].node_guid == node_guid)) {
+ p_remote_guid = &remote_guids[i];
+ break;
+ }
+ if (remote_guids[i].sys_guid == sys_guid
+ && remote_guids[i].node_guid == node_guid) {
+ p_remote_guid = &remote_guids[i];
+ break;
+ }
+ }
+
+ return p_remote_guid;
+}
+
+static osm_switch_guid_count_t *
+osm_switch_find_sys_guid_count(IN const osm_switch_t * const p_sw,
+ IN osm_switch_guid_count_t * remote_guids,
+ IN uint16_t * p_num_remote_guids,
+ IN uint8_t port_num)
+{
+ return osm_switch_find_guid_common(p_sw,
+ remote_guids,
+ p_num_remote_guids,
+ port_num,
+ 1,
+ 0);
+}
+
+static osm_switch_guid_count_t *
+osm_switch_find_node_guid_count(IN const osm_switch_t * const p_sw,
+ IN osm_switch_guid_count_t * remote_guids,
+ IN uint16_t * p_num_remote_guids,
+ IN uint8_t port_num)
+{
+ return osm_switch_find_guid_common(p_sw,
+ remote_guids,
+ p_num_remote_guids,
+ port_num,
+ 0,
+ 1);
+}
+
+static osm_switch_guid_count_t *
+osm_switch_find_guid_count(IN const osm_switch_t * const p_sw,
+ IN osm_switch_guid_count_t * remote_guids,
+ IN uint16_t * p_num_remote_guids,
+ IN uint8_t port_num)
+{
+ return osm_switch_find_guid_common(p_sw,
+ remote_guids,
+ p_num_remote_guids,
+ port_num,
+ 1,
+ 1);
+}
+
+
+/**********************************************************************
+ **********************************************************************/
uint8_t
osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
IN osm_port_t * p_port,
IN const uint16_t lid_ho,
IN const boolean_t ignore_existing,
IN const boolean_t dor,
- IN OUT uint64_t * remote_sys_guids,
- IN OUT uint16_t * p_num_used_sys,
- IN OUT uint64_t * remote_node_guids,
- IN OUT uint16_t * p_num_used_nodes)
+ IN OUT osm_switch_guid_count_t * remote_guids,
+ IN OUT uint16_t * p_num_remote_guids,
+ IN OUT osm_switch_guid_count_t ** p_remote_guid_count_used)
{
/*
We support an enhanced LMC aware routing mode:
@@ -237,14 +325,12 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
and try and avoid routing again through the same
system / node.
- If the procedure is provided with the tracking arrays
- and counters we can conduct this algorithm.
+ If this procedure is provided with the tracking array
+ and counter we can conduct this algorithm.
*/
- boolean_t routing_for_lmc = remote_sys_guids && remote_node_guids &&
- p_num_used_sys && p_num_used_nodes;
- boolean_t sys_used, node_used;
+ boolean_t routing_for_lmc = remote_guids && p_num_remote_guids
+ && p_remote_guid_count_used;
uint16_t base_lid;
- uint16_t i;
uint8_t hops;
uint8_t least_hops;
uint8_t port_num;
@@ -256,6 +342,7 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
*/
uint32_t least_paths_other_sys = 0xFFFFFFFF;
uint32_t least_paths_other_nodes = 0xFFFFFFFF;
+ uint32_t least_forwarded_to = 0xFFFFFFFF;
uint32_t check_count;
uint8_t best_port = 0;
/*
@@ -269,6 +356,7 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
osm_physp_t *p_rem_physp;
osm_node_t *p_rem_node;
osm_node_t *p_rem_node_first = NULL;
+ osm_switch_guid_count_t *p_remote_guid = NULL;
CL_ASSERT(lid_ho > 0);
@@ -381,50 +469,38 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
it.
*/
if (routing_for_lmc) {
-#if 0
- printf("LID:0x%X SYS:%d NODE:%d\n", lid_ho,
- *p_num_used_sys, *p_num_used_nodes);
-#endif
-
- /* Get the Remote Node */
- p_rem_physp = osm_physp_get_remote(p_physp);
- p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
-
/* Is the sys guid already used ? */
- sys_used = FALSE;
- for (i = 0; !sys_used && (i < *p_num_used_sys); i++)
- if (!memcmp(&p_rem_node->node_info.sys_guid,
- &remote_sys_guids[i],
- sizeof(uint64_t)))
- sys_used = TRUE;
+ p_remote_guid = osm_switch_find_sys_guid_count(p_sw,
+ remote_guids,
+ p_num_remote_guids,
+ port_num);
/* If not update the least hops for this case */
- if (!sys_used) {
+ if (!p_remote_guid) {
if (check_count < least_paths_other_sys) {
least_paths_other_sys = check_count;
best_port_other_sys = port_num;
+ least_forwarded_to = 0;
}
} else { /* same sys found - try node */
/* Else is the node guid already used ? */
- node_used = FALSE;
- for (i = 0;
- !node_used && (i < *p_num_used_nodes); i++)
- if (!memcmp
- (&p_rem_node->node_info.node_guid,
- &remote_node_guids[i],
- sizeof(uint64_t)))
- node_used = TRUE;
+ p_remote_guid = osm_switch_find_node_guid_count(p_sw,
+ remote_guids,
+ p_num_remote_guids,
+ port_num);
/* If not update the least hops for this case */
- if (!node_used
+ if (!p_remote_guid
&& check_count < least_paths_other_nodes) {
least_paths_other_nodes = check_count;
best_port_other_node = port_num;
+ least_forwarded_to = 0;
}
+ /* else prior sys and node guid already used */
} /* same sys found */
}
-
+
/* routing for LMC mode */
/*
the count is min but also lower then the max subscribed
@@ -447,6 +523,17 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
port_found = TRUE;
best_port = port_num;
least_paths = check_count;
+ if (routing_for_lmc
+ && p_remote_guid
+ && p_remote_guid->forwarded_to < least_forwarded_to)
+ least_forwarded_to = p_remote_guid->forwarded_to;
+ }
+ else if (routing_for_lmc
+ && p_remote_guid
+ && check_count == least_paths
+ && p_remote_guid->forwarded_to < least_forwarded_to) {
+ least_forwarded_to = p_remote_guid->forwarded_to;
+ best_port = port_num;
}
}
@@ -465,17 +552,28 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
best_port = best_port_other_node;
/* track the remote node and system of the port used. */
- p_physp = osm_node_get_physp_ptr(p_sw->p_node, best_port);
- p_rem_physp = osm_physp_get_remote(p_physp);
- p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
- memcpy(&remote_node_guids[*p_num_used_nodes],
- &(p_rem_node->node_info.node_guid), sizeof(uint64_t));
- (*p_num_used_nodes)++;
- memcpy(&remote_sys_guids[*p_num_used_sys],
- &(p_rem_node->node_info.sys_guid), sizeof(uint64_t));
- (*p_num_used_sys)++;
+ p_remote_guid = osm_switch_find_guid_count(p_sw,
+ remote_guids,
+ p_num_remote_guids,
+ best_port);
+
+ if (!p_remote_guid) {
+ /* track the remote node and system of the port used. */
+ p_physp = osm_node_get_physp_ptr(p_sw->p_node, best_port);
+ p_rem_physp = osm_physp_get_remote(p_physp);
+ p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
+ memcpy(&(remote_guids[*p_num_remote_guids].sys_guid),
+ &(p_rem_node->node_info.sys_guid),
+ sizeof(uint64_t));
+ memcpy(&(remote_guids[*p_num_remote_guids].node_guid),
+ &(p_rem_node->node_info.node_guid),
+ sizeof(uint64_t));
+ remote_guids[*p_num_remote_guids].forwarded_to = 0;
+ (*p_num_remote_guids)++;
+ }
+ *p_remote_guid_count_used = p_remote_guid;
}
-
+
return (best_port);
}
diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
index 1aa5ea9..d7fc4d3 100644
--- a/opensm/opensm/osm_ucast_mgr.c
+++ b/opensm/opensm/osm_ucast_mgr.c
@@ -209,31 +209,21 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
in providing better routing in LMC > 0 situations
*/
uint16_t lids_per_port = 1 << p_mgr->p_subn->opt.lmc;
- uint64_t *remote_sys_guids = NULL;
- uint64_t *remote_node_guids = NULL;
- uint16_t num_used_sys = 0;
- uint16_t num_used_nodes = 0;
+ osm_switch_guid_count_t *remote_guids = NULL;
+ uint16_t num_used_guids = 0;
+ osm_switch_guid_count_t *p_remote_guid_used = NULL;
OSM_LOG_ENTER(p_mgr->p_log);
if (lids_per_port > 1) {
- remote_sys_guids = malloc(sizeof(uint64_t) * lids_per_port);
- if (remote_sys_guids == NULL) {
- OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A09: "
+ remote_guids = malloc(sizeof(osm_switch_guid_count_t) * lids_per_port);
+ if (remote_guids == NULL) {
+ osm_log(p_mgr->p_log, OSM_LOG_ERROR,
+ "__osm_ucast_mgr_process_port: ERR 3A0B: "
"Cannot allocate array. Insufficient memory\n");
goto Exit;
}
-
- memset(remote_sys_guids, 0, sizeof(uint64_t) * lids_per_port);
-
- remote_node_guids = malloc(sizeof(uint64_t) * lids_per_port);
- if (remote_node_guids == NULL) {
- OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A0A: "
- "Cannot allocate array. Insufficient memory\n");
- goto Exit;
- }
-
- memset(remote_node_guids, 0, sizeof(uint64_t) * lids_per_port);
+ memset(remote_guids, 0, sizeof(osm_switch_guid_count_t) * lids_per_port);
}
osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho);
@@ -272,22 +262,22 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
*/
for (lid_ho = min_lid_ho; lid_ho <= max_lid_ho; lid_ho++) {
/* Use the enhanced algorithm only for LMC > 0 */
- if (lids_per_port > 1)
+ if (lids_per_port > 1) {
+ p_remote_guid_used = NULL;
port = osm_switch_recommend_path(p_sw, p_port, lid_ho,
p_mgr->p_subn->
ignore_existing_lfts,
p_mgr->is_dor,
- remote_sys_guids,
- &num_used_sys,
- remote_node_guids,
- &num_used_nodes);
+ remote_guids,
+ &num_used_guids,
+ &p_remote_guid_used);
+ }
else
port = osm_switch_recommend_path(p_sw, p_port, lid_ho,
p_mgr->p_subn->
ignore_existing_lfts,
p_mgr->is_dor,
- NULL, NULL, NULL,
- NULL);
+ NULL, NULL, NULL);
/*
There might be no path to the target
@@ -341,15 +331,16 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
Write it to the forwarding tables.
*/
p_mgr->lft_buf[lid_ho] = port;
- if (!is_ignored_by_port_prof)
+ if (!is_ignored_by_port_prof) {
osm_switch_count_path(p_sw, port);
+ if (p_remote_guid_used)
+ p_remote_guid_used->forwarded_to++;
+ }
}
Exit:
- if (remote_sys_guids)
- free(remote_sys_guids);
- if (remote_node_guids)
- free(remote_node_guids);
+ if (remote_guids)
+ free(remote_guids);
OSM_LOG_EXIT(p_mgr->p_log);
}
--
1.5.1
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general