From: Pradosh Mohapatra <[email protected]>
A fat tree topology running IBGP gets into two issues with anycast address
routing. Consider the following topology:
R9 R10
x x
R3 R4 R7 R8
x x
R1 R2 R5 R6
| | | |
10/8 10/8 10/8 S
Let's remind ourselves of BGP decision process steps:
1. Highest Local Preference
2. Shortest AS Path Length
3. Lowest Origin Type
4. Lowest MED (Multi-Exit Discriminator)
5. Prefer External to Internal
6. Closest Egress (Lowest IGP Distance)
7. Tie Breaking (Lowest-Router-ID)
8. Tie Breaking (Lowest-cluster-list length)
9. Tie Breaking (Lowest-neighbor-address)
Without any policies, steps 1-6 will almost always evaluate identically for
all paths received on any router in the above topology. Let's assume that
the router-ids follow the following inequality: R1 < R2 < R5 < R6. Owing to
the 7th step above, all routers will now choose R1's path as the best. This
is undesirable. As an example, traffic from S to 10/8 will follow the path
S -> R6 -> R7 -> R9 -> R4 -> R2 -> 10/8 instead of S -> R6 -> R7 -> R5 -> 10/8.
Furthermore, once R7 (& R8) chooses R1's path as the best, it would withdraw
its path learned through (R5, R6) from (R9, R10). This leads to inefficient
load balancing - e.g. R9 can't do ECMP across all available egresses -
(R1, R2, R5).
The patch addresses these issues by noting that that cluster list is always
carried along with the routes and its length is a good indicator of IBGP
hops. It thus makes sense to compare that as an extension to metric after
step 6. That automatically ensures correct multipath computation.
Unfortunately a partial deployment of this in a generic topology (note:
fat-tree/clos topologies work fine) may lead to potential loops. It needs
to be looked into.
Signed-off-by: Pradosh Mohapatra <[email protected]>
Reviewed-by: Dinesh G Dutt <[email protected]>
---
bgpd/bgp_attr.h | 4 ++
bgpd/bgp_mpath.c | 4 +-
bgpd/bgp_mpath.h | 3 +-
bgpd/bgp_route.c | 35 ++++++++----
bgpd/bgp_vty.c | 140 ++++++++++++++++++++++++------------------------
bgpd/bgpd.h | 2 +
tests/bgp_mpath_test.c | 4 +-
7 files changed, 110 insertions(+), 82 deletions(-)
diff --git a/bgpd/bgp_attr.h b/bgpd/bgp_attr.h
index b59fa8e..ac346e5 100644
--- a/bgpd/bgp_attr.h
+++ b/bgpd/bgp_attr.h
@@ -132,6 +132,10 @@ struct transit
#define ATTR_FLAG_BIT(X) (1 << ((X) - 1))
+#define BGP_CLUSTER_LIST_LENGTH(attr) \
+ (((attr)->flag & ATTR_FLAG_BIT(BGP_ATTR_CLUSTER_LIST)) ? \
+ (attr)->extra->cluster->length : 0)
+
typedef enum {
BGP_ATTR_PARSE_PROCEED = 0,
BGP_ATTR_PARSE_ERROR = -1,
diff --git a/bgpd/bgp_mpath.c b/bgpd/bgp_mpath.c
index 7999d16..5590b05 100644
--- a/bgpd/bgp_mpath.c
+++ b/bgpd/bgp_mpath.c
@@ -46,7 +46,7 @@
*/
int
bgp_maximum_paths_set (struct bgp *bgp, afi_t afi, safi_t safi,
- int peertype, u_int16_t maxpaths)
+ int peertype, u_int16_t maxpaths, u_int16_t options)
{
if (!bgp || (afi >= AFI_MAX) || (safi >= SAFI_MAX))
return -1;
@@ -55,6 +55,7 @@ bgp_maximum_paths_set (struct bgp *bgp, afi_t afi, safi_t
safi,
{
case BGP_PEER_IBGP:
bgp->maxpaths[afi][safi].maxpaths_ibgp = maxpaths;
+ bgp->maxpaths[afi][safi].ibgp_flags |= options;
break;
case BGP_PEER_EBGP:
bgp->maxpaths[afi][safi].maxpaths_ebgp = maxpaths;
@@ -82,6 +83,7 @@ bgp_maximum_paths_unset (struct bgp *bgp, afi_t afi, safi_t
safi,
{
case BGP_PEER_IBGP:
bgp->maxpaths[afi][safi].maxpaths_ibgp = BGP_DEFAULT_MAXPATHS;
+ bgp->maxpaths[afi][safi].ibgp_flags = 0;
break;
case BGP_PEER_EBGP:
bgp->maxpaths[afi][safi].maxpaths_ebgp = BGP_DEFAULT_MAXPATHS;
diff --git a/bgpd/bgp_mpath.h b/bgpd/bgp_mpath.h
index 37b9ac8..0645e6c 100644
--- a/bgpd/bgp_mpath.h
+++ b/bgpd/bgp_mpath.h
@@ -49,7 +49,8 @@ struct bgp_info_mpath
};
/* Functions to support maximum-paths configuration */
-extern int bgp_maximum_paths_set (struct bgp *, afi_t, safi_t, int, u_int16_t);
+extern int bgp_maximum_paths_set (struct bgp *, afi_t, safi_t, int, u_int16_t,
+ u_int16_t);
extern int bgp_maximum_paths_unset (struct bgp *, afi_t, safi_t, int);
/* Functions used by bgp_best_selection to record current
diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c
index 316fa5a..24c17d6 100644
--- a/bgpd/bgp_route.c
+++ b/bgpd/bgp_route.c
@@ -324,7 +324,7 @@ bgp_med_value (struct attr *attr, struct bgp *bgp)
/* Compare two bgp route entity. br is preferable then return 1. */
static int
bgp_info_cmp (struct bgp *bgp, struct bgp_info *new, struct bgp_info *exist,
- int *paths_eq)
+ int *paths_eq, struct bgp_maxpaths_cfg *mpath_cfg)
{
struct attr *newattr, *existattr;
struct attr_extra *newattre, *existattre;
@@ -477,6 +477,26 @@ bgp_info_cmp (struct bgp *bgp, struct bgp_info *new,
struct bgp_info *exist,
if (newm > existm)
ret = 0;
+ /* 8.1. Same IGP metric. Compare the cluster list length as
+ representative of IGP hops metric. Rewrite the metric value
+ pair (newm, existm) with the cluster list length. Prefer the
+ path with smaller cluster list length. */
+ if (newm == existm)
+ {
+ if (peer_sort (new->peer) == BGP_PEER_IBGP
+ && peer_sort (exist->peer) == BGP_PEER_IBGP
+ && CHECK_FLAG (mpath_cfg->ibgp_flags,
+ BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN))
+ {
+ newm = BGP_CLUSTER_LIST_LENGTH(new->attr);
+ existm = BGP_CLUSTER_LIST_LENGTH(exist->attr);
+ if (newm < existm)
+ ret = 1;
+ if (newm > existm)
+ ret = 0;
+ }
+ }
+
/* 9. Maximum path check. */
if (newm == existm)
{
@@ -544,12 +564,8 @@ bgp_info_cmp (struct bgp *bgp, struct bgp_info *new,
struct bgp_info *exist,
return 0;
/* 12. Cluster length comparision. */
- new_cluster = exist_cluster = 0;
-
- if (newattr->flag & ATTR_FLAG_BIT(BGP_ATTR_CLUSTER_LIST))
- new_cluster = newattre->cluster->length;
- if (existattr->flag & ATTR_FLAG_BIT(BGP_ATTR_CLUSTER_LIST))
- exist_cluster = existattre->cluster->length;
+ new_cluster = BGP_CLUSTER_LIST_LENGTH(new->attr);
+ exist_cluster = BGP_CLUSTER_LIST_LENGTH(exist->attr);
if (new_cluster < exist_cluster)
return 1;
@@ -1377,7 +1393,8 @@ bgp_best_selection (struct bgp *bgp, struct bgp_node *rn,
{
if (CHECK_FLAG (ri2->flags, BGP_INFO_SELECTED))
old_select = ri2;
- if (bgp_info_cmp (bgp, ri2, new_select, &paths_eq))
+ if (bgp_info_cmp (bgp, ri2, new_select, &paths_eq,
+ mpath_cfg))
{
bgp_info_unset_flag (rn, new_select,
BGP_INFO_DMED_SELECTED);
new_select = ri2;
@@ -1436,7 +1453,7 @@ bgp_best_selection (struct bgp *bgp, struct bgp_node *rn,
bgp_info_unset_flag (rn, ri, BGP_INFO_DMED_CHECK);
bgp_info_unset_flag (rn, ri, BGP_INFO_DMED_SELECTED);
- if (bgp_info_cmp (bgp, ri, new_select, &paths_eq))
+ if (bgp_info_cmp (bgp, ri, new_select, &paths_eq, mpath_cfg))
{
if (do_mpath && bgp_flag_check (bgp, BGP_FLAG_DETERMINISTIC_MED))
bgp_mp_dmed_deselect (new_select);
diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c
index 3f2c49a..0bd2ebe 100644
--- a/bgpd/bgp_vty.c
+++ b/bgpd/bgp_vty.c
@@ -651,28 +651,41 @@ DEFUN (no_bgp_confederation_peers,
return CMD_SUCCESS;
}
-/* Maximum-paths configuration */
-DEFUN (bgp_maxpaths,
- bgp_maxpaths_cmd,
- "maximum-paths <1-255>",
- "Forward packets over multiple paths\n"
- "Number of paths\n")
+/**
+ * Central routine for maximum-paths configuration.
+ * @peer_type: BGP_PEER_EBGP or BGP_PEER_IBGP
+ * @set: 1 for setting values, 0 for removing the max-paths config.
+ */
+static int
+bgp_maxpaths_config_vty (struct vty *vty, int peer_type, const char *mpaths,
+ u_int16_t options, int set)
{
struct bgp *bgp;
- u_int16_t maxpaths;
+ u_int16_t maxpaths = 0;
int ret;
+ afi_t afi;
+ safi_t safi;
bgp = vty->index;
+ afi = bgp_node_afi (vty);
+ safi = bgp_node_safi (vty);
- VTY_GET_INTEGER_RANGE ("maximum-paths", maxpaths, argv[0], 1, 255);
+ if (set)
+ {
+ VTY_GET_INTEGER_RANGE ("maximum-paths", maxpaths, mpaths, 1, 255);
+ ret = bgp_maximum_paths_set (bgp, afi, safi, peer_type, maxpaths,
+ options);
+ }
+ else
+ ret = bgp_maximum_paths_unset (bgp, afi, safi, peer_type);
- ret = bgp_maximum_paths_set (bgp, bgp_node_afi (vty), bgp_node_safi(vty),
- BGP_PEER_EBGP, maxpaths);
if (ret < 0)
{
vty_out (vty,
- "%% Failed to set maximum-paths %u for afi %u, safi %u%s",
- maxpaths, bgp_node_afi (vty), bgp_node_safi(vty), VTY_NEWLINE);
+ "%% Failed to %sset maximum-paths %s %u for afi %u, safi %u%s",
+ (set == 1) ? "" : "un",
+ (peer_type == BGP_PEER_EBGP) ? "ebgp" : "ibgp",
+ maxpaths, afi, safi, VTY_NEWLINE);
return CMD_WARNING;
}
@@ -684,6 +697,17 @@ DEFUN (bgp_maxpaths,
return CMD_SUCCESS;
}
+
+/* Maximum-paths configuration */
+DEFUN (bgp_maxpaths,
+ bgp_maxpaths_cmd,
+ "maximum-paths <1-255>",
+ "Forward packets over multiple paths\n"
+ "Number of paths\n")
+{
+ return bgp_maxpaths_config_vty(vty, BGP_PEER_EBGP, argv[0], 0, 1);
+}
+
DEFUN (bgp_maxpaths_ibgp,
bgp_maxpaths_ibgp_cmd,
"maximum-paths ibgp <1-255>",
@@ -691,30 +715,19 @@ DEFUN (bgp_maxpaths_ibgp,
"iBGP-multipath\n"
"Number of paths\n")
{
- struct bgp *bgp;
- u_int16_t maxpaths;
- int ret;
-
- bgp = vty->index;
-
- VTY_GET_INTEGER_RANGE ("maximum-paths", maxpaths, argv[0], 1, 255);
-
- ret = bgp_maximum_paths_set (bgp, bgp_node_afi (vty), bgp_node_safi(vty),
- BGP_PEER_IBGP, maxpaths);
- if (ret < 0)
- {
- vty_out (vty,
- "%% Failed to set maximum-paths ibgp %u for afi %u, safi %u%s",
- maxpaths, bgp_node_afi (vty), bgp_node_safi(vty), VTY_NEWLINE);
- return CMD_WARNING;
- }
-
- if ((MULTIPATH_NUM != 0) && (maxpaths > MULTIPATH_NUM))
- vty_out (vty,
- "%% Warning: maximum-paths set to %d is greater than %d that zebra
is compiled to support%s",
- maxpaths, MULTIPATH_NUM, VTY_NEWLINE);
+ return bgp_maxpaths_config_vty(vty, BGP_PEER_IBGP, argv[0], 0, 1);
+}
- return CMD_SUCCESS;
+DEFUN (bgp_maxpaths_ibgp_cluster,
+ bgp_maxpaths_ibgp_cluster_cmd,
+ "maximum-paths ibgp <1-255> equal-cluster-length",
+ "Forward packets over multiple paths\n"
+ "iBGP-multipath\n"
+ "Number of paths\n"
+ "Match the cluster length\n")
+{
+ return bgp_maxpaths_config_vty(vty, BGP_PEER_IBGP, argv[0],
+ BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN, 1);
}
DEFUN (no_bgp_maxpaths,
@@ -724,22 +737,7 @@ DEFUN (no_bgp_maxpaths,
"Forward packets over multiple paths\n"
"Number of paths\n")
{
- struct bgp *bgp;
- int ret;
-
- bgp = vty->index;
-
- ret = bgp_maximum_paths_unset (bgp, bgp_node_afi (vty), bgp_node_safi(vty),
- BGP_PEER_EBGP);
- if (ret < 0)
- {
- vty_out (vty,
- "%% Failed to unset maximum-paths for afi %u, safi %u%s",
- bgp_node_afi (vty), bgp_node_safi(vty), VTY_NEWLINE);
- return CMD_WARNING;
- }
-
- return CMD_SUCCESS;
+ return bgp_maxpaths_config_vty(vty, BGP_PEER_EBGP, NULL, 0, 0);
}
ALIAS (no_bgp_maxpaths,
@@ -757,22 +755,7 @@ DEFUN (no_bgp_maxpaths_ibgp,
"iBGP-multipath\n"
"Number of paths\n")
{
- struct bgp *bgp;
- int ret;
-
- bgp = vty->index;
-
- ret = bgp_maximum_paths_unset (bgp, bgp_node_afi (vty), bgp_node_safi(vty),
- BGP_PEER_IBGP);
- if (ret < 0)
- {
- vty_out (vty,
- "%% Failed to unset maximum-paths ibgp for afi %u, safi %u%s",
- bgp_node_afi (vty), bgp_node_safi(vty), VTY_NEWLINE);
- return CMD_WARNING;
- }
-
- return CMD_SUCCESS;
+ return bgp_maxpaths_config_vty(vty, BGP_PEER_IBGP, NULL, 0, 0);
}
ALIAS (no_bgp_maxpaths_ibgp,
@@ -783,6 +766,15 @@ ALIAS (no_bgp_maxpaths_ibgp,
"iBGP-multipath\n"
"Number of paths\n")
+ALIAS (no_bgp_maxpaths_ibgp,
+ no_bgp_maxpaths_ibgp_cluster_cmd,
+ "no maximum-paths ibgp <1-255> equal-cluster-length",
+ NO_STR
+ "Forward packets over multiple paths\n"
+ "iBGP-multipath\n"
+ "Number of paths\n"
+ "Match the cluster length\n")
+
int
bgp_config_write_maxpaths (struct vty *vty, struct bgp *bgp, afi_t afi,
safi_t safi, int *write)
@@ -797,8 +789,12 @@ bgp_config_write_maxpaths (struct vty *vty, struct bgp
*bgp, afi_t afi,
if (bgp->maxpaths[afi][safi].maxpaths_ibgp != BGP_DEFAULT_MAXPATHS)
{
bgp_config_write_family_header (vty, afi, safi, write);
- vty_out (vty, " maximum-paths ibgp %d%s",
- bgp->maxpaths[afi][safi].maxpaths_ibgp, VTY_NEWLINE);
+ vty_out (vty, " maximum-paths ibgp %d",
+ bgp->maxpaths[afi][safi].maxpaths_ibgp);
+ if (CHECK_FLAG (bgp->maxpaths[afi][safi].ibgp_flags,
+ BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN))
+ vty_out (vty, " equal-cluster-length");
+ vty_out (vty, "%s", VTY_NEWLINE);
}
return 0;
@@ -9183,14 +9179,20 @@ bgp_vty_init (void)
install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_cmd);
install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_arg_cmd);
install_element (BGP_NODE, &bgp_maxpaths_ibgp_cmd);
+ install_element(BGP_NODE, &bgp_maxpaths_ibgp_cluster_cmd);
install_element (BGP_NODE, &no_bgp_maxpaths_ibgp_cmd);
install_element (BGP_NODE, &no_bgp_maxpaths_ibgp_arg_cmd);
+ install_element (BGP_NODE, &no_bgp_maxpaths_ibgp_cluster_cmd);
install_element (BGP_IPV4_NODE, &bgp_maxpaths_ibgp_cmd);
+ install_element(BGP_IPV4_NODE, &bgp_maxpaths_ibgp_cluster_cmd);
install_element (BGP_IPV4_NODE, &no_bgp_maxpaths_ibgp_cmd);
+ install_element (BGP_IPV4_NODE, &no_bgp_maxpaths_ibgp_cluster_cmd);
install_element (BGP_IPV4_NODE, &no_bgp_maxpaths_ibgp_arg_cmd);
install_element (BGP_IPV6_NODE, &bgp_maxpaths_ibgp_cmd);
+ install_element(BGP_IPV6_NODE, &bgp_maxpaths_ibgp_cluster_cmd);
install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_ibgp_cmd);
install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_ibgp_arg_cmd);
+ install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_ibgp_cluster_cmd);
/* "timers bgp" commands. */
install_element (BGP_NODE, &bgp_timers_cmd);
diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h
index 7ae0acb..f5e6936 100644
--- a/bgpd/bgpd.h
+++ b/bgpd/bgpd.h
@@ -171,6 +171,8 @@ struct bgp
struct bgp_maxpaths_cfg {
u_int16_t maxpaths_ebgp;
u_int16_t maxpaths_ibgp;
+ u_int16_t ibgp_flags;
+#define BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN (1 << 0)
} maxpaths[AFI_MAX][SAFI_MAX];
};
diff --git a/tests/bgp_mpath_test.c b/tests/bgp_mpath_test.c
index 3594753..812c1f6 100644
--- a/tests/bgp_mpath_test.c
+++ b/tests/bgp_mpath_test.c
@@ -157,9 +157,9 @@ run_bgp_cfg_maximum_paths (testcase_t *t)
for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
{
/* test bgp_maximum_paths_set */
- api_result = bgp_maximum_paths_set (bgp, afi, safi, BGP_PEER_EBGP, 10);
+ api_result = bgp_maximum_paths_set (bgp, afi, safi, BGP_PEER_EBGP, 10,
0);
EXPECT_TRUE (api_result == 0, test_result);
- api_result = bgp_maximum_paths_set (bgp, afi, safi, BGP_PEER_IBGP, 10);
+ api_result = bgp_maximum_paths_set (bgp, afi, safi, BGP_PEER_IBGP, 10,
0);
EXPECT_TRUE (api_result == 0, test_result);
EXPECT_TRUE (bgp->maxpaths[afi][safi].maxpaths_ebgp == 10,
test_result);
EXPECT_TRUE (bgp->maxpaths[afi][safi].maxpaths_ibgp == 10,
test_result);
--
1.7.10.4
_______________________________________________
Quagga-dev mailing list
[email protected]
https://lists.quagga.net/mailman/listinfo/quagga-dev