From: Pradosh Mohapatra <[email protected]>

A fat tree topology running IBGP gets into two issues with anycast address
routing. Consider the following topology:

        R9   R10
          x x
  R3   R4     R7   R8
     x           x
  R1   R2     R5   R6
  |    |      |    |
 10/8 10/8  10/8   S

Let's remind ourselves of BGP decision process steps:

1. Highest Local Preference
2. Shortest AS Path Length
3. Lowest Origin Type
4. Lowest MED (Multi-Exit Discriminator)
5. Prefer External to Internal
6. Closest Egress (Lowest IGP Distance)
7. Tie Breaking (Lowest-Router-ID)
8. Tie Breaking (Lowest-cluster-list length)
9. Tie Breaking (Lowest-neighbor-address)

Without any policies, steps 1-6 will almost always evaluate identically for
all paths received on any router in the above topology. Let's assume that
the router-ids follow the following inequality: R1 < R2 < R5 < R6. Owing to
the 7th step above, all routers will now choose R1's path as the best. This
is undesirable. As an example, traffic from S to 10/8 will follow the path
S -> R6 -> R7 -> R9 -> R4 -> R2 -> 10/8 instead of S -> R6 -> R7 -> R5 -> 10/8.
Furthermore, once R7 (& R8) chooses R1's path as the best, it would withdraw
its path learned through (R5, R6) from (R9, R10). This leads to inefficient
load balancing - e.g. R9 can't do ECMP across all available egresses -
(R1, R2, R5).

The patch addresses these issues by noting that that cluster list is always
carried along with the routes and its length is a good indicator of IBGP
hops. It thus makes sense to compare that as an extension to metric after
step 6. That automatically ensures correct multipath computation.

Unfortunately a partial deployment of this in a generic topology (note:
fat-tree/clos topologies work fine) may lead to potential loops. It needs
to be looked into.

Signed-off-by: Pradosh Mohapatra <[email protected]>
Reviewed-by: Dinesh G Dutt <[email protected]>
---
 bgpd/bgp_attr.h        |    4 ++
 bgpd/bgp_mpath.c       |    4 +-
 bgpd/bgp_mpath.h       |    3 +-
 bgpd/bgp_route.c       |   35 ++++++++----
 bgpd/bgp_vty.c         |  140 ++++++++++++++++++++++++------------------------
 bgpd/bgpd.h            |    2 +
 tests/bgp_mpath_test.c |    4 +-
 7 files changed, 110 insertions(+), 82 deletions(-)

diff --git a/bgpd/bgp_attr.h b/bgpd/bgp_attr.h
index b59fa8e..ac346e5 100644
--- a/bgpd/bgp_attr.h
+++ b/bgpd/bgp_attr.h
@@ -132,6 +132,10 @@ struct transit
 
 #define ATTR_FLAG_BIT(X)  (1 << ((X) - 1))
 
+#define BGP_CLUSTER_LIST_LENGTH(attr)                          \
+  (((attr)->flag & ATTR_FLAG_BIT(BGP_ATTR_CLUSTER_LIST)) ?     \
+   (attr)->extra->cluster->length : 0)
+
 typedef enum {
  BGP_ATTR_PARSE_PROCEED = 0,
  BGP_ATTR_PARSE_ERROR = -1,
diff --git a/bgpd/bgp_mpath.c b/bgpd/bgp_mpath.c
index 7999d16..5590b05 100644
--- a/bgpd/bgp_mpath.c
+++ b/bgpd/bgp_mpath.c
@@ -46,7 +46,7 @@
  */
 int
 bgp_maximum_paths_set (struct bgp *bgp, afi_t afi, safi_t safi,
-                       int peertype, u_int16_t maxpaths)
+                       int peertype, u_int16_t maxpaths, u_int16_t options)
 {
   if (!bgp || (afi >= AFI_MAX) || (safi >= SAFI_MAX))
     return -1;
@@ -55,6 +55,7 @@ bgp_maximum_paths_set (struct bgp *bgp, afi_t afi, safi_t 
safi,
     {
     case BGP_PEER_IBGP:
       bgp->maxpaths[afi][safi].maxpaths_ibgp = maxpaths;
+      bgp->maxpaths[afi][safi].ibgp_flags |= options;
       break;
     case BGP_PEER_EBGP:
       bgp->maxpaths[afi][safi].maxpaths_ebgp = maxpaths;
@@ -82,6 +83,7 @@ bgp_maximum_paths_unset (struct bgp *bgp, afi_t afi, safi_t 
safi,
     {
     case BGP_PEER_IBGP:
       bgp->maxpaths[afi][safi].maxpaths_ibgp = BGP_DEFAULT_MAXPATHS;
+      bgp->maxpaths[afi][safi].ibgp_flags = 0;
       break;
     case BGP_PEER_EBGP:
       bgp->maxpaths[afi][safi].maxpaths_ebgp = BGP_DEFAULT_MAXPATHS;
diff --git a/bgpd/bgp_mpath.h b/bgpd/bgp_mpath.h
index 37b9ac8..0645e6c 100644
--- a/bgpd/bgp_mpath.h
+++ b/bgpd/bgp_mpath.h
@@ -49,7 +49,8 @@ struct bgp_info_mpath
 };
 
 /* Functions to support maximum-paths configuration */
-extern int bgp_maximum_paths_set (struct bgp *, afi_t, safi_t, int, u_int16_t);
+extern int bgp_maximum_paths_set (struct bgp *, afi_t, safi_t, int, u_int16_t,
+                                 u_int16_t);
 extern int bgp_maximum_paths_unset (struct bgp *, afi_t, safi_t, int);
 
 /* Functions used by bgp_best_selection to record current
diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c
index 316fa5a..24c17d6 100644
--- a/bgpd/bgp_route.c
+++ b/bgpd/bgp_route.c
@@ -324,7 +324,7 @@ bgp_med_value (struct attr *attr, struct bgp *bgp)
 /* Compare two bgp route entity.  br is preferable then return 1. */
 static int
 bgp_info_cmp (struct bgp *bgp, struct bgp_info *new, struct bgp_info *exist,
-             int *paths_eq)
+             int *paths_eq, struct bgp_maxpaths_cfg *mpath_cfg)
 {
   struct attr *newattr, *existattr;
   struct attr_extra *newattre, *existattre;
@@ -477,6 +477,26 @@ bgp_info_cmp (struct bgp *bgp, struct bgp_info *new, 
struct bgp_info *exist,
   if (newm > existm)
     ret = 0;
 
+  /* 8.1. Same IGP metric. Compare the cluster list length as
+     representative of IGP hops metric. Rewrite the metric value
+     pair (newm, existm) with the cluster list length. Prefer the
+     path with smaller cluster list length.                       */
+  if (newm == existm)
+    {
+      if (peer_sort (new->peer) == BGP_PEER_IBGP
+         && peer_sort (exist->peer) == BGP_PEER_IBGP
+         && CHECK_FLAG (mpath_cfg->ibgp_flags,
+                        BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN))
+       {
+         newm = BGP_CLUSTER_LIST_LENGTH(new->attr);
+         existm = BGP_CLUSTER_LIST_LENGTH(exist->attr);
+         if (newm < existm)
+           ret = 1;
+         if (newm > existm)
+           ret = 0;
+       }
+    }
+
   /* 9. Maximum path check. */
   if (newm == existm)
     {
@@ -544,12 +564,8 @@ bgp_info_cmp (struct bgp *bgp, struct bgp_info *new, 
struct bgp_info *exist,
     return 0;
 
   /* 12. Cluster length comparision. */
-  new_cluster = exist_cluster = 0;
-
-  if (newattr->flag & ATTR_FLAG_BIT(BGP_ATTR_CLUSTER_LIST))
-    new_cluster = newattre->cluster->length;
-  if (existattr->flag & ATTR_FLAG_BIT(BGP_ATTR_CLUSTER_LIST))
-    exist_cluster = existattre->cluster->length;
+  new_cluster = BGP_CLUSTER_LIST_LENGTH(new->attr);
+  exist_cluster = BGP_CLUSTER_LIST_LENGTH(exist->attr);
 
   if (new_cluster < exist_cluster)
     return 1;
@@ -1377,7 +1393,8 @@ bgp_best_selection (struct bgp *bgp, struct bgp_node *rn,
                {
                  if (CHECK_FLAG (ri2->flags, BGP_INFO_SELECTED))
                    old_select = ri2;
-                 if (bgp_info_cmp (bgp, ri2, new_select, &paths_eq))
+                 if (bgp_info_cmp (bgp, ri2, new_select, &paths_eq,
+                                   mpath_cfg))
                    {
                      bgp_info_unset_flag (rn, new_select, 
BGP_INFO_DMED_SELECTED);
                      new_select = ri2;
@@ -1436,7 +1453,7 @@ bgp_best_selection (struct bgp *bgp, struct bgp_node *rn,
       bgp_info_unset_flag (rn, ri, BGP_INFO_DMED_CHECK);
       bgp_info_unset_flag (rn, ri, BGP_INFO_DMED_SELECTED);
 
-      if (bgp_info_cmp (bgp, ri, new_select, &paths_eq))
+      if (bgp_info_cmp (bgp, ri, new_select, &paths_eq, mpath_cfg))
        {
          if (do_mpath && bgp_flag_check (bgp, BGP_FLAG_DETERMINISTIC_MED))
            bgp_mp_dmed_deselect (new_select);
diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c
index 3f2c49a..0bd2ebe 100644
--- a/bgpd/bgp_vty.c
+++ b/bgpd/bgp_vty.c
@@ -651,28 +651,41 @@ DEFUN (no_bgp_confederation_peers,
   return CMD_SUCCESS;
 }
 
-/* Maximum-paths configuration */
-DEFUN (bgp_maxpaths,
-       bgp_maxpaths_cmd,
-       "maximum-paths <1-255>",
-       "Forward packets over multiple paths\n"
-       "Number of paths\n")
+/**
+ * Central routine for maximum-paths configuration.
+ * @peer_type: BGP_PEER_EBGP or BGP_PEER_IBGP
+ * @set: 1 for setting values, 0 for removing the max-paths config.
+ */
+static int
+bgp_maxpaths_config_vty (struct vty *vty, int peer_type, const char *mpaths,
+                        u_int16_t options, int set)
 {
   struct bgp *bgp;
-  u_int16_t maxpaths;
+  u_int16_t maxpaths = 0;
   int ret;
+  afi_t afi;
+  safi_t safi;
 
   bgp = vty->index;
+  afi = bgp_node_afi (vty);
+  safi = bgp_node_safi (vty);
 
-  VTY_GET_INTEGER_RANGE ("maximum-paths", maxpaths, argv[0], 1, 255);
+  if (set)
+    {
+      VTY_GET_INTEGER_RANGE ("maximum-paths", maxpaths, mpaths, 1, 255);
+      ret = bgp_maximum_paths_set (bgp, afi, safi, peer_type, maxpaths,
+                                  options);
+    }
+  else
+    ret = bgp_maximum_paths_unset (bgp, afi, safi, peer_type);
 
-  ret = bgp_maximum_paths_set (bgp, bgp_node_afi (vty), bgp_node_safi(vty),
-                              BGP_PEER_EBGP, maxpaths);
   if (ret < 0)
     {
       vty_out (vty,
-              "%% Failed to set maximum-paths %u for afi %u, safi %u%s",
-              maxpaths, bgp_node_afi (vty), bgp_node_safi(vty), VTY_NEWLINE);
+              "%% Failed to %sset maximum-paths %s %u for afi %u, safi %u%s",
+              (set == 1) ? "" : "un",
+              (peer_type == BGP_PEER_EBGP) ? "ebgp" : "ibgp",
+              maxpaths, afi, safi, VTY_NEWLINE);
       return CMD_WARNING;
     }
 
@@ -684,6 +697,17 @@ DEFUN (bgp_maxpaths,
   return CMD_SUCCESS;
 }
 
+
+/* Maximum-paths configuration */
+DEFUN (bgp_maxpaths,
+       bgp_maxpaths_cmd,
+       "maximum-paths <1-255>",
+       "Forward packets over multiple paths\n"
+       "Number of paths\n")
+{
+  return bgp_maxpaths_config_vty(vty, BGP_PEER_EBGP, argv[0], 0, 1);
+}
+
 DEFUN (bgp_maxpaths_ibgp,
        bgp_maxpaths_ibgp_cmd,
        "maximum-paths ibgp <1-255>",
@@ -691,30 +715,19 @@ DEFUN (bgp_maxpaths_ibgp,
        "iBGP-multipath\n"
        "Number of paths\n")
 {
-  struct bgp *bgp;
-  u_int16_t maxpaths;
-  int ret;
-
-  bgp = vty->index;
-
-  VTY_GET_INTEGER_RANGE ("maximum-paths", maxpaths, argv[0], 1, 255);
-
-  ret = bgp_maximum_paths_set (bgp, bgp_node_afi (vty), bgp_node_safi(vty),
-                              BGP_PEER_IBGP, maxpaths);
-  if (ret < 0)
-    {
-      vty_out (vty,
-              "%% Failed to set maximum-paths ibgp %u for afi %u, safi %u%s",
-              maxpaths, bgp_node_afi (vty), bgp_node_safi(vty), VTY_NEWLINE);
-      return CMD_WARNING;
-    }
-
-  if ((MULTIPATH_NUM != 0) && (maxpaths > MULTIPATH_NUM))
-    vty_out (vty,
-            "%% Warning: maximum-paths set to %d is greater than %d that zebra 
is compiled to support%s",
-            maxpaths, MULTIPATH_NUM, VTY_NEWLINE);
+  return bgp_maxpaths_config_vty(vty, BGP_PEER_IBGP, argv[0], 0, 1);
+}
 
-  return CMD_SUCCESS;
+DEFUN (bgp_maxpaths_ibgp_cluster,
+       bgp_maxpaths_ibgp_cluster_cmd,
+       "maximum-paths ibgp <1-255> equal-cluster-length",
+       "Forward packets over multiple paths\n"
+       "iBGP-multipath\n"
+       "Number of paths\n"
+       "Match the cluster length\n")
+{
+  return bgp_maxpaths_config_vty(vty, BGP_PEER_IBGP, argv[0],
+                                BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN, 1);
 }
 
 DEFUN (no_bgp_maxpaths,
@@ -724,22 +737,7 @@ DEFUN (no_bgp_maxpaths,
        "Forward packets over multiple paths\n"
        "Number of paths\n")
 {
-  struct bgp *bgp;
-  int ret;
-
-  bgp = vty->index;
-
-  ret = bgp_maximum_paths_unset (bgp, bgp_node_afi (vty), bgp_node_safi(vty),
-                                BGP_PEER_EBGP);
-  if (ret < 0)
-    {
-      vty_out (vty,
-              "%% Failed to unset maximum-paths for afi %u, safi %u%s",
-              bgp_node_afi (vty), bgp_node_safi(vty), VTY_NEWLINE);
-      return CMD_WARNING;
-    }
-
-  return CMD_SUCCESS;
+  return bgp_maxpaths_config_vty(vty, BGP_PEER_EBGP, NULL, 0, 0);
 }
 
 ALIAS (no_bgp_maxpaths,
@@ -757,22 +755,7 @@ DEFUN (no_bgp_maxpaths_ibgp,
        "iBGP-multipath\n"
        "Number of paths\n")
 {
-  struct bgp *bgp;
-  int ret;
-
-  bgp = vty->index;
-
-  ret = bgp_maximum_paths_unset (bgp, bgp_node_afi (vty), bgp_node_safi(vty),
-                                BGP_PEER_IBGP);
-  if (ret < 0)
-    {
-      vty_out (vty,
-              "%% Failed to unset maximum-paths ibgp for afi %u, safi %u%s",
-              bgp_node_afi (vty), bgp_node_safi(vty), VTY_NEWLINE);
-      return CMD_WARNING;
-    }
-
-  return CMD_SUCCESS;
+  return bgp_maxpaths_config_vty(vty, BGP_PEER_IBGP, NULL, 0, 0);
 }
 
 ALIAS (no_bgp_maxpaths_ibgp,
@@ -783,6 +766,15 @@ ALIAS (no_bgp_maxpaths_ibgp,
        "iBGP-multipath\n"
        "Number of paths\n")
 
+ALIAS (no_bgp_maxpaths_ibgp,
+       no_bgp_maxpaths_ibgp_cluster_cmd,
+       "no maximum-paths ibgp <1-255> equal-cluster-length",
+       NO_STR
+       "Forward packets over multiple paths\n"
+       "iBGP-multipath\n"
+       "Number of paths\n"
+       "Match the cluster length\n")
+
 int
 bgp_config_write_maxpaths (struct vty *vty, struct bgp *bgp, afi_t afi,
                           safi_t safi, int *write)
@@ -797,8 +789,12 @@ bgp_config_write_maxpaths (struct vty *vty, struct bgp 
*bgp, afi_t afi,
   if (bgp->maxpaths[afi][safi].maxpaths_ibgp != BGP_DEFAULT_MAXPATHS)
     {
       bgp_config_write_family_header (vty, afi, safi, write);
-      vty_out (vty, " maximum-paths ibgp %d%s",
-              bgp->maxpaths[afi][safi].maxpaths_ibgp, VTY_NEWLINE);
+      vty_out (vty, " maximum-paths ibgp %d",
+              bgp->maxpaths[afi][safi].maxpaths_ibgp);
+      if (CHECK_FLAG (bgp->maxpaths[afi][safi].ibgp_flags,
+                     BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN))
+       vty_out (vty, " equal-cluster-length");
+      vty_out (vty, "%s", VTY_NEWLINE);
     }
 
   return 0;
@@ -9183,14 +9179,20 @@ bgp_vty_init (void)
   install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_cmd);
   install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_arg_cmd);
   install_element (BGP_NODE, &bgp_maxpaths_ibgp_cmd);
+  install_element(BGP_NODE, &bgp_maxpaths_ibgp_cluster_cmd);
   install_element (BGP_NODE, &no_bgp_maxpaths_ibgp_cmd);
   install_element (BGP_NODE, &no_bgp_maxpaths_ibgp_arg_cmd);
+  install_element (BGP_NODE, &no_bgp_maxpaths_ibgp_cluster_cmd);
   install_element (BGP_IPV4_NODE, &bgp_maxpaths_ibgp_cmd);
+  install_element(BGP_IPV4_NODE, &bgp_maxpaths_ibgp_cluster_cmd);
   install_element (BGP_IPV4_NODE, &no_bgp_maxpaths_ibgp_cmd);
+  install_element (BGP_IPV4_NODE, &no_bgp_maxpaths_ibgp_cluster_cmd);
   install_element (BGP_IPV4_NODE, &no_bgp_maxpaths_ibgp_arg_cmd);
   install_element (BGP_IPV6_NODE, &bgp_maxpaths_ibgp_cmd);
+  install_element(BGP_IPV6_NODE, &bgp_maxpaths_ibgp_cluster_cmd);
   install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_ibgp_cmd);
   install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_ibgp_arg_cmd);
+  install_element (BGP_IPV6_NODE, &no_bgp_maxpaths_ibgp_cluster_cmd);
 
   /* "timers bgp" commands. */
   install_element (BGP_NODE, &bgp_timers_cmd);
diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h
index 7ae0acb..f5e6936 100644
--- a/bgpd/bgpd.h
+++ b/bgpd/bgpd.h
@@ -171,6 +171,8 @@ struct bgp
   struct bgp_maxpaths_cfg {
     u_int16_t maxpaths_ebgp;
     u_int16_t maxpaths_ibgp;
+    u_int16_t ibgp_flags;
+#define BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN (1 << 0)
   } maxpaths[AFI_MAX][SAFI_MAX];
 };
 
diff --git a/tests/bgp_mpath_test.c b/tests/bgp_mpath_test.c
index 3594753..812c1f6 100644
--- a/tests/bgp_mpath_test.c
+++ b/tests/bgp_mpath_test.c
@@ -157,9 +157,9 @@ run_bgp_cfg_maximum_paths (testcase_t *t)
     for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
       {
         /* test bgp_maximum_paths_set */
-        api_result = bgp_maximum_paths_set (bgp, afi, safi, BGP_PEER_EBGP, 10);
+        api_result = bgp_maximum_paths_set (bgp, afi, safi, BGP_PEER_EBGP, 10, 
0);
         EXPECT_TRUE (api_result == 0, test_result);
-        api_result = bgp_maximum_paths_set (bgp, afi, safi, BGP_PEER_IBGP, 10);
+        api_result = bgp_maximum_paths_set (bgp, afi, safi, BGP_PEER_IBGP, 10, 
0);
         EXPECT_TRUE (api_result == 0, test_result);
         EXPECT_TRUE (bgp->maxpaths[afi][safi].maxpaths_ebgp == 10, 
test_result);
         EXPECT_TRUE (bgp->maxpaths[afi][safi].maxpaths_ibgp == 10, 
test_result);
-- 
1.7.10.4


_______________________________________________
Quagga-dev mailing list
[email protected]
https://lists.quagga.net/mailman/listinfo/quagga-dev

Reply via email to