On Thu, Jan 14, 2021 at 12:27:54PM +0100, Claudio Jeker wrote:
> Currently bgpd does not properly handle strict med route decisions.
> The problem is that the strict MED check only matters for aspaths with the
> same neighbor as. The route decision process currently stops as soon as
> the current prefix is better then the one checked in the list of prefixes.
> Now in some cases this results in unstable decisions because the order of
> insertions matter. Depending on the order any route may be selected.
> The med.sh regress test I added shows this issue. Depending on the order
> any of the 3 routes can be selected as best:
> 
> 1:
> flags ovs destination          gateway          lpref   med aspath origin
> *>      N 10.12.1.0/24         10.12.57.4        100    50 64501 64510 i
> *       N 10.12.1.0/24         10.12.57.2        100   100 64501 64510 i
> *       N 10.12.1.0/24         10.12.57.3        100   100 64502 64510 i
> 
> 2:
> flags ovs destination          gateway          lpref   med aspath origin
> *>      N 10.12.1.0/24         10.12.57.2        100   100 64501 64510 i
> *       N 10.12.1.0/24         10.12.57.3        100   100 64502 64510 i
> *       N 10.12.1.0/24         10.12.57.4        100    50 64501 64510 i
> 
> 3 (and the actual expected result):
> flags ovs destination          gateway          lpref   med aspath origin
> *>      N 10.12.1.0/24         10.12.57.3        100   100 64502 64510 i
> *       N 10.12.1.0/24         10.12.57.4        100    50 64501 64510 i
> *       N 10.12.1.0/24         10.12.57.2        100   100 64501 64510 i
> 
> Additionally removing a route requires to reevaluate part of the routes in
> some cases. For examle in case 3 if the middle route (with med 50) is
> removed then the last route actually becomes best (bgpid is lower).
> 
> The following diff fixes this issue hopefully. On insertion if decisions
> happen at or after the MED check (step 5) then all remaining routes need
> to be checked (until a check before step 5 happens). Routes matching on
> med that need to be re-evaluated are put on a redo queue and at the end of
> the decision process are put back to get their order right.
> 
> Something similar happens when removing a prefix. If the next prefix
> differ on a check after the MED check then again all those prefixes need
> to be rechecked and maybe re-evaluated.
> 
> This change is important but also rather complex. Please test and if
> possible validate that it does not cause troubles in your setup.
> Btw. this only matters for 'rde med compare strict' (default). 

I would really like some feedback on this. This is a major issue in bgpd
that finally needs to be fixed. Please help by testing this out.

-- 
:wq Claudio

Index: rde_decide.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
retrieving revision 1.80
diff -u -p -r1.80 rde_decide.c
--- rde_decide.c        14 Jan 2021 08:29:26 -0000      1.80
+++ rde_decide.c        14 Jan 2021 10:16:30 -0000
@@ -26,7 +26,9 @@
 #include "rde.h"
 #include "log.h"
 
-int    prefix_cmp(struct prefix *, struct prefix *);
+int    prefix_cmp(struct prefix *, struct prefix *, int *);
+void   prefix_insert(struct prefix *, struct prefix *, struct rib_entry *);
+void   prefix_remove(struct prefix *, struct rib_entry *);
 /*
  * Decision Engine RFC implementation:
  *  Phase 1:
@@ -107,7 +109,7 @@ int prefix_cmp(struct prefix *, struct p
  * already added prefix.
  */
 int
-prefix_cmp(struct prefix *p1, struct prefix *p2)
+prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall)
 {
        struct rde_aspath       *asp1, *asp2;
        struct rde_peer         *peer1, *peer2;
@@ -115,6 +117,16 @@ prefix_cmp(struct prefix *p1, struct pre
        u_int32_t                p1id, p2id;
        int                      p1cnt, p2cnt, i;
 
+       /*
+        * If a match happens before the MED check then the list is
+        * correctly sorted. If a match happens after MED then it
+        * may further elements need to be checked to make sure that
+        * all path are considered that could affect this path.
+        * If the check happens to be on MED signal this by setting
+        * testall to 2.
+        */
+       *testall = 0;
+
        if (p1 == NULL)
                return -1;
        if (p2 == NULL)
@@ -166,10 +178,14 @@ prefix_cmp(struct prefix *p1, struct pre
        /*
         * 5. MED decision
         * Only comparable between the same neighboring AS or if
-        * 'rde med compare always' is set.
+        * 'rde med compare always' is set. In the first case
+        * set the testall flag since further elements need to be
+        * evaluated as well.
         */
        if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) ||
            aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) {
+               if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
+                       *testall = 2;
                /* lowest value wins */
                if (asp1->med < asp2->med)
                        return 1;
@@ -177,6 +193,9 @@ prefix_cmp(struct prefix *p1, struct pre
                        return -1;
        }
 
+       if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
+               *testall = 1;
+
        /*
         * 6. EBGP is cooler than IBGP
         * It is absolutely important that the ebgp value in peer_config.ebgp
@@ -264,6 +283,117 @@ prefix_cmp(struct prefix *p1, struct pre
        fatalx("Uh, oh a politician in the decision process");
 }
 
+void
+prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re)
+{
+       struct prefix_list redo = LIST_HEAD_INITIALIZER(redo);
+       struct prefix *xp, *np, *rp, *ip = ep;
+       int testall, selected = 0;
+
+       if (ep == NULL)
+               ep = LIST_FIRST(&re->prefix_h);
+
+       for (xp = ep; xp != NULL; xp = np) {
+               np = LIST_NEXT(xp, entry.list.rib);
+
+               if (prefix_cmp(new, xp, &testall) > 0) {
+                       /* p is preferred over xp */
+                       if (testall == 0)
+                               break;          /* we're done */
+                       else if (testall ==  2) {
+                               /*
+                                * MED inversion, take out prefix and
+                                * put it onto redo queue.
+                                */
+                               LIST_REMOVE(xp, entry.list.rib);
+                               if (LIST_EMPTY(&redo))
+                                       LIST_INSERT_HEAD(&redo, xp,
+                                           entry.list.rib);
+                               else
+                                       LIST_INSERT_AFTER(rp, xp,
+                                           entry.list.rib);
+                               rp = xp;
+                       } else {
+                               /*
+                                * lock insertion point and
+                                * continue on with scan
+                                */
+                               selected = 1;
+                               continue;
+                       }
+               } else {
+                       /*
+                        * p is less preferred, remember insertion point
+                        * If p got selected during a testall traverse 
+                        * do not alter the insertion point unless this
+                        * happened on an actual MED check.
+                        */
+                       if (testall == 2)
+                               selected = 0;
+                       if (!selected)
+                               ip = xp;
+               }
+       }
+
+       if (ip == NULL)
+               LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
+       else
+               LIST_INSERT_AFTER(ip, new, entry.list.rib);
+
+       /* Fixup MED order again */
+       while (!LIST_EMPTY(&redo)) {
+               xp = LIST_FIRST(&redo);
+               LIST_REMOVE(xp, entry.list.rib);
+
+               prefix_insert(xp, new, re);
+       }
+}
+
+void
+prefix_remove(struct prefix *old, struct rib_entry *re)
+{
+       struct prefix_list redo = LIST_HEAD_INITIALIZER(redo);
+       struct prefix *xp, *np, *rp;
+       int testall;
+
+       xp = LIST_NEXT(old, entry.list.rib);
+       LIST_REMOVE(old, entry.list.rib);
+       prefix_cmp(old, xp, &testall);
+       if (testall > 0) {
+               /* maybe MED route, scan tail for other possible routes */
+               for (; xp != NULL; xp = np) {
+                       np = LIST_NEXT(xp, entry.list.rib);
+
+                       /* only interested in the testall result */
+                       prefix_cmp(old, xp, &testall);
+                       if (testall == 0)
+                               break;          /* we're done */
+                       else if (testall ==  2) {
+                               /*
+                                * possible MED inversion, take out prefix and
+                                * put it onto redo queue.
+                                */
+                               LIST_REMOVE(xp, entry.list.rib);
+                               if (LIST_EMPTY(&redo))
+                                       LIST_INSERT_HEAD(&redo, xp,
+                                           entry.list.rib);
+                               else
+                                       LIST_INSERT_AFTER(rp, xp,
+                                           entry.list.rib);
+                               rp = xp;
+                       }
+               }
+       }
+
+       /* Fixup MED order again */
+       while (!LIST_EMPTY(&redo)) {
+               xp = LIST_FIRST(&redo);
+               LIST_REMOVE(xp, entry.list.rib);
+
+               prefix_insert(xp, NULL, re);
+       }
+}
+
 /*
  * Find the correct place to insert the prefix in the prefix list.
  * If the active prefix has changed we need to send an update.
@@ -294,27 +424,10 @@ prefix_evaluate(struct rib_entry *re, st
        }
 
        if (old != NULL)
-               LIST_REMOVE(old, entry.list.rib);
-               
-       if (new != NULL) {
-               if (LIST_EMPTY(&re->prefix_h))
-                       LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
-               else {
-                       LIST_FOREACH(xp, &re->prefix_h, entry.list.rib) {
-                               if (prefix_cmp(new, xp) > 0) {
-                                       LIST_INSERT_BEFORE(xp, new,
-                                           entry.list.rib);
-                                       break;
-                               } else if (LIST_NEXT(xp, entry.list.rib) ==
-                                   NULL) {
-                                       /* if xp last element ... */
-                                       LIST_INSERT_AFTER(xp, new,
-                                           entry.list.rib);
-                                       break;
-                               }
-                       }
-               }
-       }
+               prefix_remove(old, re);
+
+       if (new != NULL)
+               prefix_insert(new, NULL, re);
 
        xp = LIST_FIRST(&re->prefix_h);
        if (xp != NULL) {

Reply via email to