On Thu, Jan 14, 2021 at 12:27:54PM +0100, Claudio Jeker wrote:
> Currently bgpd does not properly handle strict med route decisions.
> The problem is that the strict MED check only matters for aspaths with the
> same neighbor as. The route decision process currently stops as soon as
> the current prefix is better then the one checked in the list of prefixes.
> Now in some cases this results in unstable decisions because the order of
> insertions matter. Depending on the order any route may be selected.
> The med.sh regress test I added shows this issue. Depending on the order
> any of the 3 routes can be selected as best:
>
> 1:
> flags ovs destination gateway lpref med aspath origin
> *> N 10.12.1.0/24 10.12.57.4 100 50 64501 64510 i
> * N 10.12.1.0/24 10.12.57.2 100 100 64501 64510 i
> * N 10.12.1.0/24 10.12.57.3 100 100 64502 64510 i
>
> 2:
> flags ovs destination gateway lpref med aspath origin
> *> N 10.12.1.0/24 10.12.57.2 100 100 64501 64510 i
> * N 10.12.1.0/24 10.12.57.3 100 100 64502 64510 i
> * N 10.12.1.0/24 10.12.57.4 100 50 64501 64510 i
>
> 3 (and the actual expected result):
> flags ovs destination gateway lpref med aspath origin
> *> N 10.12.1.0/24 10.12.57.3 100 100 64502 64510 i
> * N 10.12.1.0/24 10.12.57.4 100 50 64501 64510 i
> * N 10.12.1.0/24 10.12.57.2 100 100 64501 64510 i
>
> Additionally removing a route requires to reevaluate part of the routes in
> some cases. For examle in case 3 if the middle route (with med 50) is
> removed then the last route actually becomes best (bgpid is lower).
>
> The following diff fixes this issue hopefully. On insertion if decisions
> happen at or after the MED check (step 5) then all remaining routes need
> to be checked (until a check before step 5 happens). Routes matching on
> med that need to be re-evaluated are put on a redo queue and at the end of
> the decision process are put back to get their order right.
>
> Something similar happens when removing a prefix. If the next prefix
> differ on a check after the MED check then again all those prefixes need
> to be rechecked and maybe re-evaluated.
>
> This change is important but also rather complex. Please test and if
> possible validate that it does not cause troubles in your setup.
> Btw. this only matters for 'rde med compare strict' (default).
I would really like some feedback on this. This is a major issue in bgpd
that finally needs to be fixed. Please help by testing this out.
--
:wq Claudio
Index: rde_decide.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
retrieving revision 1.80
diff -u -p -r1.80 rde_decide.c
--- rde_decide.c 14 Jan 2021 08:29:26 -0000 1.80
+++ rde_decide.c 14 Jan 2021 10:16:30 -0000
@@ -26,7 +26,9 @@
#include "rde.h"
#include "log.h"
-int prefix_cmp(struct prefix *, struct prefix *);
+int prefix_cmp(struct prefix *, struct prefix *, int *);
+void prefix_insert(struct prefix *, struct prefix *, struct rib_entry *);
+void prefix_remove(struct prefix *, struct rib_entry *);
/*
* Decision Engine RFC implementation:
* Phase 1:
@@ -107,7 +109,7 @@ int prefix_cmp(struct prefix *, struct p
* already added prefix.
*/
int
-prefix_cmp(struct prefix *p1, struct prefix *p2)
+prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall)
{
struct rde_aspath *asp1, *asp2;
struct rde_peer *peer1, *peer2;
@@ -115,6 +117,16 @@ prefix_cmp(struct prefix *p1, struct pre
u_int32_t p1id, p2id;
int p1cnt, p2cnt, i;
+ /*
+ * If a match happens before the MED check then the list is
+ * correctly sorted. If a match happens after MED then it
+ * may further elements need to be checked to make sure that
+ * all path are considered that could affect this path.
+ * If the check happens to be on MED signal this by setting
+ * testall to 2.
+ */
+ *testall = 0;
+
if (p1 == NULL)
return -1;
if (p2 == NULL)
@@ -166,10 +178,14 @@ prefix_cmp(struct prefix *p1, struct pre
/*
* 5. MED decision
* Only comparable between the same neighboring AS or if
- * 'rde med compare always' is set.
+ * 'rde med compare always' is set. In the first case
+ * set the testall flag since further elements need to be
+ * evaluated as well.
*/
if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) ||
aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) {
+ if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
+ *testall = 2;
/* lowest value wins */
if (asp1->med < asp2->med)
return 1;
@@ -177,6 +193,9 @@ prefix_cmp(struct prefix *p1, struct pre
return -1;
}
+ if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
+ *testall = 1;
+
/*
* 6. EBGP is cooler than IBGP
* It is absolutely important that the ebgp value in peer_config.ebgp
@@ -264,6 +283,117 @@ prefix_cmp(struct prefix *p1, struct pre
fatalx("Uh, oh a politician in the decision process");
}
+void
+prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re)
+{
+ struct prefix_list redo = LIST_HEAD_INITIALIZER(redo);
+ struct prefix *xp, *np, *rp, *ip = ep;
+ int testall, selected = 0;
+
+ if (ep == NULL)
+ ep = LIST_FIRST(&re->prefix_h);
+
+ for (xp = ep; xp != NULL; xp = np) {
+ np = LIST_NEXT(xp, entry.list.rib);
+
+ if (prefix_cmp(new, xp, &testall) > 0) {
+ /* p is preferred over xp */
+ if (testall == 0)
+ break; /* we're done */
+ else if (testall == 2) {
+ /*
+ * MED inversion, take out prefix and
+ * put it onto redo queue.
+ */
+ LIST_REMOVE(xp, entry.list.rib);
+ if (LIST_EMPTY(&redo))
+ LIST_INSERT_HEAD(&redo, xp,
+ entry.list.rib);
+ else
+ LIST_INSERT_AFTER(rp, xp,
+ entry.list.rib);
+ rp = xp;
+ } else {
+ /*
+ * lock insertion point and
+ * continue on with scan
+ */
+ selected = 1;
+ continue;
+ }
+ } else {
+ /*
+ * p is less preferred, remember insertion point
+ * If p got selected during a testall traverse
+ * do not alter the insertion point unless this
+ * happened on an actual MED check.
+ */
+ if (testall == 2)
+ selected = 0;
+ if (!selected)
+ ip = xp;
+ }
+ }
+
+ if (ip == NULL)
+ LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
+ else
+ LIST_INSERT_AFTER(ip, new, entry.list.rib);
+
+ /* Fixup MED order again */
+ while (!LIST_EMPTY(&redo)) {
+ xp = LIST_FIRST(&redo);
+ LIST_REMOVE(xp, entry.list.rib);
+
+ prefix_insert(xp, new, re);
+ }
+}
+
+void
+prefix_remove(struct prefix *old, struct rib_entry *re)
+{
+ struct prefix_list redo = LIST_HEAD_INITIALIZER(redo);
+ struct prefix *xp, *np, *rp;
+ int testall;
+
+ xp = LIST_NEXT(old, entry.list.rib);
+ LIST_REMOVE(old, entry.list.rib);
+ prefix_cmp(old, xp, &testall);
+ if (testall > 0) {
+ /* maybe MED route, scan tail for other possible routes */
+ for (; xp != NULL; xp = np) {
+ np = LIST_NEXT(xp, entry.list.rib);
+
+ /* only interested in the testall result */
+ prefix_cmp(old, xp, &testall);
+ if (testall == 0)
+ break; /* we're done */
+ else if (testall == 2) {
+ /*
+ * possible MED inversion, take out prefix and
+ * put it onto redo queue.
+ */
+ LIST_REMOVE(xp, entry.list.rib);
+ if (LIST_EMPTY(&redo))
+ LIST_INSERT_HEAD(&redo, xp,
+ entry.list.rib);
+ else
+ LIST_INSERT_AFTER(rp, xp,
+ entry.list.rib);
+ rp = xp;
+ }
+ }
+ }
+
+ /* Fixup MED order again */
+ while (!LIST_EMPTY(&redo)) {
+ xp = LIST_FIRST(&redo);
+ LIST_REMOVE(xp, entry.list.rib);
+
+ prefix_insert(xp, NULL, re);
+ }
+}
+
/*
* Find the correct place to insert the prefix in the prefix list.
* If the active prefix has changed we need to send an update.
@@ -294,27 +424,10 @@ prefix_evaluate(struct rib_entry *re, st
}
if (old != NULL)
- LIST_REMOVE(old, entry.list.rib);
-
- if (new != NULL) {
- if (LIST_EMPTY(&re->prefix_h))
- LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
- else {
- LIST_FOREACH(xp, &re->prefix_h, entry.list.rib) {
- if (prefix_cmp(new, xp) > 0) {
- LIST_INSERT_BEFORE(xp, new,
- entry.list.rib);
- break;
- } else if (LIST_NEXT(xp, entry.list.rib) ==
- NULL) {
- /* if xp last element ... */
- LIST_INSERT_AFTER(xp, new,
- entry.list.rib);
- break;
- }
- }
- }
- }
+ prefix_remove(old, re);
+
+ if (new != NULL)
+ prefix_insert(new, NULL, re);
xp = LIST_FIRST(&re->prefix_h);
if (xp != NULL) {