Currently bgpd does not properly handle strict med route decisions.
The problem is that the strict MED check only matters for aspaths with the
same neighbor as. The route decision process currently stops as soon as
the current prefix is better then the one checked in the list of prefixes.
Now in some cases this results in unstable decisions because the order of
insertions matter. Depending on the order any route may be selected.
The med.sh regress test I added shows this issue. Depending on the order
any of the 3 routes can be selected as best:
1:
flags ovs destination gateway lpref med aspath origin
*> N 10.12.1.0/24 10.12.57.4 100 50 64501 64510 i
* N 10.12.1.0/24 10.12.57.2 100 100 64501 64510 i
* N 10.12.1.0/24 10.12.57.3 100 100 64502 64510 i
2:
flags ovs destination gateway lpref med aspath origin
*> N 10.12.1.0/24 10.12.57.2 100 100 64501 64510 i
* N 10.12.1.0/24 10.12.57.3 100 100 64502 64510 i
* N 10.12.1.0/24 10.12.57.4 100 50 64501 64510 i
3 (and the actual expected result):
flags ovs destination gateway lpref med aspath origin
*> N 10.12.1.0/24 10.12.57.3 100 100 64502 64510 i
* N 10.12.1.0/24 10.12.57.4 100 50 64501 64510 i
* N 10.12.1.0/24 10.12.57.2 100 100 64501 64510 i
Additionally removing a route requires to reevaluate part of the routes in
some cases. For examle in case 3 if the middle route (with med 50) is
removed then the last route actually becomes best (bgpid is lower).
The following diff fixes this issue hopefully. On insertion if decisions
happen at or after the MED check (step 5) then all remaining routes need
to be checked (until a check before step 5 happens). Routes matching on
med that need to be re-evaluated are put on a redo queue and at the end of
the decision process are put back to get their order right.
Something similar happens when removing a prefix. If the next prefix
differ on a check after the MED check then again all those prefixes need
to be rechecked and maybe re-evaluated.
This change is important but also rather complex. Please test and if
possible validate that it does not cause troubles in your setup.
Btw. this only matters for 'rde med compare strict' (default).
--
:wq Claudio
Index: rde_decide.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
retrieving revision 1.80
diff -u -p -r1.80 rde_decide.c
--- rde_decide.c 14 Jan 2021 08:29:26 -0000 1.80
+++ rde_decide.c 14 Jan 2021 10:16:30 -0000
@@ -26,7 +26,9 @@
#include "rde.h"
#include "log.h"
-int prefix_cmp(struct prefix *, struct prefix *);
+int prefix_cmp(struct prefix *, struct prefix *, int *);
+void prefix_insert(struct prefix *, struct prefix *, struct rib_entry *);
+void prefix_remove(struct prefix *, struct rib_entry *);
/*
* Decision Engine RFC implementation:
* Phase 1:
@@ -107,7 +109,7 @@ int prefix_cmp(struct prefix *, struct p
* already added prefix.
*/
int
-prefix_cmp(struct prefix *p1, struct prefix *p2)
+prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall)
{
struct rde_aspath *asp1, *asp2;
struct rde_peer *peer1, *peer2;
@@ -115,6 +117,16 @@ prefix_cmp(struct prefix *p1, struct pre
u_int32_t p1id, p2id;
int p1cnt, p2cnt, i;
+ /*
+ * If a match happens before the MED check then the list is
+ * correctly sorted. If a match happens after MED then it
+ * may further elements need to be checked to make sure that
+ * all path are considered that could affect this path.
+ * If the check happens to be on MED signal this by setting
+ * testall to 2.
+ */
+ *testall = 0;
+
if (p1 == NULL)
return -1;
if (p2 == NULL)
@@ -166,10 +178,14 @@ prefix_cmp(struct prefix *p1, struct pre
/*
* 5. MED decision
* Only comparable between the same neighboring AS or if
- * 'rde med compare always' is set.
+ * 'rde med compare always' is set. In the first case
+ * set the testall flag since further elements need to be
+ * evaluated as well.
*/
if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) ||
aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) {
+ if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
+ *testall = 2;
/* lowest value wins */
if (asp1->med < asp2->med)
return 1;
@@ -177,6 +193,9 @@ prefix_cmp(struct prefix *p1, struct pre
return -1;
}
+ if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
+ *testall = 1;
+
/*
* 6. EBGP is cooler than IBGP
* It is absolutely important that the ebgp value in peer_config.ebgp
@@ -264,6 +283,117 @@ prefix_cmp(struct prefix *p1, struct pre
fatalx("Uh, oh a politician in the decision process");
}
+void
+prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re)
+{
+ struct prefix_list redo = LIST_HEAD_INITIALIZER(redo);
+ struct prefix *xp, *np, *rp, *ip = ep;
+ int testall, selected = 0;
+
+ if (ep == NULL)
+ ep = LIST_FIRST(&re->prefix_h);
+
+ for (xp = ep; xp != NULL; xp = np) {
+ np = LIST_NEXT(xp, entry.list.rib);
+
+ if (prefix_cmp(new, xp, &testall) > 0) {
+ /* p is preferred over xp */
+ if (testall == 0)
+ break; /* we're done */
+ else if (testall == 2) {
+ /*
+ * MED inversion, take out prefix and
+ * put it onto redo queue.
+ */
+ LIST_REMOVE(xp, entry.list.rib);
+ if (LIST_EMPTY(&redo))
+ LIST_INSERT_HEAD(&redo, xp,
+ entry.list.rib);
+ else
+ LIST_INSERT_AFTER(rp, xp,
+ entry.list.rib);
+ rp = xp;
+ } else {
+ /*
+ * lock insertion point and
+ * continue on with scan
+ */
+ selected = 1;
+ continue;
+ }
+ } else {
+ /*
+ * p is less preferred, remember insertion point
+ * If p got selected during a testall traverse
+ * do not alter the insertion point unless this
+ * happened on an actual MED check.
+ */
+ if (testall == 2)
+ selected = 0;
+ if (!selected)
+ ip = xp;
+ }
+ }
+
+ if (ip == NULL)
+ LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
+ else
+ LIST_INSERT_AFTER(ip, new, entry.list.rib);
+
+ /* Fixup MED order again */
+ while (!LIST_EMPTY(&redo)) {
+ xp = LIST_FIRST(&redo);
+ LIST_REMOVE(xp, entry.list.rib);
+
+ prefix_insert(xp, new, re);
+ }
+}
+
+void
+prefix_remove(struct prefix *old, struct rib_entry *re)
+{
+ struct prefix_list redo = LIST_HEAD_INITIALIZER(redo);
+ struct prefix *xp, *np, *rp;
+ int testall;
+
+ xp = LIST_NEXT(old, entry.list.rib);
+ LIST_REMOVE(old, entry.list.rib);
+ prefix_cmp(old, xp, &testall);
+ if (testall > 0) {
+ /* maybe MED route, scan tail for other possible routes */
+ for (; xp != NULL; xp = np) {
+ np = LIST_NEXT(xp, entry.list.rib);
+
+ /* only interested in the testall result */
+ prefix_cmp(old, xp, &testall);
+ if (testall == 0)
+ break; /* we're done */
+ else if (testall == 2) {
+ /*
+ * possible MED inversion, take out prefix and
+ * put it onto redo queue.
+ */
+ LIST_REMOVE(xp, entry.list.rib);
+ if (LIST_EMPTY(&redo))
+ LIST_INSERT_HEAD(&redo, xp,
+ entry.list.rib);
+ else
+ LIST_INSERT_AFTER(rp, xp,
+ entry.list.rib);
+ rp = xp;
+ }
+ }
+ }
+
+ /* Fixup MED order again */
+ while (!LIST_EMPTY(&redo)) {
+ xp = LIST_FIRST(&redo);
+ LIST_REMOVE(xp, entry.list.rib);
+
+ prefix_insert(xp, NULL, re);
+ }
+}
+
/*
* Find the correct place to insert the prefix in the prefix list.
* If the active prefix has changed we need to send an update.
@@ -294,27 +424,10 @@ prefix_evaluate(struct rib_entry *re, st
}
if (old != NULL)
- LIST_REMOVE(old, entry.list.rib);
-
- if (new != NULL) {
- if (LIST_EMPTY(&re->prefix_h))
- LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
- else {
- LIST_FOREACH(xp, &re->prefix_h, entry.list.rib) {
- if (prefix_cmp(new, xp) > 0) {
- LIST_INSERT_BEFORE(xp, new,
- entry.list.rib);
- break;
- } else if (LIST_NEXT(xp, entry.list.rib) ==
- NULL) {
- /* if xp last element ... */
- LIST_INSERT_AFTER(xp, new,
- entry.list.rib);
- break;
- }
- }
- }
- }
+ prefix_remove(old, re);
+
+ if (new != NULL)
+ prefix_insert(new, NULL, re);
xp = LIST_FIRST(&re->prefix_h);
if (xp != NULL) {