On Tue, Feb 16, 2021 at 10:44:31AM +0100, Claudio Jeker wrote:
> On route-servers at IXPs there is often the wish to not have path hiding
> because of output filters. On way of solving this is to use per peer RIBs
> but the cost (in memory) is very high. This solves this issue in a
> different way but with the same (or similar result).
> 
> In bgpd all prefixes/paths are sorted by their preference and by default
> the best path is put into the Adj-RIB-Out (after that prefix passed the
> output filter). This diff changes this behaviour by falling back to the
> next prefix in case the current prefix was filtered. This stops path
> hiding and announces the next best route. The cost for doing this is a
> bump in CPU usage. On the other hand the memory consumption remains the
> same.
> 
> The feature can be enabled with 'rde evaluate all' globably or per peer.
> It is also to turn it off by using 'rde evaluate default' (in case a peer
> does not want this).

Better diff that fixes an issue in parse.y adding DEFAULT as keyword.

-- 
:wq Claudio

Index: bgpd.conf.5
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.conf.5,v
retrieving revision 1.208
diff -u -p -r1.208 bgpd.conf.5
--- bgpd.conf.5 16 Feb 2021 08:29:16 -0000      1.208
+++ bgpd.conf.5 16 Feb 2021 09:22:37 -0000
@@ -268,9 +268,18 @@ daemons, such as
 .Xr ospfd 8 .
 .Pp
 .It Xo
-.Ic rde
-.Ic med
-.Ic compare
+.Ic rde Ic evaluate
+.Pq Ic default Ns | Ns Ic all
+.Xc
+If set to
+.Ar all
+keep evaluating alternative paths in case the selected path is filtered
+out.
+By default if a path is filtered by the output filters then no alternative
+path is sent to this peer.
+.Pp
+.It Xo
+.Ic rde Ic med Ic compare
 .Pq Ic always Ns | Ns Ic strict
 .Xc
 If set to
@@ -1151,6 +1160,20 @@ setting.
 .Pp
 .It Ic remote-as Ar as-number
 Set the AS number of the remote system.
+.Pp
+.It Xo
+.Ic rde Ic evaluate
+.Pq Ic default Ns | Ns Ic all
+.Xc
+If set to
+.Ar all
+keep evaluating alternative paths in case the selected path is filtered
+out.
+By default if a path is filtered by the output filters then no alternative
+path is sent to this peer.
+The default is inherited from the global
+.Ic rde Ic evaluate
+setting.
 .Pp
 .It Ic rib Ar name
 Bind the neighbor to the specified RIB.
Index: bgpd.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v
retrieving revision 1.412
diff -u -p -r1.412 bgpd.h
--- bgpd.h      16 Feb 2021 08:29:16 -0000      1.412
+++ bgpd.h      16 Feb 2021 09:22:38 -0000
@@ -65,7 +65,8 @@
 #define        BGPD_FLAG_DECISION_ROUTEAGE     0x0100
 #define        BGPD_FLAG_DECISION_TRANS_AS     0x0200
 #define        BGPD_FLAG_DECISION_MED_ALWAYS   0x0400
-#define        BGPD_FLAG_NO_AS_SET             0x0800
+#define        BGPD_FLAG_DECISION_ALL_PATHS    0x0800
+#define        BGPD_FLAG_NO_AS_SET             0x1000
 
 #define        BGPD_LOG_UPDATES                0x0001
 
@@ -408,7 +409,8 @@ struct peer_config {
 
 #define PEERFLAG_TRANS_AS      0x01
 #define PEERFLAG_LOG_UPDATES   0x02
-#define PEERFLAG_NO_AS_SET     0x04
+#define PEERFLAG_EVALUATE_ALL  0x04
+#define PEERFLAG_NO_AS_SET     0x08
 
 enum network_type {
        NETWORK_DEFAULT,        /* from network statements */
Index: parse.y
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v
retrieving revision 1.413
diff -u -p -r1.413 parse.y
--- parse.y     16 Feb 2021 08:29:16 -0000      1.413
+++ parse.y     16 Feb 2021 10:28:39 -0000
@@ -791,6 +791,19 @@ conf_main  : AS as4number          {
                        }
                        free($4);
                }
+               | RDE EVALUATE STRING {
+                       if (!strcmp($3, "all"))
+                               conf->flags |= BGPD_FLAG_DECISION_ALL_PATHS;
+                       else if (!strcmp($3, "default"))
+                               conf->flags &= ~BGPD_FLAG_DECISION_ALL_PATHS;
+                       else {
+                               yyerror("rde evaluate: "
+                                   "unknown setting \"%s\"", $3);
+                               free($3);
+                               YYERROR;
+                       }
+                       free($3);
+               }
                | NEXTHOP QUALIFY VIA STRING    {
                        if (!strcmp($4, "bgp"))
                                conf->flags |= BGPD_FLAG_NEXTHOP_BGP;
@@ -1727,6 +1740,19 @@ peeropts : REMOTEAS as4number    {
                        else
                                curpeer->conf.flags &= ~PEERFLAG_NO_AS_SET;
                }
+               | RDE EVALUATE STRING {
+                       if (!strcmp($3, "all"))
+                               curpeer->conf.flags |= PEERFLAG_EVALUATE_ALL;
+                       else if (!strcmp($3, "default"))
+                               curpeer->conf.flags &= ~PEERFLAG_EVALUATE_ALL;
+                       else {
+                               yyerror("rde evaluate: "
+                                   "unknown setting \"%s\"", $3);
+                               free($3);
+                               YYERROR;
+                       }
+                       free($3);
+               }
                ;
 
 restart                : /* nada */            { $$ = 0; }
@@ -3888,6 +3914,8 @@ alloc_peer(void)
 
        if (conf->flags & BGPD_FLAG_DECISION_TRANS_AS)
                p->conf.flags |= PEERFLAG_TRANS_AS;
+       if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS)
+               p->conf.flags |= PEERFLAG_EVALUATE_ALL;
        if (conf->flags & BGPD_FLAG_NO_AS_SET)
                p->conf.flags |= PEERFLAG_NO_AS_SET;
 
@@ -3910,8 +3938,6 @@ new_peer(void)
                    sizeof(p->conf.descr)) >= sizeof(p->conf.descr))
                        fatalx("new_peer descr strlcpy");
                p->conf.groupid = curgroup->conf.id;
-               p->conf.local_as = curgroup->conf.local_as;
-               p->conf.local_short_as = curgroup->conf.local_short_as;
        }
        return (p);
 }
Index: printconf.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/printconf.c,v
retrieving revision 1.146
diff -u -p -r1.146 printconf.c
--- printconf.c 16 Feb 2021 08:29:16 -0000      1.146
+++ printconf.c 16 Feb 2021 09:22:38 -0000
@@ -390,9 +390,10 @@ print_mainconf(struct bgpd_config *conf)
 
        if (conf->flags & BGPD_FLAG_DECISION_ROUTEAGE)
                printf("rde route-age evaluate\n");
-
        if (conf->flags & BGPD_FLAG_DECISION_MED_ALWAYS)
                printf("rde med compare always\n");
+       if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS)
+               printf("rde evaluate all\n");
 
        if (conf->flags & BGPD_FLAG_NO_AS_SET)
                printf("reject as-set yes\n");
@@ -681,6 +682,14 @@ print_peer(struct peer_config *p, struct
                printf("%s\tdepend on \"%s\"\n", c, p->if_depend);
        if (p->flags & PEERFLAG_TRANS_AS)
                printf("%s\ttransparent-as yes\n", c);
+
+       if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS) {
+               if (!(p->flags & PEERFLAG_EVALUATE_ALL))
+                       printf("%s\trde evaluate default\n", c);
+       } else {
+               if (p->flags & PEERFLAG_EVALUATE_ALL)
+                       printf("%s\trde evaluate all\n", c);
+       }
 
        if (conf->flags & BGPD_FLAG_NO_AS_SET) {
                if (!(p->flags & PEERFLAG_NO_AS_SET))
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.515
diff -u -p -r1.515 rde.c
--- rde.c       16 Feb 2021 08:29:16 -0000      1.515
+++ rde.c       16 Feb 2021 09:22:38 -0000
@@ -2875,8 +2875,17 @@ rde_send_kroute(struct rib *rib, struct 
 /*
  * update specific functions
  */
+static int rde_eval_all;
+
+int
+rde_evaluate_all(void)
+{
+       return rde_eval_all;
+}
+
 void
-rde_generate_updates(struct rib *rib, struct prefix *new, struct prefix *old)
+rde_generate_updates(struct rib *rib, struct prefix *new, struct prefix *old,
+    int eval_all)
 {
        struct rde_peer *peer;
        u_int8_t         aid;
@@ -2889,7 +2898,7 @@ rde_generate_updates(struct rib *rib, st
        if (old == NULL && new == NULL)
                return;
 
-       if ((rib->flags & F_RIB_NOFIB) == 0)
+       if (!eval_all && (rib->flags & F_RIB_NOFIB) == 0)
                rde_send_kroute(rib, new, old);
 
        if (new)
@@ -2897,13 +2906,22 @@ rde_generate_updates(struct rib *rib, st
        else
                aid = old->pt->aid;
 
+       rde_eval_all = 0;
        LIST_FOREACH(peer, &peerlist, peer_l) {
-               if (peer->conf.id == 0)
-                       continue;
-               if (peer->loc_rib_id != rib->id)
+               /* skip ourself */
+               if (peer == peerself)
                        continue;
                if (peer->state != PEER_UP)
                        continue;
+               /* handle evaluate all, keep track if it is needed */
+               if (peer->conf.flags & PEERFLAG_EVALUATE_ALL)
+                       rde_eval_all = 1;
+               else if (eval_all)
+                       /* skip default peers if the best path didn't change */
+                       continue;
+               /* skip peers using a different rib */
+               if (peer->loc_rib_id != rib->id)
+                       continue;
                /* check if peer actually supports the address family */
                if (peer->capa.mp[aid] == 0)
                        continue;
@@ -3556,7 +3574,7 @@ rde_softreconfig_sync_reeval(struct rib_
                                nexthop_unlink(p);
                }
                if (re->active) {
-                       rde_generate_updates(rib, NULL, re->active);
+                       rde_generate_updates(rib, NULL, re->active, 0);
                        re->active = NULL;
                }
                return;
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
retrieving revision 1.236
diff -u -p -r1.236 rde.h
--- rde.h       13 Jan 2021 11:34:01 -0000      1.236
+++ rde.h       11 Feb 2021 10:52:10 -0000
@@ -371,8 +371,9 @@ void                rde_send_nexthop(struct bgpd_addr 
 void           rde_pftable_add(u_int16_t, struct prefix *);
 void           rde_pftable_del(u_int16_t, struct prefix *);
 
+int            rde_evaluate_all(void);
 void           rde_generate_updates(struct rib *, struct prefix *,
-                   struct prefix *);
+                   struct prefix *, int);
 u_int32_t      rde_local_as(void);
 int            rde_decisionflags(void);
 int            rde_as4byte(struct rde_peer *);
@@ -486,6 +487,7 @@ communities_unref(struct rde_community *
 int    community_to_rd(struct community *, u_int64_t *);
 
 /* rde_decide.c */
+int    prefix_eligible(struct prefix *);
 void   prefix_evaluate(struct rib_entry *, struct prefix *, struct prefix *);
 
 /* rde_filter.c */
Index: rde_decide.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
retrieving revision 1.81
diff -u -p -r1.81 rde_decide.c
--- rde_decide.c        2 Feb 2021 15:24:43 -0000       1.81
+++ rde_decide.c        11 Feb 2021 10:53:30 -0000
@@ -91,8 +91,8 @@ void  prefix_remove(struct prefix *, stru
 
 /*
  * Decision Engine OUR implementation:
- * Our implementation has only one RIB. The filtering is done first. The
- * filtering calculates the preference and stores it in LOCAL_PREF (Phase 1).
+ * The filtering is done first. The filtering calculates the preference and
+ * stores it in LOCAL_PREF (Phase 1).
  * Ineligible routes are flagged as ineligible via nexthop_add().
  * Phase 3 is done together with Phase 2.
  * In following cases a prefix needs to be reevaluated:
@@ -284,6 +284,13 @@ prefix_cmp(struct prefix *p1, struct pre
        fatalx("Uh, oh a politician in the decision process");
 }
 
+/*
+ * Insert a prefix keeping the total order of the list. For routes
+ * that may depend on a MED selection the set is scanned until the
+ * condition is cleared. If a MED inversion is detected the respective
+ * prefix is taken of the rib list and put onto a redo queue. All
+ * prefixes on the redo queue are re-inserted at the end.
+ */
 void
 prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re)
 {
@@ -351,6 +358,15 @@ prefix_insert(struct prefix *new, struct
        }
 }
 
+/*
+ * Remove a prefix from the RIB list ensuring that the total order of the
+ * list remains intact. All routes that differ in the MED are taken of the
+ * list and put on the redo list. To figure out if a route could cause a
+ * resort because of a MED check the next prefix of the to-remove prefix
+ * is compared with the old prefix. A full scan is only done if the next
+ * route differs because of the MED or later checks.
+ * Again at the end all routes on the redo queue are reinserted.
+ */
 void
 prefix_remove(struct prefix *old, struct rib_entry *re)
 {
@@ -397,6 +413,23 @@ prefix_remove(struct prefix *old, struct
        }
 }
 
+/* helper function to check if a prefix is valid to be selected */
+int
+prefix_eligible(struct prefix *p)
+{
+       struct rde_aspath *asp = prefix_aspath(p);
+       struct nexthop *nh = prefix_nexthop(p);
+
+       /* The aspath needs to be loop and error free */
+       if (asp == NULL || asp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR))
+               return 0;
+       /* The nexthop needs to exist and be reachable */
+       if (nh == NULL || nh->state != NEXTHOP_REACH)
+               return 0;
+
+       return 1;
+}
+
 /*
  * Find the correct place to insert the prefix in the prefix list.
  * If the active prefix has changed we need to send an update.
@@ -420,7 +453,7 @@ prefix_evaluate(struct rib_entry *re, st
                         * active. Clean up now to ensure that the RIB
                         * is consistant.
                         */
-                       rde_generate_updates(re_rib(re), NULL, re->active);
+                       rde_generate_updates(re_rib(re), NULL, re->active, 0);
                        re->active = NULL;
                }
                return;
@@ -433,15 +466,8 @@ prefix_evaluate(struct rib_entry *re, st
                prefix_insert(new, NULL, re);
 
        xp = LIST_FIRST(&re->prefix_h);
-       if (xp != NULL) {
-               struct rde_aspath *xasp = prefix_aspath(xp);
-               if (xasp == NULL ||
-                   xasp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) ||
-                   (prefix_nexthop(xp) != NULL && prefix_nexthop(xp)->state !=
-                   NEXTHOP_REACH))
-                       /* xp is ineligible */
-                       xp = NULL;
-       }
+       if (xp != NULL && !prefix_eligible(xp))
+               xp = NULL;
 
        /*
         * If the active prefix changed or the active prefix was removed
@@ -453,7 +479,17 @@ prefix_evaluate(struct rib_entry *re, st
                 * but remember that xp may be NULL aka ineligible.
                 * Additional decision may be made by the called functions.
                 */
-               rde_generate_updates(re_rib(re), xp, re->active);
+               rde_generate_updates(re_rib(re), xp, re->active, 0);
                re->active = xp;
+               return;
        }
+
+       /*
+        * If there are peers with 'rde evaluate all' every update needs
+        * to be passed on (not only a change of the best prefix).
+        * rde_generate_updates() will then take care of distribution.
+        */
+       if (rde_evaluate_all())
+               if ((new != NULL && prefix_eligible(new)) || old != NULL)
+                       rde_generate_updates(re_rib(re), re->active, NULL, 1);
 }
Index: rde_update.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v
retrieving revision 1.124
diff -u -p -r1.124 rde_update.c
--- rde_update.c        9 Jan 2021 16:49:41 -0000       1.124
+++ rde_update.c        11 Feb 2021 11:33:32 -0000
@@ -23,6 +23,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <siphash.h>
+#include <stdio.h>
 
 #include "bgpd.h"
 #include "rde.h"
@@ -49,26 +50,22 @@ up_test_update(struct rde_peer *peer, st
 {
        struct rde_aspath       *asp;
        struct rde_community    *comm;
-       struct rde_peer         *prefp;
+       struct rde_peer         *frompeer;
 
-       if (p == NULL)
-               /* no prefix available */
-               return (0);
-
-       prefp = prefix_peer(p);
+       frompeer = prefix_peer(p);
        asp = prefix_aspath(p);
        comm = prefix_communities(p);
 
-       if (peer == prefp)
-               /* Do not send routes back to sender */
-               return (0);
-
        if (asp == NULL || asp->flags & F_ATTR_PARSE_ERR)
                fatalx("try to send out a botched path");
        if (asp->flags & F_ATTR_LOOP)
                fatalx("try to send out a looped path");
 
-       if (!prefp->conf.ebgp && !peer->conf.ebgp) {
+       if (peer == frompeer)
+               /* Do not send routes back to sender */
+               return (0);
+
+       if (!frompeer->conf.ebgp && !peer->conf.ebgp) {
                /*
                 * route reflector redistribution rules:
                 * 1. if announce is set                -> announce
@@ -77,7 +74,7 @@ up_test_update(struct rde_peer *peer, st
                 * 4. old non-client, new client        -> yes
                 * 5. old client, new client            -> yes
                 */
-               if (prefp->conf.reflector_client == 0 &&
+               if (frompeer->conf.reflector_client == 0 &&
                    peer->conf.reflector_client == 0 &&
                    (asp->flags & F_PREFIX_ANNOUNCED) == 0)
                        /* Do not redistribute updates to ibgp peers */
@@ -101,14 +98,12 @@ void
 up_generate_updates(struct filter_head *rules, struct rde_peer *peer,
     struct prefix *new, struct prefix *old)
 {
-       struct filterstate              state;
-       struct bgpd_addr                addr;
-
-       if (peer->state != PEER_UP)
-               return;
+       struct filterstate      state;
+       struct bgpd_addr        addr;
+       int                     need_withdraw;
 
+again:
        if (new == NULL) {
-withdraw:
                if (old == NULL)
                        /* no prefix to withdraw */
                        return;
@@ -121,8 +116,28 @@ withdraw:
                        peer->up_wcnt++;
                }
        } else {
-               if (!up_test_update(peer, new)) {
-                       goto withdraw;
+               need_withdraw = 0;
+               /*
+                * up_test_update() needs to run before the output filters
+                * else the well known communities wont work properly.
+                * The output filters would not be able to add well known
+                * communities.
+                */
+               if (!up_test_update(peer, new))
+                       need_withdraw = 1;
+
+               /*
+                * if 'rde evaluate all' is set for this peer then
+                * delay the the withdraw because of up_test_update().
+                * The filters may actually skip this prefix and so this
+                * decision needs to be delayed.
+                * For the default mode we can just give up here and
+                * skip the filters.
+                */
+               if (need_withdraw &&
+                   !(peer->conf.flags & PEERFLAG_EVALUATE_ALL)) {
+                       new = NULL;
+                       goto again;
                }
 
                rde_filterstate_prep(&state, prefix_aspath(new),
@@ -133,7 +148,18 @@ withdraw:
                    new->pt->prefixlen, prefix_vstate(new), &state) ==
                    ACTION_DENY) {
                        rde_filterstate_clean(&state);
-                       goto withdraw;
+                       if (peer->conf.flags & PEERFLAG_EVALUATE_ALL)
+                               new = LIST_NEXT(new, entry.list.rib);
+                       else
+                               new = NULL;
+                       if (new != NULL && !prefix_eligible(new))
+                               new = NULL;
+                       goto again;
+               }
+
+               if (need_withdraw) {
+                       new = NULL;
+                       goto again;
                }
 
                /* only send update if path changed */

Reply via email to