On Tue, Feb 16, 2021 at 10:44:31AM +0100, Claudio Jeker wrote: > On route-servers at IXPs there is often the wish to not have path hiding > because of output filters. On way of solving this is to use per peer RIBs > but the cost (in memory) is very high. This solves this issue in a > different way but with the same (or similar result). > > In bgpd all prefixes/paths are sorted by their preference and by default > the best path is put into the Adj-RIB-Out (after that prefix passed the > output filter). This diff changes this behaviour by falling back to the > next prefix in case the current prefix was filtered. This stops path > hiding and announces the next best route. The cost for doing this is a > bump in CPU usage. On the other hand the memory consumption remains the > same. > > The feature can be enabled with 'rde evaluate all' globably or per peer. > It is also to turn it off by using 'rde evaluate default' (in case a peer > does not want this).
Better diff that fixes an issue in parse.y adding DEFAULT as keyword. -- :wq Claudio Index: bgpd.conf.5 =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/bgpd.conf.5,v retrieving revision 1.208 diff -u -p -r1.208 bgpd.conf.5 --- bgpd.conf.5 16 Feb 2021 08:29:16 -0000 1.208 +++ bgpd.conf.5 16 Feb 2021 09:22:37 -0000 @@ -268,9 +268,18 @@ daemons, such as .Xr ospfd 8 . .Pp .It Xo -.Ic rde -.Ic med -.Ic compare +.Ic rde Ic evaluate +.Pq Ic default Ns | Ns Ic all +.Xc +If set to +.Ar all +keep evaluating alternative paths in case the selected path is filtered +out. +By default if a path is filtered by the output filters then no alternative +path is sent to this peer. +.Pp +.It Xo +.Ic rde Ic med Ic compare .Pq Ic always Ns | Ns Ic strict .Xc If set to @@ -1151,6 +1160,20 @@ setting. .Pp .It Ic remote-as Ar as-number Set the AS number of the remote system. +.Pp +.It Xo +.Ic rde Ic evaluate +.Pq Ic default Ns | Ns Ic all +.Xc +If set to +.Ar all +keep evaluating alternative paths in case the selected path is filtered +out. +By default if a path is filtered by the output filters then no alternative +path is sent to this peer. +The default is inherited from the global +.Ic rde Ic evaluate +setting. .Pp .It Ic rib Ar name Bind the neighbor to the specified RIB. Index: bgpd.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v retrieving revision 1.412 diff -u -p -r1.412 bgpd.h --- bgpd.h 16 Feb 2021 08:29:16 -0000 1.412 +++ bgpd.h 16 Feb 2021 09:22:38 -0000 @@ -65,7 +65,8 @@ #define BGPD_FLAG_DECISION_ROUTEAGE 0x0100 #define BGPD_FLAG_DECISION_TRANS_AS 0x0200 #define BGPD_FLAG_DECISION_MED_ALWAYS 0x0400 -#define BGPD_FLAG_NO_AS_SET 0x0800 +#define BGPD_FLAG_DECISION_ALL_PATHS 0x0800 +#define BGPD_FLAG_NO_AS_SET 0x1000 #define BGPD_LOG_UPDATES 0x0001 @@ -408,7 +409,8 @@ struct peer_config { #define PEERFLAG_TRANS_AS 0x01 #define PEERFLAG_LOG_UPDATES 0x02 -#define PEERFLAG_NO_AS_SET 0x04 +#define PEERFLAG_EVALUATE_ALL 0x04 +#define PEERFLAG_NO_AS_SET 0x08 enum network_type { NETWORK_DEFAULT, /* from network statements */ Index: parse.y =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v retrieving revision 1.413 diff -u -p -r1.413 parse.y --- parse.y 16 Feb 2021 08:29:16 -0000 1.413 +++ parse.y 16 Feb 2021 10:28:39 -0000 @@ -791,6 +791,19 @@ conf_main : AS as4number { } free($4); } + | RDE EVALUATE STRING { + if (!strcmp($3, "all")) + conf->flags |= BGPD_FLAG_DECISION_ALL_PATHS; + else if (!strcmp($3, "default")) + conf->flags &= ~BGPD_FLAG_DECISION_ALL_PATHS; + else { + yyerror("rde evaluate: " + "unknown setting \"%s\"", $3); + free($3); + YYERROR; + } + free($3); + } | NEXTHOP QUALIFY VIA STRING { if (!strcmp($4, "bgp")) conf->flags |= BGPD_FLAG_NEXTHOP_BGP; @@ -1727,6 +1740,19 @@ peeropts : REMOTEAS as4number { else curpeer->conf.flags &= ~PEERFLAG_NO_AS_SET; } + | RDE EVALUATE STRING { + if (!strcmp($3, "all")) + curpeer->conf.flags |= PEERFLAG_EVALUATE_ALL; + else if (!strcmp($3, "default")) + curpeer->conf.flags &= ~PEERFLAG_EVALUATE_ALL; + else { + yyerror("rde evaluate: " + "unknown setting \"%s\"", $3); + free($3); + YYERROR; + } + free($3); + } ; restart : /* nada */ { $$ = 0; } @@ -3888,6 +3914,8 @@ alloc_peer(void) if (conf->flags & BGPD_FLAG_DECISION_TRANS_AS) p->conf.flags |= PEERFLAG_TRANS_AS; + if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS) + p->conf.flags |= PEERFLAG_EVALUATE_ALL; if (conf->flags & BGPD_FLAG_NO_AS_SET) p->conf.flags |= PEERFLAG_NO_AS_SET; @@ -3910,8 +3938,6 @@ new_peer(void) sizeof(p->conf.descr)) >= sizeof(p->conf.descr)) fatalx("new_peer descr strlcpy"); p->conf.groupid = curgroup->conf.id; - p->conf.local_as = curgroup->conf.local_as; - p->conf.local_short_as = curgroup->conf.local_short_as; } return (p); } Index: printconf.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/printconf.c,v retrieving revision 1.146 diff -u -p -r1.146 printconf.c --- printconf.c 16 Feb 2021 08:29:16 -0000 1.146 +++ printconf.c 16 Feb 2021 09:22:38 -0000 @@ -390,9 +390,10 @@ print_mainconf(struct bgpd_config *conf) if (conf->flags & BGPD_FLAG_DECISION_ROUTEAGE) printf("rde route-age evaluate\n"); - if (conf->flags & BGPD_FLAG_DECISION_MED_ALWAYS) printf("rde med compare always\n"); + if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS) + printf("rde evaluate all\n"); if (conf->flags & BGPD_FLAG_NO_AS_SET) printf("reject as-set yes\n"); @@ -681,6 +682,14 @@ print_peer(struct peer_config *p, struct printf("%s\tdepend on \"%s\"\n", c, p->if_depend); if (p->flags & PEERFLAG_TRANS_AS) printf("%s\ttransparent-as yes\n", c); + + if (conf->flags & BGPD_FLAG_DECISION_ALL_PATHS) { + if (!(p->flags & PEERFLAG_EVALUATE_ALL)) + printf("%s\trde evaluate default\n", c); + } else { + if (p->flags & PEERFLAG_EVALUATE_ALL) + printf("%s\trde evaluate all\n", c); + } if (conf->flags & BGPD_FLAG_NO_AS_SET) { if (!(p->flags & PEERFLAG_NO_AS_SET)) Index: rde.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v retrieving revision 1.515 diff -u -p -r1.515 rde.c --- rde.c 16 Feb 2021 08:29:16 -0000 1.515 +++ rde.c 16 Feb 2021 09:22:38 -0000 @@ -2875,8 +2875,17 @@ rde_send_kroute(struct rib *rib, struct /* * update specific functions */ +static int rde_eval_all; + +int +rde_evaluate_all(void) +{ + return rde_eval_all; +} + void -rde_generate_updates(struct rib *rib, struct prefix *new, struct prefix *old) +rde_generate_updates(struct rib *rib, struct prefix *new, struct prefix *old, + int eval_all) { struct rde_peer *peer; u_int8_t aid; @@ -2889,7 +2898,7 @@ rde_generate_updates(struct rib *rib, st if (old == NULL && new == NULL) return; - if ((rib->flags & F_RIB_NOFIB) == 0) + if (!eval_all && (rib->flags & F_RIB_NOFIB) == 0) rde_send_kroute(rib, new, old); if (new) @@ -2897,13 +2906,22 @@ rde_generate_updates(struct rib *rib, st else aid = old->pt->aid; + rde_eval_all = 0; LIST_FOREACH(peer, &peerlist, peer_l) { - if (peer->conf.id == 0) - continue; - if (peer->loc_rib_id != rib->id) + /* skip ourself */ + if (peer == peerself) continue; if (peer->state != PEER_UP) continue; + /* handle evaluate all, keep track if it is needed */ + if (peer->conf.flags & PEERFLAG_EVALUATE_ALL) + rde_eval_all = 1; + else if (eval_all) + /* skip default peers if the best path didn't change */ + continue; + /* skip peers using a different rib */ + if (peer->loc_rib_id != rib->id) + continue; /* check if peer actually supports the address family */ if (peer->capa.mp[aid] == 0) continue; @@ -3556,7 +3574,7 @@ rde_softreconfig_sync_reeval(struct rib_ nexthop_unlink(p); } if (re->active) { - rde_generate_updates(rib, NULL, re->active); + rde_generate_updates(rib, NULL, re->active, 0); re->active = NULL; } return; Index: rde.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v retrieving revision 1.236 diff -u -p -r1.236 rde.h --- rde.h 13 Jan 2021 11:34:01 -0000 1.236 +++ rde.h 11 Feb 2021 10:52:10 -0000 @@ -371,8 +371,9 @@ void rde_send_nexthop(struct bgpd_addr void rde_pftable_add(u_int16_t, struct prefix *); void rde_pftable_del(u_int16_t, struct prefix *); +int rde_evaluate_all(void); void rde_generate_updates(struct rib *, struct prefix *, - struct prefix *); + struct prefix *, int); u_int32_t rde_local_as(void); int rde_decisionflags(void); int rde_as4byte(struct rde_peer *); @@ -486,6 +487,7 @@ communities_unref(struct rde_community * int community_to_rd(struct community *, u_int64_t *); /* rde_decide.c */ +int prefix_eligible(struct prefix *); void prefix_evaluate(struct rib_entry *, struct prefix *, struct prefix *); /* rde_filter.c */ Index: rde_decide.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v retrieving revision 1.81 diff -u -p -r1.81 rde_decide.c --- rde_decide.c 2 Feb 2021 15:24:43 -0000 1.81 +++ rde_decide.c 11 Feb 2021 10:53:30 -0000 @@ -91,8 +91,8 @@ void prefix_remove(struct prefix *, stru /* * Decision Engine OUR implementation: - * Our implementation has only one RIB. The filtering is done first. The - * filtering calculates the preference and stores it in LOCAL_PREF (Phase 1). + * The filtering is done first. The filtering calculates the preference and + * stores it in LOCAL_PREF (Phase 1). * Ineligible routes are flagged as ineligible via nexthop_add(). * Phase 3 is done together with Phase 2. * In following cases a prefix needs to be reevaluated: @@ -284,6 +284,13 @@ prefix_cmp(struct prefix *p1, struct pre fatalx("Uh, oh a politician in the decision process"); } +/* + * Insert a prefix keeping the total order of the list. For routes + * that may depend on a MED selection the set is scanned until the + * condition is cleared. If a MED inversion is detected the respective + * prefix is taken of the rib list and put onto a redo queue. All + * prefixes on the redo queue are re-inserted at the end. + */ void prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re) { @@ -351,6 +358,15 @@ prefix_insert(struct prefix *new, struct } } +/* + * Remove a prefix from the RIB list ensuring that the total order of the + * list remains intact. All routes that differ in the MED are taken of the + * list and put on the redo list. To figure out if a route could cause a + * resort because of a MED check the next prefix of the to-remove prefix + * is compared with the old prefix. A full scan is only done if the next + * route differs because of the MED or later checks. + * Again at the end all routes on the redo queue are reinserted. + */ void prefix_remove(struct prefix *old, struct rib_entry *re) { @@ -397,6 +413,23 @@ prefix_remove(struct prefix *old, struct } } +/* helper function to check if a prefix is valid to be selected */ +int +prefix_eligible(struct prefix *p) +{ + struct rde_aspath *asp = prefix_aspath(p); + struct nexthop *nh = prefix_nexthop(p); + + /* The aspath needs to be loop and error free */ + if (asp == NULL || asp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR)) + return 0; + /* The nexthop needs to exist and be reachable */ + if (nh == NULL || nh->state != NEXTHOP_REACH) + return 0; + + return 1; +} + /* * Find the correct place to insert the prefix in the prefix list. * If the active prefix has changed we need to send an update. @@ -420,7 +453,7 @@ prefix_evaluate(struct rib_entry *re, st * active. Clean up now to ensure that the RIB * is consistant. */ - rde_generate_updates(re_rib(re), NULL, re->active); + rde_generate_updates(re_rib(re), NULL, re->active, 0); re->active = NULL; } return; @@ -433,15 +466,8 @@ prefix_evaluate(struct rib_entry *re, st prefix_insert(new, NULL, re); xp = LIST_FIRST(&re->prefix_h); - if (xp != NULL) { - struct rde_aspath *xasp = prefix_aspath(xp); - if (xasp == NULL || - xasp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) || - (prefix_nexthop(xp) != NULL && prefix_nexthop(xp)->state != - NEXTHOP_REACH)) - /* xp is ineligible */ - xp = NULL; - } + if (xp != NULL && !prefix_eligible(xp)) + xp = NULL; /* * If the active prefix changed or the active prefix was removed @@ -453,7 +479,17 @@ prefix_evaluate(struct rib_entry *re, st * but remember that xp may be NULL aka ineligible. * Additional decision may be made by the called functions. */ - rde_generate_updates(re_rib(re), xp, re->active); + rde_generate_updates(re_rib(re), xp, re->active, 0); re->active = xp; + return; } + + /* + * If there are peers with 'rde evaluate all' every update needs + * to be passed on (not only a change of the best prefix). + * rde_generate_updates() will then take care of distribution. + */ + if (rde_evaluate_all()) + if ((new != NULL && prefix_eligible(new)) || old != NULL) + rde_generate_updates(re_rib(re), re->active, NULL, 1); } Index: rde_update.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v retrieving revision 1.124 diff -u -p -r1.124 rde_update.c --- rde_update.c 9 Jan 2021 16:49:41 -0000 1.124 +++ rde_update.c 11 Feb 2021 11:33:32 -0000 @@ -23,6 +23,7 @@ #include <stdlib.h> #include <string.h> #include <siphash.h> +#include <stdio.h> #include "bgpd.h" #include "rde.h" @@ -49,26 +50,22 @@ up_test_update(struct rde_peer *peer, st { struct rde_aspath *asp; struct rde_community *comm; - struct rde_peer *prefp; + struct rde_peer *frompeer; - if (p == NULL) - /* no prefix available */ - return (0); - - prefp = prefix_peer(p); + frompeer = prefix_peer(p); asp = prefix_aspath(p); comm = prefix_communities(p); - if (peer == prefp) - /* Do not send routes back to sender */ - return (0); - if (asp == NULL || asp->flags & F_ATTR_PARSE_ERR) fatalx("try to send out a botched path"); if (asp->flags & F_ATTR_LOOP) fatalx("try to send out a looped path"); - if (!prefp->conf.ebgp && !peer->conf.ebgp) { + if (peer == frompeer) + /* Do not send routes back to sender */ + return (0); + + if (!frompeer->conf.ebgp && !peer->conf.ebgp) { /* * route reflector redistribution rules: * 1. if announce is set -> announce @@ -77,7 +74,7 @@ up_test_update(struct rde_peer *peer, st * 4. old non-client, new client -> yes * 5. old client, new client -> yes */ - if (prefp->conf.reflector_client == 0 && + if (frompeer->conf.reflector_client == 0 && peer->conf.reflector_client == 0 && (asp->flags & F_PREFIX_ANNOUNCED) == 0) /* Do not redistribute updates to ibgp peers */ @@ -101,14 +98,12 @@ void up_generate_updates(struct filter_head *rules, struct rde_peer *peer, struct prefix *new, struct prefix *old) { - struct filterstate state; - struct bgpd_addr addr; - - if (peer->state != PEER_UP) - return; + struct filterstate state; + struct bgpd_addr addr; + int need_withdraw; +again: if (new == NULL) { -withdraw: if (old == NULL) /* no prefix to withdraw */ return; @@ -121,8 +116,28 @@ withdraw: peer->up_wcnt++; } } else { - if (!up_test_update(peer, new)) { - goto withdraw; + need_withdraw = 0; + /* + * up_test_update() needs to run before the output filters + * else the well known communities wont work properly. + * The output filters would not be able to add well known + * communities. + */ + if (!up_test_update(peer, new)) + need_withdraw = 1; + + /* + * if 'rde evaluate all' is set for this peer then + * delay the the withdraw because of up_test_update(). + * The filters may actually skip this prefix and so this + * decision needs to be delayed. + * For the default mode we can just give up here and + * skip the filters. + */ + if (need_withdraw && + !(peer->conf.flags & PEERFLAG_EVALUATE_ALL)) { + new = NULL; + goto again; } rde_filterstate_prep(&state, prefix_aspath(new), @@ -133,7 +148,18 @@ withdraw: new->pt->prefixlen, prefix_vstate(new), &state) == ACTION_DENY) { rde_filterstate_clean(&state); - goto withdraw; + if (peer->conf.flags & PEERFLAG_EVALUATE_ALL) + new = LIST_NEXT(new, entry.list.rib); + else + new = NULL; + if (new != NULL && !prefix_eligible(new)) + new = NULL; + goto again; + } + + if (need_withdraw) { + new = NULL; + goto again; } /* only send update if path changed */