This is a bit bigger then planned but the change is a bit more
substantial. First of all this removes 'route-collector yes|no' from the
config. The old route-collector mode is removed because it causes
confusion and the same can be done by using 'rde rib Loc-RIB no evaluate'.
On top of this it is now possible to change those flags during runtime
and the softreload process will take care and make sure all is applied
correctly. Turning the evaluation process off and on again turned out to
be a bit more tricky than expected. So a few additional changes had to be
added to the decision process (making sure that an update received during
the reload run does not leave behind pointers to freed objects).
Please test and review :)
--
:wq Claudio
Index: bgpd.conf.5
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.conf.5,v
retrieving revision 1.191
diff -u -p -r1.191 bgpd.conf.5
--- bgpd.conf.5 17 Jun 2019 14:00:45 -0000 1.191
+++ bgpd.conf.5 25 Jul 2019 09:07:22 -0000
@@ -334,16 +334,6 @@ In this case the decision process is no
The default is
.Ic ignore .
.Pp
-.It Xo
-.Ic route-collector
-.Pq Ic yes Ns | Ns Ic no
-.Xc
-If set to
-.Ic yes ,
-the route selection process is turned off.
-The default is
-.Ic no .
-.Pp
.It Ic router-id Ar address
Set the router ID to the given IP address, which must be local to the
machine.
Index: bgpd.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v
retrieving revision 1.390
diff -u -p -r1.390 bgpd.h
--- bgpd.h 23 Jul 2019 06:26:44 -0000 1.390
+++ bgpd.h 25 Jul 2019 09:26:02 -0000
@@ -58,10 +58,9 @@
#define BGPD_OPT_NOACTION 0x0004
#define BGPD_OPT_FORCE_DEMOTE 0x0008
-#define BGPD_FLAG_NO_EVALUATE 0x0002
#define BGPD_FLAG_REFLECTOR 0x0004
-#define BGPD_FLAG_NEXTHOP_BGP 0x0080
-#define BGPD_FLAG_NEXTHOP_DEFAULT 0x1000
+#define BGPD_FLAG_NEXTHOP_BGP 0x0010
+#define BGPD_FLAG_NEXTHOP_DEFAULT 0x0020
#define BGPD_FLAG_DECISION_MASK 0x0f00
#define BGPD_FLAG_DECISION_ROUTEAGE 0x0100
#define BGPD_FLAG_DECISION_TRANS_AS 0x0200
Index: parse.y
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v
retrieving revision 1.396
diff -u -p -r1.396 parse.y
--- parse.y 24 Jul 2019 20:25:27 -0000 1.396
+++ parse.y 25 Jul 2019 09:01:04 -0000
@@ -203,7 +203,7 @@ typedef struct {
%token ANNOUNCE CAPABILITIES REFRESH AS4BYTE CONNECTRETRY
%token DEMOTE ENFORCE NEIGHBORAS ASOVERRIDE REFLECTOR DEPEND DOWN
%token DUMP IN OUT SOCKET RESTRICTED
-%token LOG ROUTECOLL TRANSPARENT
+%token LOG TRANSPARENT
%token TCP MD5SIG PASSWORD KEY TTLSECURITY
%token ALLOW DENY MATCH
%token QUICK
@@ -614,12 +614,6 @@ conf_main : AS as4number {
else
rr->flags &= ~F_RIB_NOFIBSYNC;
}
- | ROUTECOLL yesno {
- if ($2 == 1)
- conf->flags |= BGPD_FLAG_NO_EVALUATE;
- else
- conf->flags &= ~BGPD_FLAG_NO_EVALUATE;
- }
| TRANSPARENT yesno {
if ($2 == 1)
conf->flags |= BGPD_FLAG_DECISION_TRANS_AS;
@@ -2843,7 +2837,6 @@ lookup(char *s)
{ "restricted", RESTRICTED},
{ "rib", RIB},
{ "roa-set", ROASET },
- { "route-collector", ROUTECOLL},
{ "route-reflector", REFLECTOR},
{ "router-id", ROUTERID},
{ "rtable", RTABLE},
Index: printconf.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/printconf.c,v
retrieving revision 1.138
diff -u -p -r1.138 printconf.c
--- printconf.c 24 Jul 2019 20:25:27 -0000 1.138
+++ printconf.c 25 Jul 2019 09:02:30 -0000
@@ -386,9 +386,6 @@ print_mainconf(struct bgpd_config *conf)
if (conf->connectretry != INTERVAL_CONNECTRETRY)
printf("connect-retry %u\n", conf->connectretry);
- if (conf->flags & BGPD_FLAG_NO_EVALUATE)
- printf("route-collector yes\n");
-
if (conf->flags & BGPD_FLAG_DECISION_ROUTEAGE)
printf("rde route-age evaluate\n");
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.479
diff -u -p -r1.479 rde.c
--- rde.c 24 Jul 2019 20:25:27 -0000 1.479
+++ rde.c 25 Jul 2019 09:02:09 -0000
@@ -81,6 +81,9 @@ static void rde_softreconfig_out_done(v
static void rde_softreconfig_done(void);
static void rde_softreconfig_out(struct rib_entry *, void *);
static void rde_softreconfig_in(struct rib_entry *, void *);
+static void rde_softreconfig_sync_reeval(struct rib_entry *, void *);
+static void rde_softreconfig_sync_fib(struct rib_entry *, void *);
+static void rde_softreconfig_sync_done(void *, u_int8_t);
int rde_update_queue_pending(void);
void rde_update_queue_runner(void);
void rde_update6_queue_runner(u_int8_t);
@@ -802,6 +805,7 @@ rde_dispatch_imsg_parent(struct imsgbuf
if (!rib_valid(rid))
continue;
ribs[rid].state = RECONF_DELETE;
+ ribs[rid].fibstate = RECONF_NONE;
}
break;
case IMSG_RECONF_RIB:
@@ -810,29 +814,18 @@ rde_dispatch_imsg_parent(struct imsgbuf
fatalx("IMSG_RECONF_RIB bad len");
memcpy(&rn, imsg.data, sizeof(rn));
rib = rib_byid(rib_find(rn.name));
- if (rib == NULL)
+ if (rib == NULL) {
rib = rib_new(rn.name, rn.rtableid, rn.flags);
- else if (
- (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) !=
- (rn.flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) ||
- (rib->rtableid != rn.rtableid &&
- !(rn.flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)))) {
- struct filter_head *in_rules;
- struct rib_desc *ribd = rib_desc(rib);
- /*
- * Big hammer in the F_RIB_NOFIB case but
- * not often enough used to optimise it more.
- * Need to save the filters so that they're not
- * lost. If the rtableid changes but there is
- * no FIB no action is needed.
- */
- in_rules = ribd->in_rules;
- ribd->in_rules = NULL;
- rib_free(rib);
- rib = rib_new(rn.name, rn.rtableid, rn.flags);
- ribd->in_rules = in_rules;
- } else
+ } else if (rib->flags == rn.flags &&
+ rib->rtableid == rn.rtableid) {
+ /* no change to rib apart from filters */
rib_desc(rib)->state = RECONF_KEEP;
+ } else {
+ /* reload rib because somehing changed */
+ rib->flags_tmp = rn.flags;
+ rib->rtableid_tmp = rn.rtableid;
+ rib_desc(rib)->state = RECONF_RELOAD;
+ }
break;
case IMSG_RECONF_FILTER:
if (imsg.hdr.len - IMSG_HEADER_SIZE !=
@@ -2698,6 +2691,14 @@ rde_l3vpn_import(struct rde_community *c
}
void
+rde_send_kroute_flush(struct rib *rib)
+{
+ if (imsg_compose(ibuf_main, IMSG_KROUTE_FLUSH, rib->rtableid, 0, -1,
+ NULL, 0) == -1)
+ fatal("%s %d imsg_compose error", __func__, __LINE__);
+}
+
+void
rde_send_kroute(struct rib *rib, struct prefix *new, struct prefix *old)
{
struct kroute_full kr;
@@ -3010,11 +3011,11 @@ rde_send_pftable_commit(void)
* nexthop specific functions
*/
void
-rde_send_nexthop(struct bgpd_addr *next, int valid)
+rde_send_nexthop(struct bgpd_addr *next, int insert)
{
int type;
- if (valid)
+ if (insert)
type = IMSG_NEXTHOP_ADD;
else
type = IMSG_NEXTHOP_REMOVE;
@@ -3038,13 +3039,6 @@ rde_reload_done(void)
softreconfig = 0;
- /* first merge the main config */
- if ((conf->flags & BGPD_FLAG_NO_EVALUATE) &&
- (nconf->flags & BGPD_FLAG_NO_EVALUATE) == 0) {
- log_warnx("disabling of route-collector mode ignored");
- nconf->flags |= BGPD_FLAG_NO_EVALUATE;
- }
-
SIMPLEQ_INIT(&prefixsets_old);
SIMPLEQ_INIT(&originsets_old);
SIMPLEQ_CONCAT(&prefixsets_old, &conf->rde_prefixsets);
@@ -3052,7 +3046,9 @@ rde_reload_done(void)
roa_old = conf->rde_roa;
as_sets_old = conf->as_sets;
+ /* merge the main config */
copy_config(conf, nconf);
+
/* need to copy the sets and roa table and clear them in nconf */
SIMPLEQ_CONCAT(&conf->rde_prefixsets, &nconf->rde_prefixsets);
SIMPLEQ_CONCAT(&conf->rde_originsets, &nconf->rde_originsets);
@@ -3129,6 +3125,7 @@ rde_reload_done(void)
rde_softreconfig_in_done, NULL) == -1)
fatal("%s: prefix_dump_new", __func__);
log_peer_info(&peer->conf, "flushing Adj-RIB-Out");
+ softreconfig++; /* account for the running flush */
continue;
}
if (!rde_filter_equal(out_rules, out_rules_tmp, peer)) {
@@ -3153,6 +3150,10 @@ rde_reload_done(void)
case RECONF_DELETE:
rib_free(&ribs[rid].rib);
break;
+ case RECONF_RELOAD:
+ rib_update(&ribs[rid].rib);
+ ribs[rid].state = RECONF_KEEP;
+ /* FALLTHROUGH */
case RECONF_KEEP:
if (rde_filter_equal(ribs[rid].in_rules,
ribs[rid].in_rules_tmp, NULL))
@@ -3164,12 +3165,10 @@ rde_reload_done(void)
reload++;
break;
case RECONF_REINIT:
+ /* new rib */
ribs[rid].state = RECONF_RELOAD;
reload++;
break;
- case RECONF_RELOAD:
- log_warnx("Bad rib reload state");
- /* FALLTHROUGH */
case RECONF_NONE:
break;
}
@@ -3205,12 +3204,29 @@ rde_softreconfig_in_done(void *arg, u_in
log_info("softreconfig in done");
}
- /* now do the Adj-RIB-Out sync */
+ /* now do the Adj-RIB-Out sync and a possible FIB sync */
softreconfig = 0;
for (rid = 0; rid < rib_size; rid++) {
if (!rib_valid(rid))
continue;
ribs[rid].state = RECONF_NONE;
+ if (ribs[rid].fibstate == RECONF_RELOAD) {
+ if (rib_dump_new(rid, AID_UNSPEC, RDE_RUNNER_ROUNDS,
+ &ribs[rid], rde_softreconfig_sync_fib,
+ rde_softreconfig_sync_done, NULL) == -1)
+ fatal("%s: rib_dump_new", __func__);
+ softreconfig++;
+ log_info("starting fib sync for rib %s",
+ ribs[rid].name);
+ } else if (ribs[rid].fibstate == RECONF_REINIT) {
+ if (rib_dump_new(rid, AID_UNSPEC, RDE_RUNNER_ROUNDS,
+ &ribs[rid], rde_softreconfig_sync_reeval,
+ rde_softreconfig_sync_done, NULL) == -1)
+ fatal("%s: rib_dump_new", __func__);
+ softreconfig++;
+ log_info("starting re-evaluation of rib %s",
+ ribs[rid].name);
+ }
}
LIST_FOREACH(peer, &peerlist, peer_l) {
@@ -3252,14 +3268,11 @@ rde_softreconfig_out_done(void *arg, u_i
struct rib_desc *rib = arg;
/* this RIB dump is done */
- softreconfig--;
log_info("softreconfig out done for %s", rib->name);
- /* but other dumps are still running */
- if (softreconfig > 0)
- return;
-
- rde_softreconfig_done();
+ /* check if other dumps are still running */
+ if (--softreconfig == 0)
+ rde_softreconfig_done();
}
static void
@@ -3364,6 +3377,68 @@ rde_softreconfig_out(struct rib_entry *r
}
}
+static void
+rde_softreconfig_sync_reeval(struct rib_entry *re, void *arg)
+{
+ struct prefix_list prefixes;
+ struct prefix *p, *next;
+ struct rib_desc *rd = arg;
+
+ if (rd->rib.flags & F_RIB_NOEVALUATE) {
+ /*
+ * evaluation process is turned off
+ * so remove all prefixes from adj-rib-out
+ * also unlink nexthop if it was linked
+ */
+ LIST_FOREACH(p, &re->prefix_h, entry.list.rib) {
+ if (p->flags & PREFIX_NEXTHOP_LINKED)
+ nexthop_unlink(p);
+ }
+ if (re->active) {
+ rde_generate_updates(re_rib(re), NULL, re->active);
+ re->active = NULL;
+ }
+ return;
+ }
+
+ /* evaluation process is turned on, so evaluate all prefixes again */
+ re->active = NULL;
+ prefixes = re->prefix_h;
+ LIST_INIT(&re->prefix_h);
+
+ LIST_FOREACH_SAFE(p, &prefixes, entry.list.rib, next) {
+ /* need to re-link the nexthop if not already linked */
+ if ((p->flags & PREFIX_NEXTHOP_LINKED) == 0)
+ nexthop_link(p);
+ LIST_REMOVE(p, entry.list.rib);
+ prefix_evaluate(p, re);
+ }
+}
+
+static void
+rde_softreconfig_sync_fib(struct rib_entry *re, void *bula)
+{
+ if (re->active)
+ rde_send_kroute(re_rib(re), re->active, NULL);
+}
+
+static void
+rde_softreconfig_sync_done(void *arg, u_int8_t aid)
+{
+ struct rib_desc *rd = arg;
+
+ /* this RIB dump is done */
+ if (rd->fibstate == RECONF_RELOAD)
+ log_info("fib sync done for %s", rd->name);
+ else
+ log_info("re-evaluation done for %s", rd->name);
+ rd->fibstate = RECONF_NONE;
+
+ /* check if other dumps are still running */
+ if (--softreconfig == 0)
+ rde_softreconfig_done();
+}
+
/*
* generic helper function
*/
@@ -3374,16 +3449,6 @@ rde_local_as(void)
}
int
-rde_noevaluate(void)
-{
- /* do not run while cleaning up */
- if (rde_quit)
- return (1);
-
- return (conf->flags & BGPD_FLAG_NO_EVALUATE);
-}
-
-int
rde_decisionflags(void)
{
return (conf->flags & BGPD_FLAG_DECISION_MASK);
@@ -3618,13 +3683,6 @@ peer_up(u_int32_t id, struct session_up
}
peer->state = PEER_UP;
-
- if (rde_noevaluate())
- /*
- * no need to dump the table to the peer, there are no active
- * prefixes anyway. This is a speed up hack.
- */
- return;
for (i = 0; i < AID_MAX; i++) {
if (peer->capa.mp[i])
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
retrieving revision 1.221
diff -u -p -r1.221 rde.h
--- rde.h 22 Jul 2019 07:32:16 -0000 1.221
+++ rde.h 25 Jul 2019 09:26:34 -0000
@@ -52,7 +52,9 @@ struct rib_entry {
struct rib {
struct rib_tree tree;
u_int rtableid;
+ u_int rtableid_tmp;
u_int16_t flags;
+ u_int16_t flags_tmp;
u_int16_t id;
};
@@ -65,7 +67,7 @@ struct rib_desc {
struct rib rib;
struct filter_head *in_rules;
struct filter_head *in_rules_tmp;
- enum reconf_action state;
+ enum reconf_action state, fibstate;
};
/*
@@ -356,6 +358,7 @@ int mrt_dump_v2_hdr(struct mrt *, struc
void mrt_dump_upcall(struct rib_entry *, void *);
/* rde.c */
+void rde_send_kroute_flush(struct rib *);
void rde_send_kroute(struct rib *, struct prefix *, struct prefix *);
void rde_send_nexthop(struct bgpd_addr *, int);
void rde_send_pftable(u_int16_t, struct bgpd_addr *,
@@ -365,7 +368,6 @@ void rde_send_pftable_commit(void);
void rde_generate_updates(struct rib *, struct prefix *,
struct prefix *);
u_int32_t rde_local_as(void);
-int rde_noevaluate(void);
int rde_decisionflags(void);
int rde_as4byte(struct rde_peer *);
struct rde_peer *peer_get(u_int32_t);
@@ -507,6 +509,7 @@ extern u_int16_t rib_size;
extern struct rib_desc *ribs;
struct rib *rib_new(char *, u_int, u_int16_t);
+void rib_update(struct rib *);
struct rib *rib_byid(u_int16_t);
u_int16_t rib_find(char *);
struct rib_desc *rib_desc(struct rib *);
Index: rde_decide.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
retrieving revision 1.76
diff -u -p -r1.76 rde_decide.c
--- rde_decide.c 22 Jul 2019 07:34:16 -0000 1.76
+++ rde_decide.c 25 Jul 2019 08:58:48 -0000
@@ -242,11 +242,20 @@ prefix_evaluate(struct prefix *p, struct
{
struct prefix *xp;
- if (re_rib(re)->flags & F_RIB_NOEVALUATE || rde_noevaluate()) {
+ if (re_rib(re)->flags & F_RIB_NOEVALUATE) {
/* decision process is turned off */
if (p != NULL)
LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib);
- re->active = NULL;
+ if (re->active) {
+ /*
+ * During reloads it is possible that the decision
+ * process is turned off but prefixes are still
+ * active. Clean up now to ensure that the RIB
+ * is consistant.
+ */
+ rde_generate_updates(re_rib(re), NULL, re->active);
+ re->active = NULL;
+ }
return;
}
Index: rde_prefix.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_prefix.c,v
retrieving revision 1.39
diff -u -p -r1.39 rde_prefix.c
--- rde_prefix.c 1 Jul 2019 07:07:08 -0000 1.39
+++ rde_prefix.c 24 Jul 2019 11:43:34 -0000
@@ -65,8 +65,15 @@ pt_init(void)
void
pt_shutdown(void)
{
+ struct pt_entry *pte;
if (!RB_EMPTY(&pttable))
log_debug("pt_shutdown: tree is not empty.");
+ RB_FOREACH(pte, pt_tree, &pttable) {
+ struct bgpd_addr addr;
+ pt_getaddr(pte, &addr);
+ log_debug(" %s/%d refcnt %d", log_addr(&addr),
+ pte->prefixlen, pte->refcnt);
+ }
}
void
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
retrieving revision 1.201
diff -u -p -r1.201 rde_rib.c
--- rde_rib.c 24 Jul 2019 20:54:54 -0000 1.201
+++ rde_rib.c 25 Jul 2019 09:04:20 -0000
@@ -178,6 +178,36 @@ rib_new(char *name, u_int rtableid, u_in
return (&ribs[id].rib);
}
+/*
+ * This function is only called when the FIB information of a RIB changed.
+ * It will flush the FIB if there was one previously and change the fibstate
+ * from RECONF_NONE (nothing to do) to either RECONF_RELOAD (reload the FIB)
+ * or RECONF_REINIT (rerun the route decision process for every element)
+ * depending on the new flags.
+ */
+void
+rib_update(struct rib *rib)
+{
+ struct rib_desc *rd = rib_desc(rib);
+
+ /* flush fib first if there was one */
+ if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
+ rde_send_kroute_flush(rib);
+
+ /* if no evaluate changes then a full reinit is needed */
+ if ((rib->flags & F_RIB_NOEVALUATE) !=
+ (rib->flags_tmp & F_RIB_NOEVALUATE))
+ rd->fibstate = RECONF_REINIT;
+
+ rib->flags = rib->flags_tmp;
+ rib->rtableid = rib->rtableid_tmp;
+
+ /* reload fib if there is no reinit pending and there will be a fib */
+ if (rd->fibstate != RECONF_REINIT &&
+ (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
+ rd->fibstate = RECONF_RELOAD;
+}
+
struct rib *
rib_byid(u_int16_t rid)
{
@@ -212,12 +242,20 @@ rib_desc(struct rib *rib)
void
rib_free(struct rib *rib)
{
- struct rib_desc *rd;
+ struct rib_desc *rd = rib_desc(rib);
struct rib_entry *re, *xre;
- struct prefix *p, *np;
+ struct prefix *p;
rib_dump_abort(rib->id);
+ /*
+ * flush the rib, disable route evaluation and fib sync to speed up
+ * the prefix removal. Nothing depends on this data anymore.
+ */
+ if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
+ rde_send_kroute_flush(rib);
+ rib->flags |= F_RIB_NOEVALUATE | F_RIB_NOFIB;
+
for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
xre = RB_NEXT(rib_tree, rib_tree(rib), re);
@@ -229,7 +267,6 @@ rib_free(struct rib *rib)
*/
while ((p = LIST_FIRST(&re->prefix_h))) {
struct rde_aspath *asp = prefix_aspath(p);
- np = LIST_NEXT(p, entry.list.rib);
if (asp && asp->pftableid) {
struct bgpd_addr addr;
@@ -239,13 +276,10 @@ rib_free(struct rib *rib)
p->pt->prefixlen, 1);
}
prefix_destroy(p);
- if (np == NULL)
- break;
}
}
if (rib->id <= RIB_LOC_START)
return; /* never remove the default ribs */
- rd = &ribs[rib->id];
filterlist_free(rd->in_rules_tmp);
filterlist_free(rd->in_rules);
memset(rd, 0, sizeof(struct rib_desc));
@@ -259,9 +293,18 @@ rib_shutdown(void)
for (id = 0; id < rib_size; id++) {
if (!rib_valid(id))
continue;
- if (!RB_EMPTY(rib_tree(&ribs[id].rib)))
+ if (!RB_EMPTY(rib_tree(&ribs[id].rib))) {
+ struct rib_entry *re;
log_warnx("%s: rib %s is not empty", __func__,
ribs[id].name);
+ RB_FOREACH(re, rib_tree, rib_tree(&ribs[id].rib)) {
+ struct bgpd_addr addr;
+ pt_getaddr(re->prefix, &addr);
+ log_debug(" %s/%d empty %d locked %d",
+ log_addr(&addr), re->prefix->prefixlen,
+ rib_empty(re), re_is_locked(re));
+ }
+ }
rib_free(&ribs[id].rib);
}
for (id = 0; id <= RIB_LOC_START; id++) {
@@ -1765,13 +1808,6 @@ nexthop_update(struct kroute_nexthop *ms
memcpy(&nh->nexthop_net, &msg->net,
sizeof(nh->nexthop_net));
nh->nexthop_netlen = msg->netlen;
-
- if (rde_noevaluate())
- /*
- * if the decision process is turned off there is no need
- * for the prefix list walk.
- */
- return;
nh->next_prefix = LIST_FIRST(&nh->prefix_h);
TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l);