Claudio Jeker(cje...@diehard.n-r-g.com) on 2019.06.17 21:34:30 +0200: > Hi, > > Now that the community rewrite is in here another diff to make bgpd more > awesome. There was one loop left that did not run asynchronous compared to > the main event loop. This loop was in nexthop_update() now a route server > or a big route reflector would be hit hard by this loop since it will send > out many updates when a nexthop becomes valid. I have seen times where the > RDE was busy for 15min in nexthop_update(). > > This diff changes nexthop_update() to do this work asynchronous and > therefor returns to the main event loop much more often. > Nexthops now have a prefix pointer next_prefix which is used for looping > through all prefixes. The for loop is broken up in RDE_RUNNER_ROUNDS steps > which should hopefully return quickly. > > Additionally prefixes belonging to a RIB that is not evaluated (no > decision process running) are no longer linked into this nexthop list > which should also shorten the loop a bit (at least Adj-RIB-In and > Adj-RIB-Out are now skipped). To make this work correctly the linking and > unlinking of prefixes has been adjusted to always work in the right way > (nexhop last on link and first on unlink).
reads ok, one comment. > When testing please watch out for this message: > prefix_updateall: prefix with F_RIB_NOEVALUATE hit > This should never happen. > > I'm running this on a few routers with good success. Please check if you see > a) worse convergence time (updates propagating much slower) > b) better responsiveness (to e.g. bgpctl commands) i see things much faster, actually too fast, i cant meaningdul test bgpctl responsiveness. > There is some potential for tuning but I would prefer to do this at a > later stage. sure. one question about that: is the TAILQ_INSERT_TAIL/TAILQ_INSERT_HEAD optimal? > -- > :wq Claudio > > Index: rde.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v > retrieving revision 1.470 > diff -u -p -r1.470 rde.c > --- rde.c 17 Jun 2019 13:35:43 -0000 1.470 > +++ rde.c 17 Jun 2019 13:56:16 -0000 > @@ -256,9 +256,6 @@ rde_main(int debug, int verbose) > set_pollfd(&pfd[PFD_PIPE_SESSION], ibuf_se); > set_pollfd(&pfd[PFD_PIPE_SESSION_CTL], ibuf_se_ctl); > > - if (rib_dump_pending() || rde_update_queue_pending()) > - timeout = 0; > - > i = PFD_PIPE_COUNT; > for (mctx = LIST_FIRST(&rde_mrts); mctx != 0; mctx = xmctx) { > xmctx = LIST_NEXT(mctx, entry); > @@ -275,6 +272,10 @@ rde_main(int debug, int verbose) > } > } > > + if (rib_dump_pending() || rde_update_queue_pending() || > + nexthop_pending()) > + timeout = 0; > + > if (poll(pfd, i, timeout) == -1) { > if (errno != EINTR) > fatal("poll error"); > @@ -311,12 +312,13 @@ rde_main(int debug, int verbose) > mctx = LIST_NEXT(mctx, entry); > } > > + rib_dump_runner(); > + nexthop_runner(); > if (ibuf_se && ibuf_se->w.queued < SESS_MSG_HIGH_MARK) { > rde_update_queue_runner(); > for (aid = AID_INET6; aid < AID_MAX; aid++) > rde_update6_queue_runner(aid); > } > - rib_dump_runner(); > } > > /* do not clean up on shutdown on production, it takes ages. */ > Index: rde.h > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v > retrieving revision 1.214 > diff -u -p -r1.214 rde.h > --- rde.h 17 Jun 2019 13:35:43 -0000 1.214 > +++ rde.h 17 Jun 2019 13:56:16 -0000 > @@ -228,12 +228,15 @@ struct rde_aspath { > enum nexthop_state { > NEXTHOP_LOOKUP, > NEXTHOP_UNREACH, > - NEXTHOP_REACH > + NEXTHOP_REACH, > + NEXTHOP_FLAPPED > }; > > struct nexthop { > LIST_ENTRY(nexthop) nexthop_l; > + TAILQ_ENTRY(nexthop) runner_l; > struct prefix_list prefix_h; > + struct prefix *next_prefix; > struct bgpd_addr exit_nexthop; > struct bgpd_addr true_nexthop; > struct bgpd_addr nexthop_net; > @@ -246,6 +249,7 @@ struct nexthop { > #endif > int refcnt; > enum nexthop_state state; > + enum nexthop_state oldstate; > u_int8_t nexthop_netlen; > u_int8_t flags; > #define NEXTHOP_CONNECTED 0x01 > @@ -593,6 +597,8 @@ prefix_vstate(struct prefix *p) > > void nexthop_init(u_int32_t); > void nexthop_shutdown(void); > +int nexthop_pending(void); > +void nexthop_runner(void); > void nexthop_modify(struct nexthop *, enum action_types, u_int8_t, > struct nexthop **, u_int8_t *); > void nexthop_link(struct prefix *); > Index: rde_rib.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v > retrieving revision 1.191 > diff -u -p -r1.191 rde_rib.c > --- rde_rib.c 17 Jun 2019 11:02:20 -0000 1.191 > +++ rde_rib.c 17 Jun 2019 19:10:07 -0000 > @@ -517,13 +517,15 @@ path_empty(struct rde_aspath *asp) > return (asp == NULL || asp->refcnt <= 0); > } > > -static inline void > +static inline struct rde_aspath * > path_ref(struct rde_aspath *asp) > { > if ((asp->flags & F_ATTR_LINKED) == 0) > fatalx("%s: unlinked object", __func__); > asp->refcnt++; > rdemem.path_refs++; > + > + return asp; > } > > static inline void > @@ -955,19 +957,16 @@ prefix_move(struct prefix *p, struct rde > fatalx("prefix_move: cross peer move"); > > np = prefix_alloc(); > - np->aspath = asp; > - np->communities = comm; > - np->nexthop = nexthop_ref(nexthop); > - nexthop_link(np); > + /* add reference to new AS path and communities */ > + np->aspath = path_ref(asp); > + np->communities = communities_get(comm); > np->peer = peer; > np->re = p->re; > - np->lastchange = time(NULL); > - np->nhflags = nhflags; > np->validation_state = vstate; > - > - /* add reference to new AS path and communities */ > - path_ref(asp); > - communities_get(comm); > + np->nhflags = nhflags; > + np->nexthop = nexthop_ref(nexthop); > + nexthop_link(np); > + np->lastchange = time(NULL); > > /* > * no need to update the peer prefix count because we are only moving > @@ -988,18 +987,18 @@ prefix_move(struct prefix *p, struct rde > /* remove old prefix node */ > /* as before peer count needs no update because of move */ > oasp = p->aspath; > - if (oasp) > - path_unref(oasp); > > /* destroy all references to other objects and free the old prefix */ > - p->aspath = NULL; > + nexthop_unlink(p); > + nexthop_put(p->nexthop); > communities_put(p->communities); > + if (oasp) > + path_unref(oasp); > p->communities = NULL; > + p->nexthop = NULL; > + p->aspath = NULL; > p->peer = NULL; > p->re = NULL; > - nexthop_unlink(p); > - nexthop_put(p->nexthop); > - p->nexthop = NULL; > prefix_free(p); > > /* destroy old path if empty */ > @@ -1252,19 +1251,23 @@ prefix_updateall(struct prefix *p, enum > enum nexthop_state oldstate) > { > /* Skip non local-RIBs or RIBs that are flagged as noeval. */ > - if (re_rib(p->re)->flags & F_RIB_NOEVALUATE) > + if (re_rib(p->re)->flags & F_RIB_NOEVALUATE) { > + log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__); > return; > + } > > - if (oldstate == state && state == NEXTHOP_REACH) { > + if (oldstate == state) { > /* > * The state of the nexthop did not change. The only > * thing that may have changed is the true_nexthop > * or other internal infos. This will not change > * the routing decision so shortcut here. > */ > - if ((re_rib(p->re)->flags & F_RIB_NOFIB) == 0 && > - p == p->re->active) > - rde_send_kroute(re_rib(p->re), p, NULL); > + if (state == NEXTHOP_REACH) { > + if ((re_rib(p->re)->flags & F_RIB_NOFIB) == 0 && > + p == p->re->active) > + rde_send_kroute(re_rib(p->re), p, NULL); > + } > return; > } > > @@ -1305,17 +1308,15 @@ prefix_link(struct prefix *p, struct rib > struct rde_aspath *asp, struct rde_community *comm, > struct nexthop *nexthop, u_int8_t nhflags, u_int8_t vstate) > { > - path_ref(asp); > - > - p->aspath = asp; > - p->peer = peer; > + p->aspath = path_ref(asp); > p->communities = communities_get(comm); > + p->peer = peer; > + p->re = re; > + p->validation_state = vstate; > + p->nhflags = nhflags; > p->nexthop = nexthop_ref(nexthop); > nexthop_link(p); > - p->re = re; > p->lastchange = time(NULL); > - p->nhflags = nhflags; > - p->validation_state = vstate; > > /* make route decision */ > prefix_evaluate(p, re); > @@ -1336,22 +1337,21 @@ prefix_unlink(struct prefix *p) > LIST_REMOVE(p, rib_l); > prefix_evaluate(NULL, re); > > - if (p->aspath) > - path_unref(p->aspath); > - > - if (rib_empty(re)) > - rib_remove(re); > - > /* destroy all references to other objects */ > - communities_put(p->communities); > nexthop_unlink(p); > nexthop_put(p->nexthop); > + communities_put(p->communities); > + if (p->aspath) > + path_unref(p->aspath); > p->communities = NULL; > p->nexthop = NULL; > p->aspath = NULL; > p->peer = NULL; > p->re = NULL; > > + if (rib_empty(re)) > + rib_remove(re); > + > /* > * It's the caller's duty to do accounting and remove empty aspath > * structures. Also freeing the unlinked prefix is the caller's duty. > @@ -1401,6 +1401,8 @@ struct nexthop_table { > > SIPHASH_KEY nexthoptablekey; > > +TAILQ_HEAD(nexthop_queue, nexthop) nexthop_runners; > + > void > nexthop_init(u_int32_t hashsize) > { > @@ -1412,6 +1414,7 @@ nexthop_init(u_int32_t hashsize) > if (nexthoptable.nexthop_hashtbl == NULL) > fatal("nextop_init"); > > + TAILQ_INIT(&nexthop_runners); > for (i = 0; i < hs; i++) > LIST_INIT(&nexthoptable.nexthop_hashtbl[i]); > arc4random_buf(&nexthoptablekey, sizeof(nexthoptablekey)); > @@ -1443,12 +1446,45 @@ nexthop_shutdown(void) > free(nexthoptable.nexthop_hashtbl); > } > > +int > +nexthop_pending(void) > +{ > + return !TAILQ_EMPTY(&nexthop_runners); > +} > + > +void > +nexthop_runner(void) > +{ > + struct nexthop *nh; > + struct prefix *p; > + u_int32_t j; > + > + nh = TAILQ_FIRST(&nexthop_runners); > + if (nh == NULL) > + return; > + > + /* remove from runnner queue */ > + TAILQ_REMOVE(&nexthop_runners, nh, runner_l); > + > + p = nh->next_prefix; > + for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) { > + prefix_updateall(p, nh->state, nh->oldstate); > + p = LIST_NEXT(p, nexthop_l); > + } > + > + /* prep for next run, if not finished readd to tail of queue */ > + nh->next_prefix = p; > + if (p != NULL) > + TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l); > + else > + log_debug("nexthop %s update finished", > + log_addr(&nh->exit_nexthop)); > +} > + > void > nexthop_update(struct kroute_nexthop *msg) > { > struct nexthop *nh; > - struct prefix *p; > - enum nexthop_state oldstate; > > nh = nexthop_lookup(&msg->nexthop); > if (nh == NULL) { > @@ -1457,7 +1493,20 @@ nexthop_update(struct kroute_nexthop *ms > return; > } > > - oldstate = nh->state; > + nh->oldstate = nh->state; > + if (nh->oldstate == NEXTHOP_LOOKUP) > + /* drop reference which was hold during the lookup */ > + if (nexthop_put(nh)) > + return; /* nh lost last ref, no work left */ trailing whitespace > + > + if (nh->next_prefix) { > + /* > + * If nexthop_runner() is not finished with this nexthop > + * then ensure that all prefixes are updated by setting > + * the oldstate to NEXTHOP_FLAPPED. > + */ > + nh->oldstate = NEXTHOP_FLAPPED; > + } > > if (msg->valid) > nh->state = NEXTHOP_REACH; > @@ -1476,21 +1525,16 @@ nexthop_update(struct kroute_nexthop *ms > sizeof(nh->nexthop_net)); > nh->nexthop_netlen = msg->netlen; > > - if (oldstate == NEXTHOP_LOOKUP) > - /* drop reference which was hold during the lookup */ > - if (nexthop_put(nh)) > - return; > - > if (rde_noevaluate()) > /* > * if the decision process is turned off there is no need > - * for the aspath list walk. > + * for the prefix list walk. > */ > return; > > - LIST_FOREACH(p, &nh->prefix_h, nexthop_l) { > - prefix_updateall(p, nh->state, oldstate); > - } > + nh->next_prefix = LIST_FIRST(&nh->prefix_h); > + TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l); > + log_debug("nexthop %s update starting", log_addr(&nh->exit_nexthop)); > } > > void > @@ -1532,6 +1576,10 @@ nexthop_link(struct prefix *p) > if (p->nexthop == NULL) > return; > > + /* no need to link prefixes in RIBs that have no decision process */ > + if (re_rib(p->re)->flags & F_RIB_NOEVALUATE) > + return; > + > LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, nexthop_l); > } > > @@ -1541,6 +1589,12 @@ nexthop_unlink(struct prefix *p) > if (p->nexthop == NULL) > return; > > + if (re_rib(p->re)->flags & F_RIB_NOEVALUATE) > + return; > + > + if (p == p->nexthop->next_prefix) > + p->nexthop->next_prefix = LIST_NEXT(p, nexthop_l); > + > LIST_REMOVE(p, nexthop_l); > } > > @@ -1588,7 +1642,11 @@ nexthop_put(struct nexthop *nh) > > /* sanity check */ > if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP) > - fatalx("nexthop_put: refcnt error"); > + fatalx("%s: refcnt error", __func__); > + > + /* is nexthop update running? impossible, that is a refcnt error */ > + if (nh->next_prefix) > + fatalx("%s: next_prefix not NULL", __func__); > > LIST_REMOVE(nh, nexthop_l); > rde_send_nexthop(&nh->exit_nexthop, 0); >