Claudio Jeker(cje...@diehard.n-r-g.com) on 2019.06.17 21:34:30 +0200:
> Hi,
> 
> Now that the community rewrite is in here another diff to make bgpd more
> awesome. There was one loop left that did not run asynchronous compared to
> the main event loop. This loop was in nexthop_update() now a route server
> or a big route reflector would be hit hard by this loop since it will send
> out many updates when a nexthop becomes valid. I have seen times where the
> RDE was busy for 15min in nexthop_update().
> 
> This diff changes nexthop_update() to do this work asynchronous and
> therefor returns to the main event loop much more often.
> Nexthops now have a prefix pointer next_prefix which is used for looping
> through all prefixes. The for loop is broken up in RDE_RUNNER_ROUNDS steps
> which should hopefully return quickly.
> 
> Additionally prefixes belonging to a RIB that is not evaluated (no
> decision process running) are no longer linked into this nexthop list
> which should also shorten the loop a bit (at least Adj-RIB-In and
> Adj-RIB-Out are now skipped). To make this work correctly the linking and
> unlinking of prefixes has been adjusted to always work in the right way
> (nexhop last on link and first on unlink).

reads ok, one comment.

> When testing please watch out for this message:
> prefix_updateall: prefix with F_RIB_NOEVALUATE hit
> This should never happen.
> 
> I'm running this on a few routers with good success. Please check if you see
> a) worse convergence time (updates propagating much slower)
> b) better responsiveness (to e.g. bgpctl commands)

i see things much faster, actually too fast, i cant meaningdul test bgpctl
responsiveness.

> There is some potential for tuning but I would prefer to do this at a
> later stage.

sure. one question about that: is the TAILQ_INSERT_TAIL/TAILQ_INSERT_HEAD
optimal?

> -- 
> :wq Claudio
> 
> Index: rde.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
> retrieving revision 1.470
> diff -u -p -r1.470 rde.c
> --- rde.c     17 Jun 2019 13:35:43 -0000      1.470
> +++ rde.c     17 Jun 2019 13:56:16 -0000
> @@ -256,9 +256,6 @@ rde_main(int debug, int verbose)
>               set_pollfd(&pfd[PFD_PIPE_SESSION], ibuf_se);
>               set_pollfd(&pfd[PFD_PIPE_SESSION_CTL], ibuf_se_ctl);
>  
> -             if (rib_dump_pending() || rde_update_queue_pending())
> -                     timeout = 0;
> -
>               i = PFD_PIPE_COUNT;
>               for (mctx = LIST_FIRST(&rde_mrts); mctx != 0; mctx = xmctx) {
>                       xmctx = LIST_NEXT(mctx, entry);
> @@ -275,6 +272,10 @@ rde_main(int debug, int verbose)
>                       }
>               }
>  
> +             if (rib_dump_pending() || rde_update_queue_pending() ||
> +                 nexthop_pending())
> +                     timeout = 0;
> +
>               if (poll(pfd, i, timeout) == -1) {
>                       if (errno != EINTR)
>                               fatal("poll error");
> @@ -311,12 +312,13 @@ rde_main(int debug, int verbose)
>                       mctx = LIST_NEXT(mctx, entry);
>               }
>  
> +             rib_dump_runner();
> +             nexthop_runner();
>               if (ibuf_se && ibuf_se->w.queued < SESS_MSG_HIGH_MARK) {
>                       rde_update_queue_runner();
>                       for (aid = AID_INET6; aid < AID_MAX; aid++)
>                               rde_update6_queue_runner(aid);
>               }
> -             rib_dump_runner();
>       }
>  
>       /* do not clean up on shutdown on production, it takes ages. */
> Index: rde.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
> retrieving revision 1.214
> diff -u -p -r1.214 rde.h
> --- rde.h     17 Jun 2019 13:35:43 -0000      1.214
> +++ rde.h     17 Jun 2019 13:56:16 -0000
> @@ -228,12 +228,15 @@ struct rde_aspath {
>  enum nexthop_state {
>       NEXTHOP_LOOKUP,
>       NEXTHOP_UNREACH,
> -     NEXTHOP_REACH
> +     NEXTHOP_REACH,
> +     NEXTHOP_FLAPPED
>  };
>  
>  struct nexthop {
>       LIST_ENTRY(nexthop)     nexthop_l;
> +     TAILQ_ENTRY(nexthop)    runner_l;
>       struct prefix_list      prefix_h;
> +     struct prefix           *next_prefix;
>       struct bgpd_addr        exit_nexthop;
>       struct bgpd_addr        true_nexthop;
>       struct bgpd_addr        nexthop_net;
> @@ -246,6 +249,7 @@ struct nexthop {
>  #endif
>       int                     refcnt;
>       enum nexthop_state      state;
> +     enum nexthop_state      oldstate;
>       u_int8_t                nexthop_netlen;
>       u_int8_t                flags;
>  #define NEXTHOP_CONNECTED    0x01
> @@ -593,6 +597,8 @@ prefix_vstate(struct prefix *p)
>  
>  void          nexthop_init(u_int32_t);
>  void          nexthop_shutdown(void);
> +int           nexthop_pending(void);
> +void          nexthop_runner(void);
>  void          nexthop_modify(struct nexthop *, enum action_types, u_int8_t,
>                   struct nexthop **, u_int8_t *);
>  void          nexthop_link(struct prefix *);
> Index: rde_rib.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
> retrieving revision 1.191
> diff -u -p -r1.191 rde_rib.c
> --- rde_rib.c 17 Jun 2019 11:02:20 -0000      1.191
> +++ rde_rib.c 17 Jun 2019 19:10:07 -0000
> @@ -517,13 +517,15 @@ path_empty(struct rde_aspath *asp)
>       return (asp == NULL || asp->refcnt <= 0);
>  }
>  
> -static inline void
> +static inline struct rde_aspath *
>  path_ref(struct rde_aspath *asp)
>  {
>       if ((asp->flags & F_ATTR_LINKED) == 0)
>               fatalx("%s: unlinked object", __func__);
>       asp->refcnt++;
>       rdemem.path_refs++;
> +
> +     return asp;
>  }
>  
>  static inline void
> @@ -955,19 +957,16 @@ prefix_move(struct prefix *p, struct rde
>               fatalx("prefix_move: cross peer move");
>  
>       np = prefix_alloc();
> -     np->aspath = asp;
> -     np->communities = comm;
> -     np->nexthop = nexthop_ref(nexthop);
> -     nexthop_link(np);
> +     /* add reference to new AS path and communities */
> +     np->aspath = path_ref(asp);
> +     np->communities = communities_get(comm);
>       np->peer = peer;
>       np->re = p->re;
> -     np->lastchange = time(NULL);
> -     np->nhflags = nhflags;
>       np->validation_state = vstate;
> -
> -     /* add reference to new AS path and communities */
> -     path_ref(asp);
> -     communities_get(comm);
> +     np->nhflags = nhflags;
> +     np->nexthop = nexthop_ref(nexthop);
> +     nexthop_link(np);
> +     np->lastchange = time(NULL);
>  
>       /*
>        * no need to update the peer prefix count because we are only moving
> @@ -988,18 +987,18 @@ prefix_move(struct prefix *p, struct rde
>       /* remove old prefix node */
>       /* as before peer count needs no update because of move */
>       oasp = p->aspath;
> -     if (oasp)
> -             path_unref(oasp);
>  
>       /* destroy all references to other objects and free the old prefix */
> -     p->aspath = NULL;
> +     nexthop_unlink(p);
> +     nexthop_put(p->nexthop);
>       communities_put(p->communities);
> +     if (oasp)
> +             path_unref(oasp);
>       p->communities = NULL;
> +     p->nexthop = NULL;
> +     p->aspath = NULL;
>       p->peer = NULL;
>       p->re = NULL;
> -     nexthop_unlink(p);
> -     nexthop_put(p->nexthop);
> -     p->nexthop = NULL;
>       prefix_free(p);
>  
>       /* destroy old path if empty */
> @@ -1252,19 +1251,23 @@ prefix_updateall(struct prefix *p, enum 
>      enum nexthop_state oldstate)
>  {
>       /* Skip non local-RIBs or RIBs that are flagged as noeval. */
> -     if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
> +     if (re_rib(p->re)->flags & F_RIB_NOEVALUATE) {
> +             log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__);
>               return;
> +     }
>  
> -     if (oldstate == state && state == NEXTHOP_REACH) {
> +     if (oldstate == state) {
>               /*
>                * The state of the nexthop did not change. The only
>                * thing that may have changed is the true_nexthop
>                * or other internal infos. This will not change
>                * the routing decision so shortcut here.
>                */
> -             if ((re_rib(p->re)->flags & F_RIB_NOFIB) == 0 &&
> -                 p == p->re->active)
> -                     rde_send_kroute(re_rib(p->re), p, NULL);
> +             if (state == NEXTHOP_REACH) {
> +                     if ((re_rib(p->re)->flags & F_RIB_NOFIB) == 0 &&
> +                         p == p->re->active)
> +                             rde_send_kroute(re_rib(p->re), p, NULL);
> +             }
>               return;
>       }
>  
> @@ -1305,17 +1308,15 @@ prefix_link(struct prefix *p, struct rib
>      struct rde_aspath *asp, struct rde_community *comm,
>      struct nexthop *nexthop, u_int8_t nhflags, u_int8_t vstate)
>  {
> -     path_ref(asp);
> -
> -     p->aspath = asp;
> -     p->peer = peer;
> +     p->aspath = path_ref(asp);
>       p->communities = communities_get(comm);
> +     p->peer = peer;
> +     p->re = re;
> +     p->validation_state = vstate;
> +     p->nhflags = nhflags;
>       p->nexthop = nexthop_ref(nexthop);
>       nexthop_link(p);
> -     p->re = re;
>       p->lastchange = time(NULL);
> -     p->nhflags = nhflags;
> -     p->validation_state = vstate;
>  
>       /* make route decision */
>       prefix_evaluate(p, re);
> @@ -1336,22 +1337,21 @@ prefix_unlink(struct prefix *p)
>       LIST_REMOVE(p, rib_l);
>       prefix_evaluate(NULL, re);
>  
> -     if (p->aspath)
> -             path_unref(p->aspath);
> -
> -     if (rib_empty(re))
> -             rib_remove(re);
> -
>       /* destroy all references to other objects */
> -     communities_put(p->communities);
>       nexthop_unlink(p);
>       nexthop_put(p->nexthop);
> +     communities_put(p->communities);
> +     if (p->aspath)
> +             path_unref(p->aspath);
>       p->communities = NULL;
>       p->nexthop = NULL;
>       p->aspath = NULL;
>       p->peer = NULL;
>       p->re = NULL;
>  
> +     if (rib_empty(re))
> +             rib_remove(re);
> +
>       /*
>        * It's the caller's duty to do accounting and remove empty aspath
>        * structures. Also freeing the unlinked prefix is the caller's duty.
> @@ -1401,6 +1401,8 @@ struct nexthop_table {
>  
>  SIPHASH_KEY nexthoptablekey;
>  
> +TAILQ_HEAD(nexthop_queue, nexthop)   nexthop_runners;
> +
>  void
>  nexthop_init(u_int32_t hashsize)
>  {
> @@ -1412,6 +1414,7 @@ nexthop_init(u_int32_t hashsize)
>       if (nexthoptable.nexthop_hashtbl == NULL)
>               fatal("nextop_init");
>  
> +     TAILQ_INIT(&nexthop_runners);
>       for (i = 0; i < hs; i++)
>               LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);
>       arc4random_buf(&nexthoptablekey, sizeof(nexthoptablekey));
> @@ -1443,12 +1446,45 @@ nexthop_shutdown(void)
>       free(nexthoptable.nexthop_hashtbl);
>  }
>  
> +int
> +nexthop_pending(void)
> +{
> +     return !TAILQ_EMPTY(&nexthop_runners);
> +}
> +
> +void
> +nexthop_runner(void)
> +{
> +     struct nexthop *nh;
> +     struct prefix *p;
> +     u_int32_t j;
> +
> +     nh = TAILQ_FIRST(&nexthop_runners);
> +     if (nh == NULL)
> +             return;
> +
> +     /* remove from runnner queue */
> +     TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
> +
> +     p = nh->next_prefix;
> +     for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
> +             prefix_updateall(p, nh->state, nh->oldstate);
> +             p = LIST_NEXT(p, nexthop_l);
> +     }
> +
> +     /* prep for next run, if not finished readd to tail of queue */
> +     nh->next_prefix = p;
> +     if (p != NULL)
> +             TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l);
> +     else
> +             log_debug("nexthop %s update finished",
> +                 log_addr(&nh->exit_nexthop));
> +}
> +
>  void
>  nexthop_update(struct kroute_nexthop *msg)
>  {
>       struct nexthop          *nh;
> -     struct prefix           *p;
> -     enum nexthop_state       oldstate;
>  
>       nh = nexthop_lookup(&msg->nexthop);
>       if (nh == NULL) {
> @@ -1457,7 +1493,20 @@ nexthop_update(struct kroute_nexthop *ms
>               return;
>       }
>  
> -     oldstate = nh->state;
> +     nh->oldstate = nh->state;
> +     if (nh->oldstate == NEXTHOP_LOOKUP)
> +             /* drop reference which was hold during the lookup */
> +             if (nexthop_put(nh))
> +                     return;         /* nh lost last ref, no work left */  

trailing whitespace

> +
> +     if (nh->next_prefix) {
> +             /*
> +              * If nexthop_runner() is not finished with this nexthop
> +              * then ensure that all prefixes are updated by setting
> +              * the oldstate to NEXTHOP_FLAPPED.
> +              */
> +             nh->oldstate = NEXTHOP_FLAPPED;
> +     }
>  
>       if (msg->valid)
>               nh->state = NEXTHOP_REACH;
> @@ -1476,21 +1525,16 @@ nexthop_update(struct kroute_nexthop *ms
>           sizeof(nh->nexthop_net));
>       nh->nexthop_netlen = msg->netlen;
>  
> -     if (oldstate == NEXTHOP_LOOKUP)
> -             /* drop reference which was hold during the lookup */
> -             if (nexthop_put(nh))
> -                     return;
> -
>       if (rde_noevaluate())
>               /*
>                * if the decision process is turned off there is no need
> -              * for the aspath list walk.
> +              * for the prefix list walk.
>                */
>               return;
>  
> -     LIST_FOREACH(p, &nh->prefix_h, nexthop_l) {
> -             prefix_updateall(p, nh->state, oldstate);
> -     }
> +     nh->next_prefix = LIST_FIRST(&nh->prefix_h);
> +     TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l);
> +     log_debug("nexthop %s update starting", log_addr(&nh->exit_nexthop));
>  }
>  
>  void
> @@ -1532,6 +1576,10 @@ nexthop_link(struct prefix *p)
>       if (p->nexthop == NULL)
>               return;
>  
> +     /* no need to link prefixes in RIBs that have no decision process */
> +     if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
> +             return;
> +
>       LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, nexthop_l);
>  }
>  
> @@ -1541,6 +1589,12 @@ nexthop_unlink(struct prefix *p)
>       if (p->nexthop == NULL)
>               return;
>  
> +     if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
> +             return;
> +
> +     if (p == p->nexthop->next_prefix)
> +             p->nexthop->next_prefix = LIST_NEXT(p, nexthop_l);
> +
>       LIST_REMOVE(p, nexthop_l);
>  }
>  
> @@ -1588,7 +1642,11 @@ nexthop_put(struct nexthop *nh)
>  
>       /* sanity check */
>       if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP)
> -             fatalx("nexthop_put: refcnt error");
> +             fatalx("%s: refcnt error", __func__);
> +
> +     /* is nexthop update running? impossible, that is a refcnt error */
> +     if (nh->next_prefix)
> +             fatalx("%s: next_prefix not NULL", __func__);
>  
>       LIST_REMOVE(nh, nexthop_l);
>       rde_send_nexthop(&nh->exit_nexthop, 0);
> 

Reply via email to