There is currently a race in bgpd when multiple nexthops become invalid at
the same time. The problem is that the decision process may select an
alternative path whose nexthop is also no longer valid, because the process
that does all the adjustments has not reached that prefix yet.
The main issue here is that the RDE uses the true_nexthop to communicate
this change, but the true_nexthop is 0 or :: in this case.

This diff solves the issue by no longer using true_nexthop in the kroute
message; instead, the kroute code does the lookup itself. The state
in kroute is always up to date, so the system knows whether the nexthop is
valid and issues either a change or a remove depending on that.

With this, the RDE no longer uses the true_nexthop (it is only there to be
reported to bgpctl). It only cares about exit_nexthop, validity and the
connected network info. None of those should cause any problems during
nexthop flips.

-- 
:wq Claudio

Index: kroute.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/kroute.c,v
retrieving revision 1.288
diff -u -p -r1.288 kroute.c
--- kroute.c    10 Aug 2022 14:21:24 -0000      1.288
+++ kroute.c    11 Aug 2022 09:31:15 -0000
@@ -160,6 +160,7 @@ void                 kif_clear(void);
 
 int             kroute_validate(struct kroute *);
 int             kroute6_validate(struct kroute6 *);
+int             knexthop_true_nexthop(struct ktable *, struct kroute_full *);
 void            knexthop_validate(struct ktable *, struct knexthop *);
 void            knexthop_track(struct ktable *, u_short);
 void            knexthop_send_update(struct knexthop *);
@@ -453,6 +454,8 @@ kr_change(u_int rtableid, struct kroute_
                return (0);
        kf->flags |= F_BGPD;
        kf->priority = RTP_MINE;
+       if (!knexthop_true_nexthop(kt, kf))
+               return kroute_remove(kt, kf, 1);
        switch (kf->prefix.aid) {
        case AID_INET:
                return (kr4_change(kt, kf));
@@ -2122,6 +2125,51 @@ kroute6_validate(struct kroute6 *kr)
        }
 
        return (kif->k.nh_reachable);
+}
+
+int
+knexthop_true_nexthop(struct ktable *kt, struct kroute_full *kf)
+{
+       struct bgpd_addr gateway = { 0 };
+       struct knexthop *kn;
+       struct kroute   *kr;
+       struct kroute6  *kr6;
+
+       /*
+        * Ignore the nexthop for VPN routes. The gateway is forced
+        * to an mpe(4) interface route using an MPLS label.
+        */
+       switch (kf->prefix.aid) {
+       case AID_VPN_IPv4:
+       case AID_VPN_IPv6:
+               return 1;
+       }
+
+       kn = knexthop_find(kt, &kf->nexthop);
+       if (kn == NULL) {
+               log_warnx("%s: nexthop %s not found", __func__,
+                   log_addr(&kf->nexthop));
+               return 0;
+       }
+       if (kn->kroute == NULL)
+               return 0;       /* no kroute attached to this nexthop */
+
+       switch (kn->nexthop.aid) {
+       case AID_INET:
+               kr = kn->kroute;
+               gateway.aid = AID_INET;
+               gateway.v4.s_addr = kr->nexthop.s_addr;
+               break;
+       case AID_INET6:
+               kr6 = kn->kroute;
+               gateway.aid = AID_INET6;
+               memcpy(&gateway.v6, &kr6->nexthop, sizeof(struct in6_addr));
+               gateway.scope_id = kr6->nexthop_scope_id;
+               break;
+       }       /* NOTE(review): no default; other AIDs leave gateway zeroed */
+
+       kf->nexthop = gateway;  /* overwrite kf's nexthop with the kroute's */
+       return 1;
 }
 
 void
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.562
diff -u -p -r1.562 rde.c
--- rde.c       10 Aug 2022 14:17:01 -0000      1.562
+++ rde.c       11 Aug 2022 09:03:03 -0000
@@ -3056,8 +3056,7 @@ rde_send_kroute(struct rib *rib, struct 
                        kf.flags |= F_REJECT;
                if (prefix_nhflags(p) == NEXTHOP_BLACKHOLE)
                        kf.flags |= F_BLACKHOLE;
-               memcpy(&kf.nexthop, &prefix_nexthop(p)->true_nexthop,
-                   sizeof(kf.nexthop));
+               kf.nexthop = prefix_nexthop(p)->exit_nexthop;
                strlcpy(kf.label, rtlabel_id2name(prefix_aspath(p)->rtlabelid),
                    sizeof(kf.label));
        }
@@ -3072,13 +3071,6 @@ rde_send_kroute(struct rib *rib, struct 
                SIMPLEQ_FOREACH(vpn, &conf->l3vpns, entry) {
                        if (!rde_l3vpn_import(prefix_communities(p), vpn))
                                continue;
-                       /* must send exit_nexthop so that correct MPLS tunnel
-                        * is chosen
-                        */
-                       if (type == IMSG_KROUTE_CHANGE)
-                               memcpy(&kf.nexthop,
-                                   &prefix_nexthop(p)->exit_nexthop,
-                                   sizeof(kf.nexthop));
                        /* XXX not ideal but this will change */
                        kf.ifindex = if_nametoindex(vpn->ifmpe);
                        if (imsg_compose(ibuf_main, type, vpn->rtableid, 0, -1,

Reply via email to