There is currently a race in bgpd when multiple nexthops become invalid at the same time. The problem is that the decision process may select an alternative path whose nexthop is also no longer valid, because the process that does all the adjustments has not reached that prefix yet. The main issue here is that the RDE uses the true_nexthop to communicate this change, but the true_nexthop is 0 or :: in this case.
This diff solves the issue by no longer using true_nexthop in the kroute message and instead having the kroute code do the lookup. The state in kroute is always up to date, so the system knows whether the nexthop is valid and issues either a change or a remove depending on that. With this the RDE no longer uses the true_nexthop (it is only there to be reported to bgpctl). It only cares about exit_nexthop, validity and the connected network info. None of those should cause any problems during nexthop flips. -- :wq Claudio Index: kroute.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/kroute.c,v retrieving revision 1.288 diff -u -p -r1.288 kroute.c --- kroute.c 10 Aug 2022 14:21:24 -0000 1.288 +++ kroute.c 11 Aug 2022 09:31:15 -0000 @@ -160,6 +160,7 @@ void kif_clear(void); int kroute_validate(struct kroute *); int kroute6_validate(struct kroute6 *); +int knexthop_true_nexthop(struct ktable *, struct kroute_full *); void knexthop_validate(struct ktable *, struct knexthop *); void knexthop_track(struct ktable *, u_short); void knexthop_send_update(struct knexthop *); @@ -453,6 +454,8 @@ kr_change(u_int rtableid, struct kroute_ return (0); kf->flags |= F_BGPD; kf->priority = RTP_MINE; + if (!knexthop_true_nexthop(kt, kf)) + return kroute_remove(kt, kf, 1); switch (kf->prefix.aid) { case AID_INET: return (kr4_change(kt, kf)); @@ -2122,6 +2125,51 @@ kroute6_validate(struct kroute6 *kr) } return (kif->k.nh_reachable); +} + +int +knexthop_true_nexthop(struct ktable *kt, struct kroute_full *kf) +{ + struct bgpd_addr gateway = { 0 }; + struct knexthop *kn; + struct kroute *kr; + struct kroute6 *kr6; + + /* + * Ignore the nexthop for VPN routes. The gateway is a forced + * to an mpe(4) interface route using an MPLS label. 
+ */ + switch (kf->prefix.aid) { + case AID_VPN_IPv4: + case AID_VPN_IPv6: + return 1; + } + + kn = knexthop_find(kt, &kf->nexthop); + if (kn == NULL) { + log_warnx("%s: nexthop %s not found", __func__, + log_addr(&kf->nexthop)); + return 0; + } + if (kn->kroute == NULL) + return 0; + + switch (kn->nexthop.aid) { + case AID_INET: + kr = kn->kroute; + gateway.aid = AID_INET; + gateway.v4.s_addr = kr->nexthop.s_addr; + break; + case AID_INET6: + kr6 = kn->kroute; + gateway.aid = AID_INET6; + memcpy(&gateway.v6, &kr6->nexthop, sizeof(struct in6_addr)); + gateway.scope_id = kr6->nexthop_scope_id; + break; + } + + kf->nexthop = gateway; + return 1; } void Index: rde.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v retrieving revision 1.562 diff -u -p -r1.562 rde.c --- rde.c 10 Aug 2022 14:17:01 -0000 1.562 +++ rde.c 11 Aug 2022 09:03:03 -0000 @@ -3056,8 +3056,7 @@ rde_send_kroute(struct rib *rib, struct kf.flags |= F_REJECT; if (prefix_nhflags(p) == NEXTHOP_BLACKHOLE) kf.flags |= F_BLACKHOLE; - memcpy(&kf.nexthop, &prefix_nexthop(p)->true_nexthop, - sizeof(kf.nexthop)); + kf.nexthop = prefix_nexthop(p)->exit_nexthop; strlcpy(kf.label, rtlabel_id2name(prefix_aspath(p)->rtlabelid), sizeof(kf.label)); } @@ -3072,13 +3071,6 @@ rde_send_kroute(struct rib *rib, struct SIMPLEQ_FOREACH(vpn, &conf->l3vpns, entry) { if (!rde_l3vpn_import(prefix_communities(p), vpn)) continue; - /* must send exit_nexthop so that correct MPLS tunnel - * is chosen - */ - if (type == IMSG_KROUTE_CHANGE) - memcpy(&kf.nexthop, - &prefix_nexthop(p)->exit_nexthop, - sizeof(kf.nexthop)); /* XXX not ideal but this will change */ kf.ifindex = if_nametoindex(vpn->ifmpe); if (imsg_compose(ibuf_main, type, vpn->rtableid, 0, -1,