Re: bgpd: fix nexthop state bug in decision process

2022-07-25 Thread Theo Buehler
On Mon, Jul 25, 2022 at 12:16:50PM +0200, Claudio Jeker wrote:
> The nexthop validation or actually invalidation is buggy in bgpd since
> revision 1.90 of rde_decide.c. When I removed re->active and replaced it
> with a value that is calculated on the spot I did not realize that this
> calculation depends on the current nexthop state and not on the state used
> on the previous decision process.
> 
> The decision process used prefix_best() to get the previous best prefix.
> Now if that prefix just became invalid (because of a nexthop state change)
> prefix_best() will return NULL and oldbest and newbest will be NULL too.
> Because of this no update will be sent out because prefix_evaluate()
> thinks nothing changed.
> 
> This is also the reason why prefix_set_dmetric() sees bad dmetrics during
> nexthop flaps.
> 
> To solve this issue I decided to introduce a NEXTHOP_VALID flag in struct
> prefix (as part of the nhflags). Which is then used to validate the
> nexthop state. When a nexthop changes state nexthop_runner calls
> prefix_evaluate_nexthop() which adjust the NEXTHOP_VALID flag but this is
> done after removing the prefix from the evaluation list and before
> inserting it again. Because of this prefix_best() works correctly and
> oldbest is not NULL in the case mentioned above.
> 
> I decided to set the initial state of NEXTHOP_VALID in nexthop_link() and I
> clear it in nexthop_unlink(). The link call happens after nhflags is set
> and prefix_nhflags() masks out the NEXTHOP_VALID flag.
> 
> With this nexthops that turn invalid are properly removed from the FIB and
> Adj-RIB-Out.

Wow, that's nasty. The diff makes sense and looks correct to me.

ok tb



bgpd: fix nexthop state bug in decision process

2022-07-25 Thread Claudio Jeker
The nexthop validation or actually invalidation is buggy in bgpd since
revision 1.90 of rde_decide.c. When I removed re->active and replaced it
with a value that is calculated on the spot I did not realize that this
calculation depends on the current nexthop state and not on the state used
on the previous decision process.

The decision process used prefix_best() to get the previous best prefix.
Now if that prefix just became invalid (because of a nexthop state change)
prefix_best() will return NULL and oldbest and newbest will be NULL too.
Because of this no update will be sent out because prefix_evaluate()
thinks nothing changed.

This is also the reason why prefix_set_dmetric() sees bad dmetrics during
nexthop flaps.

To solve this issue I decided to introduce a NEXTHOP_VALID flag in struct
prefix (as part of the nhflags). Which is then used to validate the
nexthop state. When a nexthop changes state nexthop_runner calls
prefix_evaluate_nexthop() which adjust the NEXTHOP_VALID flag but this is
done after removing the prefix from the evaluation list and before
inserting it again. Because of this prefix_best() works correctly and
oldbest is not NULL in the case mentioned above.

I decided to set the initial state of NEXTHOP_VALID in nexthop_link() and I
clear it in nexthop_unlink(). The link call happens after nhflags is set
and prefix_nhflags() masks out the NEXTHOP_VALID flag.

With this nexthops that turn invalid are properly removed from the FIB and
Adj-RIB-Out.
-- 
:wq Claudio

Index: rde.c
===
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.558
diff -u -p -r1.558 rde.c
--- rde.c   23 Jul 2022 08:44:06 -  1.558
+++ rde.c   25 Jul 2022 08:23:18 -
@@ -4249,8 +4249,7 @@ network_dump_upcall(struct rib_entry *re
 
bzero(, sizeof(kf));
memcpy(, , sizeof(kf.prefix));
-   if (prefix_nexthop(p) == NULL ||
-   prefix_nexthop(p)->state != NEXTHOP_REACH)
+   if (prefix_nhvalid(p))
kf.nexthop.aid = kf.prefix.aid;
else
memcpy(, _nexthop(p)->true_nexthop,
Index: rde.h
===
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
retrieving revision 1.259
diff -u -p -r1.259 rde.h
--- rde.h   11 Jul 2022 17:08:21 -  1.259
+++ rde.h   25 Jul 2022 08:28:38 -
@@ -362,6 +362,8 @@ struct prefix {
 #defineNEXTHOP_REJECT  0x02
 #defineNEXTHOP_BLACKHOLE   0x04
 #defineNEXTHOP_NOMODIFY0x08
+#defineNEXTHOP_MASK0x0f
+#defineNEXTHOP_VALID   0x80
 
 struct filterstate {
struct rde_aspathaspath;
@@ -522,6 +524,8 @@ int  prefix_eligible(struct prefix *);
 struct prefix  *prefix_best(struct rib_entry *);
 voidprefix_evaluate(struct rib_entry *, struct prefix *,
 struct prefix *);
+voidprefix_evaluate_nexthop(struct prefix *, enum nexthop_state,
+enum nexthop_state);
 
 /* rde_filter.c */
 void   rde_apply_set(struct filter_set_head *, struct rde_peer *,
@@ -663,7 +667,13 @@ prefix_nexthop(struct prefix *p)
 static inline uint8_t
 prefix_nhflags(struct prefix *p)
 {
-   return (p->nhflags);
+   return (p->nhflags & NEXTHOP_MASK);
+}
+
+static inline int
+prefix_nhvalid(struct prefix *p)
+{
+   return ((p->nhflags & NEXTHOP_VALID) != 0);
 }
 
 static inline uint8_t
Index: rde_decide.c
===
RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v
retrieving revision 1.96
diff -u -p -r1.96 rde_decide.c
--- rde_decide.c18 Jul 2022 09:42:46 -  1.96
+++ rde_decide.c25 Jul 2022 09:24:14 -
@@ -147,28 +147,10 @@ prefix_cmp(struct prefix *p1, struct pre
peer1 = prefix_peer(p1);
peer2 = prefix_peer(p2);
 
-   /* pathes with errors are not eligible */
-   if (asp1 == NULL || asp1->flags & F_ATTR_PARSE_ERR)
-   return -rv;
-   if (asp2 == NULL || asp2->flags & F_ATTR_PARSE_ERR)
-   return rv;
-
-   /* only loop free pathes are eligible */
-   if (asp1->flags & F_ATTR_LOOP)
-   return -rv;
-   if (asp2->flags & F_ATTR_LOOP)
-   return rv;
-
-   /*
-* 1. check if prefix is eligible a.k.a reachable
-*A NULL nexthop is eligible since it is used for locally
-*announced networks.
-*/
-   if (prefix_nexthop(p2) != NULL &&
-   prefix_nexthop(p2)->state != NEXTHOP_REACH)
+   /* 1. check if prefix is eligible a.k.a reachable */
+   if (!prefix_eligible(p2))
return rv;
-   if (prefix_nexthop(p1) != NULL &&
-   prefix_nexthop(p1)->state != NEXTHOP_REACH)
+   if (!prefix_eligible(p1))
return -rv;
 
/*