Hi, When forwaring in packets parallel while removing the corresponding ARP entry in a loop like this while :; do arp -nd 10.10.21.1; arp -nd 10.10.22.4; done it was quite easy to crash the kernel.
The race is exposed by putting a delay() here: Index: net/rtable.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v retrieving revision 1.82 diff -u -p -r1.82 rtable.c --- net/rtable.c 19 Apr 2023 17:42:47 -0000 1.82 +++ net/rtable.c 5 Jul 2023 19:59:04 -0000 @@ -489,6 +489,8 @@ rtable_match(unsigned int rtableid, stru if (an == NULL) goto out; + delay(10); /* triggers crash */ + rt = SRPL_FIRST(&sr, &an->an_rtlist); if (rt == NULL) { SRPL_LEAVE(&sr); Then expecting the struct rtentry rt_ifa is bogous and rt_refcnt is 0. So it looks like a refcounting bug. While an_rtlist is protected by SRP, the route entries are not. I increased the refcount while the entry is held the an_rtlist. Then the crash goes away. ok? bluhm Index: net/rtable.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v retrieving revision 1.82 diff -u -p -r1.82 rtable.c --- net/rtable.c 19 Apr 2023 17:42:47 -0000 1.82 +++ net/rtable.c 5 Jul 2023 20:05:26 -0000 @@ -604,6 +604,11 @@ rtable_insert(unsigned int rtableid, str SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next); prev = art_insert(ar, an, addr, plen); + if (prev == an) { + rw_exit_write(&ar->ar_lock); + /* keep the refcount for rt while it is in an_rtlist */ + return (0); + } if (prev != an) { SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); @@ -689,9 +694,10 @@ rtable_delete(unsigned int rtableid, str npaths++; if (npaths > 1) { - KASSERT(refcnt_read(&rt->rt_refcnt) >= 1); + KASSERT(refcnt_read(&rt->rt_refcnt) >= 2); SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); + rtfree(rt); mrt = SRPL_FIRST_LOCKED(&an->an_rtlist); if (npaths == 2) @@ -703,8 +709,9 @@ rtable_delete(unsigned int rtableid, str if (art_delete(ar, an, addr, plen) == NULL) panic("art_delete failed to find node %p", an); - KASSERT(refcnt_read(&rt->rt_refcnt) >= 1); + KASSERT(refcnt_read(&rt->rt_refcnt) >= 2); SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); + rtfree(rt); art_put(an); leave: @@ -821,12 +828,11 @@ rtable_mpath_reprio(unsigned int rtablei */ rt->rt_priority = prio; } else { - rtref(rt); /* keep rt alive in between remove and insert */ + KASSERT(refcnt_read(&rt->rt_refcnt) >= 2); SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); rt->rt_priority = prio; rtable_mpath_insert(an, rt); - rtfree(rt); error = EAGAIN; } rw_exit_write(&ar->ar_lock); @@ -839,6 +845,9 @@ rtable_mpath_insert(struct art_node *an, { struct rtentry *mrt, *prt = NULL; uint8_t prio = rt->rt_priority; + + /* increment the refcount for rt while it is in an_rtlist */ + rtref(rt); if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) { SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);