> On 27 Aug 2022, at 22:03, Alexander Bluhm <[email protected]> wrote:
>
> On Sat, Aug 27, 2022 at 03:14:15AM +0300, Vitaliy Makkoveev wrote:
>>> On 27 Aug 2022, at 00:04, Alexander Bluhm <[email protected]> wrote:
>>>
>>> Anyone willing to test or ok this?
>>>
>>
>> This fixes weird `ifa??? refcounting. I like this.
>>
>> Could the ifaref() and ifafree() names use the same notation? Like
>> ifaref() and ifarele() or ifaget() and ifafree() or something else?
>
> Refcount naming is very inconsistent.
>
> ifget(), ifput(), pf_state_key_ref(), pf_state_key_unref(), tdb_ref(),
> tdb_unref(), tdb_delete(), tdb_free(), vxlan_take(), vxlan_rele()
> all work in subtle different ways.
>
> I want to keep ifafree() as the name is established and called from
> many places. And giving ifaref() another name makes it different
> but not better.
>
> It would be easy to change something but hard to make it consistent.
> So I prefer to leave the diff as it is.
>
> bluhm
I have no objections to commit this diff.
>
>>> I would like to get it in before kn@ bumps ports due to net/if_var.h
>>> changes.
>>>
>>> On Wed, Aug 24, 2022 at 03:14:35PM +0200, Alexander Bluhm wrote:
>>>> On Tue, Aug 23, 2022 at 02:47:11PM +0200, Stefan Sperling wrote:
>>>>> ddb{2}> show struct ifaddr 0xffff8000004e9400
>>>>> struct ifaddr at 0xffff8000004e9400 (64 bytes) {ifa_addr = (struct
>>>>> sockaddr *)0
>>>>> xdeaf0009deafbead, ifa_dstaddr = (struct sockaddr *)0x20ef1a8af1d895de,
>>>>> ifa_net
>>>>> mask = (struct sockaddr *)0xdeafbeaddeafbead, ifa_ifp = (struct ifnet
>>>>> *)0xdeafb
>>>>> eaddeafbead, ifa_list = {tqe_next = (struct ifaddr *)0xdeafbeaddeafbead,
>>>>> tqe_pr
>>>>> ev = 0xdeafbeaddeafbead}, ifa_flags = 0xdeafbead, ifa_refcnt =
>>>>> 0xdeafbead, ifa_
>>>>> metric = 0xdeafbead}
>>>>
>>>> The ifaddr has been freed. That should not happen, as ifafree()
>>>> uses reference counting. But this refcount is a simple integer
>>>> increment, that is not MP safe. In theory all ++ should be protected
>>>> by exclusive netlock or shared netlock combiined with kernel lock.
>>>> But maybe I missed a path.
>>>>
>>>> What are you doing with the machine? Forwarding, IPv6, Stress test?
>>>> Do you have network interfaces with multiple network queues?
>>>>
>>>> Anyway, instead of adding kernel locks here and there, use a propper
>>>> struct refcnt. The variable ifatrash is never read, it looks like
>>>> a ddb debugging feature. But for debugging refcount leaks we have
>>>> dt(4) now.
>>>>
>>>> enc and mpls are special, they have interface address as part of
>>>> their sc structure. I use refcount anyway, the new panic prevents
>>>> use after free.
>>>>
>>>> This has passed a full regres run.
>>>>
>>>> ok?
>>>>
>>>> bluhm
>>>>
>>>> Index: dev/dt/dt_prov_static.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_prov_static.c,v
>>>> retrieving revision 1.14
>>>> diff -u -p -r1.14 dt_prov_static.c
>>>> --- dev/dt/dt_prov_static.c 28 Jun 2022 09:32:27 -0000 1.14
>>>> +++ dev/dt/dt_prov_static.c 23 Aug 2022 17:16:55 -0000
>>>> @@ -88,9 +88,10 @@ DT_STATIC_PROBE0(smr, wakeup);
>>>> DT_STATIC_PROBE2(smr, thread, "uint64_t", "uint64_t");
>>>>
>>>> /*
>>>> - * reference counting
>>>> + * reference counting, keep in sync with sys/refcnt.h
>>>> */
>>>> DT_STATIC_PROBE0(refcnt, none);
>>>> +DT_STATIC_PROBE3(refcnt, ifaddr, "void *", "int", "int");
>>>> DT_STATIC_PROBE3(refcnt, inpcb, "void *", "int", "int");
>>>> DT_STATIC_PROBE3(refcnt, tdb, "void *", "int", "int");
>>>>
>>>> @@ -135,6 +136,7 @@ struct dt_probe *const dtps_static[] = {
>>>> &_DT_STATIC_P(smr, thread),
>>>> /* refcnt */
>>>> &_DT_STATIC_P(refcnt, none),
>>>> + &_DT_STATIC_P(refcnt, ifaddr),
>>>> &_DT_STATIC_P(refcnt, inpcb),
>>>> &_DT_STATIC_P(refcnt, tdb),
>>>> };
>>>> Index: net/if_enc.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_enc.c,v
>>>> retrieving revision 1.78
>>>> diff -u -p -r1.78 if_enc.c
>>>> --- net/if_enc.c 28 Dec 2020 14:28:50 -0000 1.78
>>>> +++ net/if_enc.c 24 Aug 2022 07:51:49 -0000
>>>> @@ -100,6 +100,7 @@ enc_clone_create(struct if_clone *ifc, i
>>>> * and empty ifa of type AF_LINK for this purpose.
>>>> */
>>>> if_alloc_sadl(ifp);
>>>> + refcnt_init_trace(&sc->sc_ifa.ifa_refcnt, DT_REFCNT_IDX_IFADDR);
>>>> sc->sc_ifa.ifa_ifp = ifp;
>>>> sc->sc_ifa.ifa_addr = sdltosa(ifp->if_sadl);
>>>> sc->sc_ifa.ifa_netmask = NULL;
>>>> @@ -152,6 +153,10 @@ enc_clone_destroy(struct ifnet *ifp)
>>>> NET_UNLOCK();
>>>>
>>>> if_detach(ifp);
>>>> + if (refcnt_rele(&sc->sc_ifa.ifa_refcnt) == 0) {
>>>> + panic("%s: ifa refcnt has %u refs", __func__,
>>>> + sc->sc_ifa.ifa_refcnt.r_refs);
>>>> + }
>>>> free(sc, M_DEVBUF, sizeof(*sc));
>>>>
>>>> return (0);
>>>> Index: net/if_mpe.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_mpe.c,v
>>>> retrieving revision 1.101
>>>> diff -u -p -r1.101 if_mpe.c
>>>> --- net/if_mpe.c 8 Nov 2021 04:50:54 -0000 1.101
>>>> +++ net/if_mpe.c 24 Aug 2022 07:52:22 -0000
>>>> @@ -128,6 +128,7 @@ mpe_clone_create(struct if_clone *ifc, i
>>>> sc->sc_txhprio = 0;
>>>> sc->sc_rxhprio = IF_HDRPRIO_PACKET;
>>>> sc->sc_rdomain = 0;
>>>> + refcnt_init_trace(&sc->sc_ifa.ifa_refcnt, DT_REFCNT_IDX_IFADDR);
>>>> sc->sc_ifa.ifa_ifp = ifp;
>>>> sc->sc_ifa.ifa_addr = sdltosa(ifp->if_sadl);
>>>> sc->sc_smpls.smpls_len = sizeof(sc->sc_smpls);
>>>> @@ -154,6 +155,10 @@ mpe_clone_destroy(struct ifnet *ifp)
>>>> ifq_barrier(&ifp->if_snd);
>>>>
>>>> if_detach(ifp);
>>>> + if (refcnt_rele(&sc->sc_ifa.ifa_refcnt) == 0) {
>>>> + panic("%s: ifa refcnt has %u refs", __func__,
>>>> + sc->sc_ifa.ifa_refcnt.r_refs);
>>>> + }
>>>> free(sc, M_DEVBUF, sizeof *sc);
>>>> return (0);
>>>> }
>>>> Index: net/if_mpip.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_mpip.c,v
>>>> retrieving revision 1.15
>>>> diff -u -p -r1.15 if_mpip.c
>>>> --- net/if_mpip.c 26 Mar 2021 19:00:21 -0000 1.15
>>>> +++ net/if_mpip.c 24 Aug 2022 07:52:26 -0000
>>>> @@ -128,6 +128,7 @@ mpip_clone_create(struct if_clone *ifc,
>>>> bpfattach(&ifp->if_bpf, ifp, DLT_LOOP, sizeof(uint32_t));
>>>> #endif
>>>>
>>>> + refcnt_init_trace(&sc->sc_ifa.ifa_refcnt, DT_REFCNT_IDX_IFADDR);
>>>> sc->sc_ifa.ifa_ifp = ifp;
>>>> sc->sc_ifa.ifa_addr = sdltosa(ifp->if_sadl);
>>>>
>>>> @@ -152,7 +153,10 @@ mpip_clone_destroy(struct ifnet *ifp)
>>>> ifq_barrier(&ifp->if_snd);
>>>>
>>>> if_detach(ifp);
>>>> -
>>>> + if (refcnt_rele(&sc->sc_ifa.ifa_refcnt) == 0) {
>>>> + panic("%s: ifa refcnt has %u refs", __func__,
>>>> + sc->sc_ifa.ifa_refcnt.r_refs);
>>>> + }
>>>> free(sc->sc_neighbor, M_DEVBUF, sizeof(*sc->sc_neighbor));
>>>> free(sc, M_DEVBUF, sizeof(*sc));
>>>>
>>>> Index: net/if_mpw.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_mpw.c,v
>>>> retrieving revision 1.62
>>>> diff -u -p -r1.62 if_mpw.c
>>>> --- net/if_mpw.c 26 Mar 2021 19:00:21 -0000 1.62
>>>> +++ net/if_mpw.c 24 Aug 2022 07:52:00 -0000
>>>> @@ -122,6 +122,7 @@ mpw_clone_create(struct if_clone *ifc, i
>>>> sc->sc_txhprio = 0;
>>>> sc->sc_rxhprio = IF_HDRPRIO_PACKET;
>>>> sc->sc_rdomain = 0;
>>>> + refcnt_init_trace(&sc->sc_ifa.ifa_refcnt, DT_REFCNT_IDX_IFADDR);
>>>> sc->sc_ifa.ifa_ifp = ifp;
>>>> sc->sc_ifa.ifa_addr = sdltosa(ifp->if_sadl);
>>>> sc->sc_smpls.smpls_len = sizeof(sc->sc_smpls);
>>>> @@ -149,7 +150,10 @@ mpw_clone_destroy(struct ifnet *ifp)
>>>>
>>>> ether_ifdetach(ifp);
>>>> if_detach(ifp);
>>>> -
>>>> + if (refcnt_rele(&sc->sc_ifa.ifa_refcnt) == 0) {
>>>> + panic("%s: ifa refcnt has %u refs", __func__,
>>>> + sc->sc_ifa.ifa_refcnt.r_refs);
>>>> + }
>>>> free(sc->sc_neighbor, M_DEVBUF, sizeof(*sc->sc_neighbor));
>>>> free(sc, M_DEVBUF, sizeof(*sc));
>>>>
>>>> Index: net/if_pppx.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_pppx.c,v
>>>> retrieving revision 1.121
>>>> diff -u -p -r1.121 if_pppx.c
>>>> --- net/if_pppx.c 25 Jul 2022 08:29:26 -0000 1.121
>>>> +++ net/if_pppx.c 23 Aug 2022 17:34:20 -0000
>>>> @@ -659,6 +659,7 @@ pppx_add_session(struct pppx_dev *pxd, s
>>>> ifaddr.sin_addr = req->pr_ip_srcaddr;
>>>>
>>>> ia = malloc(sizeof (*ia), M_IFADDR, M_WAITOK | M_ZERO);
>>>> + refcnt_init_trace(&ia->ia_ifa.ifa_refcnt, DT_REFCNT_IDX_IFADDR);
>>>>
>>>> ia->ia_addr.sin_family = AF_INET;
>>>> ia->ia_addr.sin_len = sizeof(struct sockaddr_in);
>>>> Index: net/if_var.h
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
>>>> retrieving revision 1.114
>>>> diff -u -p -r1.114 if_var.h
>>>> --- net/if_var.h 20 Feb 2021 04:55:52 -0000 1.114
>>>> +++ net/if_var.h 23 Aug 2022 16:25:32 -0000
>>>> @@ -242,7 +242,7 @@ struct ifaddr {
>>>> struct ifnet *ifa_ifp; /* back-pointer to interface */
>>>> TAILQ_ENTRY(ifaddr) ifa_list; /* list of addresses for interface */
>>>> u_int ifa_flags; /* interface flags, see below */
>>>> - u_int ifa_refcnt; /* number of `rt_ifa` references */
>>>> + struct refcnt ifa_refcnt; /* number of `rt_ifa` references */
>>>> int ifa_metric; /* cost of going out this interface */
>>>> };
>>>>
>>>> @@ -333,6 +333,7 @@ int p2p_bpf_mtap(caddr_t, const struct m
>>>> struct ifaddr *ifa_ifwithaddr(struct sockaddr *, u_int);
>>>> struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *, u_int);
>>>> struct ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *);
>>>> +struct ifaddr *ifaref(struct ifaddr *);
>>>> void ifafree(struct ifaddr *);
>>>>
>>>> int if_isconnected(const struct ifnet *, unsigned int);
>>>> Index: net/route.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
>>>> retrieving revision 1.413
>>>> diff -u -p -r1.413 route.c
>>>> --- net/route.c 28 Jul 2022 22:19:09 -0000 1.413
>>>> +++ net/route.c 23 Aug 2022 16:31:29 -0000
>>>> @@ -146,7 +146,6 @@ extern unsigned int rtmap_limit;
>>>>
>>>> struct cpumem * rtcounters;
>>>> int rttrash; /* routes not in table but not
>>>> freed */
>>>> -int ifatrash; /* ifas not in ifp list but not
>>>> free */
>>>>
>>>> struct pool rtentry_pool; /* pool for rtentry structures
>>>> */
>>>> struct pool rttimer_pool; /* pool for rttimer structures
>>>> */
>>>> @@ -512,16 +511,19 @@ rtfree(struct rtentry *rt)
>>>> pool_put(&rtentry_pool, rt);
>>>> }
>>>>
>>>> +struct ifaddr *
>>>> +ifaref(struct ifaddr *ifa)
>>>> +{
>>>> + refcnt_take(&ifa->ifa_refcnt);
>>>> + return ifa;
>>>> +}
>>>> +
>>>> void
>>>> ifafree(struct ifaddr *ifa)
>>>> {
>>>> - if (ifa == NULL)
>>>> - panic("ifafree");
>>>> - if (ifa->ifa_refcnt == 0) {
>>>> - ifatrash--;
>>>> - free(ifa, M_IFADDR, 0);
>>>> - } else
>>>> - ifa->ifa_refcnt--;
>>>> + if (refcnt_rele(&ifa->ifa_refcnt) == 0)
>>>> + return;
>>>> + free(ifa, M_IFADDR, 0);
>>>> }
>>>>
>>>> /*
>>>> @@ -901,8 +903,7 @@ rtrequest(int req, struct rt_addrinfo *i
>>>> rt_mpls_clear(rt);
>>>> #endif
>>>>
>>>> - ifa->ifa_refcnt++;
>>>> - rt->rt_ifa = ifa;
>>>> + rt->rt_ifa = ifaref(ifa);
>>>> rt->rt_ifidx = ifp->if_index;
>>>> /*
>>>> * Copy metrics and a back pointer from the cloned
>>>> @@ -1857,8 +1858,8 @@ db_print_ifa(struct ifaddr *ifa)
>>>> db_print_sa(ifa->ifa_dstaddr);
>>>> db_printf(" ifa_mask=");
>>>> db_print_sa(ifa->ifa_netmask);
>>>> - db_printf(" flags=0x%x, refcnt=%d, metric=%d\n",
>>>> - ifa->ifa_flags, ifa->ifa_refcnt, ifa->ifa_metric);
>>>> + db_printf(" flags=0x%x, refcnt=%u, metric=%d\n",
>>>> + ifa->ifa_flags, ifa->ifa_refcnt.r_refs, ifa->ifa_metric);
>>>> }
>>>>
>>>> /*
>>>> Index: net/rtsock.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtsock.c,v
>>>> retrieving revision 1.341
>>>> diff -u -p -r1.341 rtsock.c
>>>> --- net/rtsock.c 22 Aug 2022 21:18:48 -0000 1.341
>>>> +++ net/rtsock.c 23 Aug 2022 16:27:03 -0000
>>>> @@ -1115,8 +1115,7 @@ rtm_output(struct rt_msghdr *rtm, struct
>>>> ifp->if_rtrequest(ifp, RTM_DELETE, rt);
>>>> ifafree(rt->rt_ifa);
>>>>
>>>> - ifa->ifa_refcnt++;
>>>> - rt->rt_ifa = ifa;
>>>> + rt->rt_ifa = ifaref(ifa);
>>>> rt->rt_ifidx = ifa->ifa_ifp->if_index;
>>>> /* recheck link state after ifp change */
>>>> rt_if_linkstate_change(rt, ifa->ifa_ifp,
>>>> Index: netinet/in.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in.c,v
>>>> retrieving revision 1.175
>>>> diff -u -p -r1.175 in.c
>>>> --- netinet/in.c 6 Aug 2022 15:57:59 -0000 1.175
>>>> +++ netinet/in.c 23 Aug 2022 16:59:09 -0000
>>>> @@ -379,6 +379,7 @@ in_ioctl_set_ifaddr(u_long cmd, caddr_t
>>>> }
>>>> if (ia == NULL) {
>>>> ia = malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO);
>>>> + refcnt_init_trace(&ia->ia_ifa.ifa_refcnt, DT_REFCNT_IDX_IFADDR);
>>>> ia->ia_addr.sin_family = AF_INET;
>>>> ia->ia_addr.sin_len = sizeof(ia->ia_addr);
>>>> ia->ia_ifa.ifa_addr = sintosa(&ia->ia_addr);
>>>> @@ -475,6 +476,8 @@ in_ioctl_change_ifaddr(u_long cmd, caddr
>>>>
>>>> if (ia == NULL) {
>>>> ia = malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO);
>>>> + refcnt_init_trace(&ia->ia_ifa.ifa_refcnt,
>>>> + DT_REFCNT_IDX_IFADDR);
>>>> ia->ia_addr.sin_family = AF_INET;
>>>> ia->ia_addr.sin_len = sizeof(ia->ia_addr);
>>>> ia->ia_ifa.ifa_addr = sintosa(&ia->ia_addr);
>>>> @@ -745,7 +748,6 @@ in_purgeaddr(struct ifaddr *ifa)
>>>> {
>>>> struct ifnet *ifp = ifa->ifa_ifp;
>>>> struct in_ifaddr *ia = ifatoia(ifa);
>>>> - extern int ifatrash;
>>>>
>>>> NET_ASSERT_LOCKED();
>>>>
>>>> @@ -760,7 +762,6 @@ in_purgeaddr(struct ifaddr *ifa)
>>>> ia->ia_allhosts = NULL;
>>>> }
>>>>
>>>> - ifatrash++;
>>>> ia->ia_ifp = NULL;
>>>> ifafree(&ia->ia_ifa);
>>>> }
>>>> Index: netinet6/in6.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6.c,v
>>>> retrieving revision 1.247
>>>> diff -u -p -r1.247 in6.c
>>>> --- netinet6/in6.c 6 Aug 2022 15:57:59 -0000 1.247
>>>> +++ netinet6/in6.c 23 Aug 2022 16:55:17 -0000
>>>> @@ -647,6 +647,8 @@ in6_update_ifa(struct ifnet *ifp, struct
>>>> if (ia6 == NULL) {
>>>> hostIsNew = 1;
>>>> ia6 = malloc(sizeof(*ia6), M_IFADDR, M_WAITOK | M_ZERO);
>>>> + refcnt_init_trace(&ia6->ia_ifa.ifa_refcnt,
>>>> + DT_REFCNT_IDX_IFADDR);
>>>> LIST_INIT(&ia6->ia6_memberships);
>>>> /* Initialize the address and masks, and put time stamp */
>>>> ia6->ia_ifa.ifa_addr = sin6tosa(&ia6->ia_addr);
>>>> @@ -938,7 +940,6 @@ void
>>>> in6_unlink_ifa(struct in6_ifaddr *ia6, struct ifnet *ifp)
>>>> {
>>>> struct ifaddr *ifa = &ia6->ia_ifa;
>>>> - extern int ifatrash;
>>>> int plen;
>>>>
>>>> NET_ASSERT_LOCKED();
>>>> @@ -953,7 +954,6 @@ in6_unlink_ifa(struct in6_ifaddr *ia6, s
>>>> rt_ifa_purge(ifa);
>>>> ifa_del(ifp, ifa);
>>>>
>>>> - ifatrash++;
>>>> ia6->ia_ifp = NULL;
>>>> ifafree(&ia6->ia_ifa);
>>>> }
>>>> Index: netinet6/nd6_nbr.c
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/nd6_nbr.c,v
>>>> retrieving revision 1.132
>>>> diff -u -p -r1.132 nd6_nbr.c
>>>> --- netinet6/nd6_nbr.c 8 Aug 2022 17:47:59 -0000 1.132
>>>> +++ netinet6/nd6_nbr.c 23 Aug 2022 16:27:11 -0000
>>>> @@ -1124,8 +1124,7 @@ nd6_dad_start(struct ifaddr *ifa)
>>>> * first packet to be sent from the interface after interface
>>>> * (re)initialization.
>>>> */
>>>> - dp->dad_ifa = ifa;
>>>> - ifa->ifa_refcnt++; /* just for safety */
>>>> + dp->dad_ifa = ifaref(ifa);
>>>> dp->dad_count = ip6_dad_count;
>>>> dp->dad_ns_icount = dp->dad_na_icount = 0;
>>>> dp->dad_ns_ocount = dp->dad_ns_tcount = 0;
>>>> Index: sys/refcnt.h
>>>> ===================================================================
>>>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/refcnt.h,v
>>>> retrieving revision 1.7
>>>> diff -u -p -r1.7 refcnt.h
>>>> --- sys/refcnt.h 28 Jun 2022 09:32:28 -0000 1.7
>>>> +++ sys/refcnt.h 23 Aug 2022 16:54:02 -0000
>>>> @@ -43,8 +43,10 @@ void refcnt_finalize(struct refcnt *, co
>>>> int refcnt_shared(struct refcnt *);
>>>> unsigned int refcnt_read(struct refcnt *);
>>>>
>>>> -#define DT_REFCNT_IDX_INPCB 1
>>>> -#define DT_REFCNT_IDX_TDB 2
>>>> +/* sorted alphabetically, keep in sync with dev/dt/dt_prov_static.c */
>>>> +#define DT_REFCNT_IDX_IFADDR 1
>>>> +#define DT_REFCNT_IDX_INPCB 2
>>>> +#define DT_REFCNT_IDX_TDB 3
>>>>
>>>> #endif /* _KERNEL */
>>>>
>>>
>>
>