Please drop this diff. Within rt_clone() we have the code section protected by kernel lock, and this code section could be reached with mutex(9) held.
pipex(4) needs to be more refactored before introduce per-session locking. > On 28 Jun 2022, at 23:14, Vitaliy Makkoveev <[email protected]> wrote: > > On Tue, Jun 28, 2022 at 09:05:09PM +0300, Vitaliy Makkoveev wrote: >> The updated diff. It was triggered by Hrvoje Popovski, we could do >> direct (*if_qstart)() call in pipex(4) PPPOE input path, and this >> provides deadlock. The updated diff uses ip{,6}_send() instead of >> ipv{4,6}_input(). >> > > I think, I found better solution. The diff below still uses > ipv{4,6}_input(), but introduces custom pipex_if_enqueue() > (*if_enqueue)() handler. Now we can be sure, our (*if_qstart)() handlers > will never be called directly. Also we can be sure about netlock state > within our (*if_qstart)() handlers and remove pipexintr(). > > We do direct (*if_qstart)() call only if we overflow `if_snd' queue, and > of course we loose packets in this case, so the behavior mostly the > same. > > Index: sys/net/if_pppx.c > =================================================================== > RCS file: /cvs/src/sys/net/if_pppx.c,v > retrieving revision 1.117 > diff -u -p -r1.117 if_pppx.c > --- sys/net/if_pppx.c 26 Jun 2022 22:51:58 -0000 1.117 > +++ sys/net/if_pppx.c 28 Jun 2022 19:56:47 -0000 > @@ -651,6 +651,7 @@ pppx_add_session(struct pppx_dev *pxd, s > ifp->if_mtu = req->pr_peer_mru; /* XXX */ > ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST | IFF_UP; > ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE; > + ifp->if_enqueue = pipex_if_enqueue; > ifp->if_qstart = pppx_if_qstart; > ifp->if_output = pppx_if_output; > ifp->if_ioctl = pppx_if_ioctl; > @@ -659,8 +660,6 @@ pppx_add_session(struct pppx_dev *pxd, s > ifp->if_softc = pxi; > /* ifp->if_rdomain = req->pr_rdomain; */ > if_counters_alloc(ifp); > - /* XXXSMP: be sure pppx_if_qstart() called with NET_LOCK held */ > - ifq_set_maxlen(&ifp->if_snd, 1); > > /* XXXSMP breaks atomicity */ > NET_UNLOCK(); > @@ -801,13 +800,16 @@ pppx_if_qstart(struct ifqueue *ifq) > struct mbuf *m; > int proto; > > - NET_ASSERT_LOCKED(); > + mtx_enter(&pxi->pxi_session->pxs_mtx); > + > while ((m = ifq_dequeue(ifq)) != NULL) { > proto = *mtod(m, int *); > m_adj(m, sizeof(proto)); > > pipex_ppp_output(m, pxi->pxi_session, proto); > } > + > + mtx_leave(&pxi->pxi_session->pxs_mtx); > } > > int > @@ -1041,11 +1043,10 @@ pppacopen(dev_t dev, int flags, int mode > ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST; > ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE; > ifp->if_rtrequest = p2p_rtrequest; /* XXX */ > + ifp->if_enqueue = pipex_if_enqueue; > ifp->if_output = pppac_output; > ifp->if_qstart = pppac_qstart; > ifp->if_ioctl = pppac_ioctl; > - /* XXXSMP: be sure pppac_qstart() called with NET_LOCK held */ > - ifq_set_maxlen(&ifp->if_snd, 1); > > if_counters_alloc(ifp); > if_attach(ifp); > @@ -1441,7 +1442,6 @@ pppac_qstart(struct ifqueue *ifq) > struct ip ip; > int rv; > > - NET_ASSERT_LOCKED(); > while ((m = ifq_dequeue(ifq)) != NULL) { > #if NBPFILTER > 0 > if (ifp->if_bpf) { > Index: sys/net/pipex.c > =================================================================== > RCS file: /cvs/src/sys/net/pipex.c,v > retrieving revision 1.142 > diff -u -p -r1.142 pipex.c > --- sys/net/pipex.c 28 Jun 2022 08:01:40 -0000 1.142 > +++ sys/net/pipex.c 28 Jun 2022 19:56:47 -0000 > @@ -91,7 +91,6 @@ struct pool mppe_key_pool; > * Locks used to protect global data > * A atomic operation > * I immutable after creation > - * N net lock > * L pipex_list_mtx > */ > > @@ -172,7 +171,6 @@ pipex_ioctl(void *ownersc, u_long cmd, c > { > int ret = 0; > > - NET_ASSERT_LOCKED(); > switch (cmd) { > case PIPEXCSESSION: > ret = pipex_config_session( > @@ -197,6 +195,19 @@ pipex_ioctl(void *ownersc, u_long cmd, c > return (ret); > } > > +int > +pipex_if_enqueue(struct ifnet *ifp, struct mbuf *m) > +{ > + struct ifqueue *ifq = &ifp->if_snd; > + int error; > + > + if ((error = ifq_enqueue(ifq, m)) != 0) > + return (error); > + task_add(ifq->ifq_softnet, &ifq->ifq_bundle); > + > + return (0); > +} > + > /************************************************************************ > * Software Interrupt Handler > ************************************************************************/ > @@ -575,18 +586,20 @@ pipex_config_session(struct pipex_sessio > struct pipex_session *session; > int error = 0; > > - NET_ASSERT_LOCKED(); > - > session = pipex_lookup_by_session_id(req->pcr_protocol, > req->pcr_session_id); > if (session == NULL) > return (EINVAL); > > if (session->ownersc == ownersc) { > + mtx_enter(&session->pxs_mtx); > + > if (req->pcr_ip_forward != 0) > session->flags |= PIPEX_SFLAGS_IP_FORWARD; > else > session->flags &= ~PIPEX_SFLAGS_IP_FORWARD; > + > + mtx_leave(&session->pxs_mtx); > } else > error = EINVAL; > > @@ -601,8 +614,6 @@ pipex_get_stat(struct pipex_session_stat > struct pipex_session *session; > int error = 0; > > - NET_ASSERT_LOCKED(); > - > session = pipex_lookup_by_session_id(req->psr_protocol, > req->psr_session_id); > if (session == NULL) > @@ -811,6 +822,9 @@ pipex_ip_output(struct mbuf *m0, struct > /* > * Multicast packet is a idle packet and it's not TCP. > */ > + > + mtx_enter(&session->pxs_mtx); > + > if ((session->flags & (PIPEX_SFLAGS_IP_FORWARD | > PIPEX_SFLAGS_IP6_FORWARD)) == 0) > goto drop; > @@ -837,6 +851,8 @@ pipex_ip_output(struct mbuf *m0, struct > } > > pipex_ppp_output(m0, session, PPP_IP); > + > + mtx_leave(&session->pxs_mtx); > } else { > struct pipex_session *session_tmp; > struct mbuf *m; > @@ -846,16 +862,21 @@ pipex_ip_output(struct mbuf *m0, struct > LIST_FOREACH(session_tmp, &pipex_session_list, session_list) { > if (session_tmp->ownersc != session->ownersc) > continue; > + > + mtx_enter(&session->pxs_mtx); > + > if ((session->flags & (PIPEX_SFLAGS_IP_FORWARD | > PIPEX_SFLAGS_IP6_FORWARD)) == 0) > - continue; > + goto next; > m = m_copym(m0, 0, M_COPYALL, M_NOWAIT); > if (m == NULL) { > counters_inc(session->stat_counters, > pxc_oerrors); > - continue; > + goto next; > } > pipex_ppp_output(m, session_tmp, PPP_IP); > +next: > + mtx_leave(&session->pxs_mtx); > } > m_freem(m0); > } > @@ -864,6 +885,7 @@ pipex_ip_output(struct mbuf *m0, struct > drop: > m_freem(m0); > dropped: > + mtx_leave(&session->pxs_mtx); > counters_inc(session->stat_counters, pxc_oerrors); > } > > @@ -872,7 +894,7 @@ pipex_ppp_output(struct mbuf *m0, struct > { > u_char *cp, hdr[16]; > > - NET_ASSERT_LOCKED(); > + MUTEX_ASSERT_LOCKED(&session->pxs_mtx); > > #ifdef PIPEX_MPPE > if (pipex_session_is_mppe_enabled(session)) { > @@ -927,6 +949,8 @@ pipex_ppp_input(struct mbuf *m0, struct > int proto, hlen = 0; > struct mbuf *n; > > + MUTEX_ASSERT_LOCKED(&session->pxs_mtx); > + > KASSERT(m0->m_pkthdr.len >= PIPEX_PPPMINLEN); > proto = pipex_ppp_proto(m0, session, 0, &hlen); > #ifdef PIPEX_MPPE > @@ -1288,11 +1312,16 @@ pipex_pppoe_input(struct mbuf *m0, struc > sizeof(struct pipex_pppoe_header), &pppoe); > > hlen = sizeof(struct ether_header) + sizeof(struct pipex_pppoe_header); > - if ((m0 = pipex_common_input(session, m0, hlen, ntohs(pppoe.length))) > - == NULL) > - return (NULL); > - m_freem(m0); > - counters_inc(session->stat_counters, pxc_ierrors); > + > + mtx_enter(&session->pxs_mtx); > + m0 = pipex_common_input(session, m0, hlen, ntohs(pppoe.length)); > + mtx_leave(&session->pxs_mtx); > + > + if (m0 != NULL) { > + m_freem(m0); > + counters_inc(session->stat_counters, pxc_ierrors); > + } > + > return (NULL); > } > > @@ -1354,6 +1383,8 @@ pipex_pptp_output(struct mbuf *m0, struc > struct ip *ip; > u_char *cp; > > + MUTEX_ASSERT_LOCKED(&session->pxs_mtx); > + > reqlen = PIPEX_IPGRE_HDRLEN + (has_seq + has_ack) * 4; > > len = 0; > @@ -1507,7 +1538,6 @@ pipex_pptp_input(struct mbuf *m0, struct > struct pipex_pptp_session *pptp_session; > int rewind = 0; > > - NET_ASSERT_LOCKED(); > KASSERT(m0->m_pkthdr.len >= PIPEX_IPGRE_HDRLEN); > pptp_session = &session->proto.pptp; > > @@ -1537,6 +1567,9 @@ pipex_pptp_input(struct mbuf *m0, struct > seqp = cp; > GETLONG(seq, cp); > } > + > + mtx_enter(&session->pxs_mtx); > + > if (has_ack) { > ackp = cp; > GETLONG(ack, cp); > @@ -1585,6 +1618,7 @@ pipex_pptp_input(struct mbuf *m0, struct > /* ok, The packet is for PIPEX */ > if (!rewind) > session->proto.pptp.rcv_gap += nseq; > + mtx_leave(&session->pxs_mtx); > return (NULL); > } > > @@ -1618,6 +1652,8 @@ not_ours: > PUTLONG(ack, ackp); > } > > + mtx_leave(&session->pxs_mtx); > + > return (m0); > out_seq: > pipex_session_log(session, LOG_DEBUG, > @@ -1626,6 +1662,7 @@ out_seq: > pptp_session->rcv_nxt + pptp_session->maxwinsz, > ack, pptp_session->snd_una, > pptp_session->snd_nxt); > + mtx_leave(&session->pxs_mtx); > > /* FALLTHROUGH */ > drop: > @@ -1755,6 +1792,8 @@ pipex_pptp_userland_output(struct mbuf * > gre = mtod(m0, struct pipex_gre_header *); > cp = PIPEX_SEEK_NEXTHDR(gre, sizeof(struct pipex_gre_header), u_char *); > > + mtx_enter(&session->pxs_mtx); > + > /* > * overwrite sequence numbers to adjust a gap between pipex and > * userland. > @@ -1775,6 +1814,8 @@ pipex_pptp_userland_output(struct mbuf * > session->proto.pptp.rcv_acked = val32; > } > > + mtx_leave(&session->pxs_mtx); > + > return (m0); > } > #endif /* PIPEX_PPTP */ > @@ -1796,6 +1837,8 @@ pipex_l2tp_output(struct mbuf *m0, struc > #endif > struct m_tag *mtag; > > + MUTEX_ASSERT_LOCKED(&session->pxs_mtx); > + > hlen = sizeof(struct pipex_l2tp_header) + > ((pipex_session_is_l2tp_data_sequencing_on(session)) > ? sizeof(struct pipex_l2tp_seq_header) : 0) + > @@ -1967,17 +2010,15 @@ pipex_l2tp_input(struct mbuf *m0, int of > uint32_t ipsecflowinfo) > { > struct pipex_l2tp_session *l2tp_session; > - int length, offset, hlen, nseq; > - u_char *cp, *nsp, *nrp; > + int length = 0, offset = 0, hlen, nseq; > + u_char *cp, *nsp = NULL, *nrp = NULL; > uint16_t flags, ns = 0, nr = 0; > int rewind = 0; > > - NET_ASSERT_LOCKED(); > + mtx_enter(&session->pxs_mtx); > > - length = offset = ns = nr = 0; > l2tp_session = &session->proto.l2tp; > l2tp_session->ipsecflowinfo = ipsecflowinfo; > - nsp = nrp = NULL; > > m_copydata(m0, off0, sizeof(flags), &flags); > > @@ -2039,6 +2080,8 @@ pipex_l2tp_input(struct mbuf *m0, int of > /* ok, The packet is for PIPEX */ > if (!rewind) > session->proto.l2tp.nr_gap += nseq; > + mtx_leave(&session->pxs_mtx); > + > return (NULL); > } > > @@ -2069,15 +2112,18 @@ pipex_l2tp_input(struct mbuf *m0, int of > PUTSHORT(nr, nrp); > } > > + mtx_leave(&session->pxs_mtx); > + > return (m0); > out_seq: > pipex_session_log(session, LOG_DEBUG, > "Received bad data packet: out of sequence: seq=%u(%u-) " > "ack=%u(%u-%u)", ns, l2tp_session->nr_nxt, nr, l2tp_session->ns_una, > l2tp_session->ns_nxt); > - > /* FALLTHROUGH */ > drop: > + mtx_leave(&session->pxs_mtx); > + > m_freem(m0); > counters_inc(session->stat_counters, pxc_ierrors); > > @@ -2202,6 +2248,8 @@ pipex_l2tp_userland_output(struct mbuf * > ns = ntohs(seq->ns); > nr = ntohs(seq->nr); > > + mtx_enter(&session->pxs_mtx); > + > ns += session->proto.l2tp.ns_gap; > seq->ns = htons(ns); > session->proto.l2tp.ns_nxt++; > @@ -2211,6 +2259,8 @@ pipex_l2tp_userland_output(struct mbuf * > if (SEQ16_GT(nr, session->proto.l2tp.nr_acked)) > session->proto.l2tp.nr_acked = nr; > > + mtx_leave(&session->pxs_mtx); > + > return (m0); > } > #endif /* PIPEX_L2TP */ > @@ -2379,6 +2429,8 @@ pipex_mppe_input(struct mbuf *m0, struct > u_char *cp; > int rewind = 0; > > + MUTEX_ASSERT_LOCKED(&session->pxs_mtx); > + > /* pullup */ > PIPEX_PULLUP(m0, sizeof(coher_cnt)); > if (m0 == NULL) > @@ -2471,10 +2523,8 @@ pipex_mppe_input(struct mbuf *m0, struct > /* Send CCP ResetReq */ > PIPEX_DBG((session, LOG_DEBUG, "CCP SendResetReq")); > > - mtx_enter(&session->pxs_mtx); > ccp_id = session->ccp_id; > session->ccp_id++; > - mtx_leave(&session->pxs_mtx); > > pipex_ccp_output(session, CCP_RESETREQ, ccp_id); > goto drop; > Index: sys/net/pipex_local.h > =================================================================== > RCS file: /cvs/src/sys/net/pipex_local.h,v > retrieving revision 1.47 > diff -u -p -r1.47 pipex_local.h > --- sys/net/pipex_local.h 26 Jun 2022 15:50:21 -0000 1.47 > +++ sys/net/pipex_local.h 28 Jun 2022 19:56:47 -0000 > @@ -56,8 +56,8 @@ extern struct mutex pipex_list_mtx; > > /* > * Locks used to protect struct members: > + * A atomic operation > * I immutable after creation > - * N net lock > * L pipex_list_mtx > * s this pipex_session' `pxs_mtx' > * m this pipex_mppe' `pxm_mtx' > @@ -91,14 +91,14 @@ struct pipex_pppoe_session { > #ifdef PIPEX_PPTP > struct pipex_pptp_session { > /* sequence number gap between pipex and userland */ > - int32_t snd_gap; /* [N] gap of our sequence */ > - int32_t rcv_gap; /* [N] gap of peer's sequence */ > - int32_t ul_snd_una; /* [N] userland send acked seq */ > - > - uint32_t snd_nxt; /* [N] send next */ > - uint32_t rcv_nxt; /* [N] receive next */ > - uint32_t snd_una; /* [N] send acked sequence */ > - uint32_t rcv_acked; /* [N] recv acked sequence */ > + int32_t snd_gap; /* [s] gap of our sequence */ > + int32_t rcv_gap; /* [s] gap of peer's sequence */ > + int32_t ul_snd_una; /* [s] userland send acked seq */ > + > + uint32_t snd_nxt; /* [s] send next */ > + uint32_t rcv_nxt; /* [s] receive next */ > + uint32_t snd_una; /* [s] send acked sequence */ > + uint32_t rcv_acked; /* [s] recv acked sequence */ > > int winsz; /* [I] windows size */ > int maxwinsz; /* [I] max windows size */ > @@ -141,16 +141,16 @@ struct pipex_l2tp_session { > > uint32_t option_flags; /* [I] protocol options */ > > - int16_t ns_gap; /* [N] gap between userland and pipex */ > - int16_t nr_gap; /* [N] gap between userland and pipex */ > - uint16_t ul_ns_una; /* [N] unacked sequence number (userland) */ > - > - uint16_t ns_nxt; /* [N] next sequence number to send */ > - uint16_t ns_una; /* [N] unacked sequence number to send */ > - > - uint16_t nr_nxt; /* [N] next sequence number to recv */ > - uint16_t nr_acked; /* [N] acked sequence number to recv */ > - uint32_t ipsecflowinfo; /* [N] IPsec SA flow id for NAT-T */ > + int16_t ns_gap; /* [s] gap between userland and pipex */ > + int16_t nr_gap; /* [s] gap between userland and pipex */ > + uint16_t ul_ns_una; /* [s] unacked sequence number (userland) */ > + > + uint16_t ns_nxt; /* [s] next sequence number to send */ > + uint16_t ns_una; /* [s] unacked sequence number to send */ > + > + uint16_t nr_nxt; /* [s] next sequence number to recv */ > + uint16_t nr_acked; /* [s] acked sequence number to recv */ > + uint32_t ipsecflowinfo; /* [s] IPsec SA flow id for NAT-T */ > }; > #endif /* PIPEX_L2TP */ > > @@ -180,9 +180,9 @@ struct pipex_session { > > uint32_t idle_time; /* [L] idle time in seconds */ > > - u_int flags; /* [N] flags, see below */ > -#define PIPEX_SFLAGS_IP_FORWARD 0x01 /* [N] enable IP > forwarding */ > -#define PIPEX_SFLAGS_IP6_FORWARD 0x02 /* [N] enable IPv6 forwarding */ > + u_int flags; /* [s] flags, see below */ > +#define PIPEX_SFLAGS_IP_FORWARD 0x01 /* [s] enable IP > forwarding */ > +#define PIPEX_SFLAGS_IP6_FORWARD 0x02 /* [s] enable IPv6 forwarding */ > #define PIPEX_SFLAGS_MULTICAST 0x04 /* [I] virtual entry for > multicast */ > #define PIPEX_SFLAGS_PPPX 0x08 /* [I] interface is > @@ -200,7 +200,7 @@ struct pipex_session { > struct sockaddr_in6 ip6_address; /* [I] remote IPv6 address */ > int ip6_prefixlen; /* [I] remote IPv6 prefixlen */ > > - u_int ifindex; /* [N] interface index */ > + u_int ifindex; /* [A] interface index */ > void *ownersc; /* [I] owner context */ > > uint32_t ppp_flags; /* [I] configure flags */ > @@ -469,3 +469,4 @@ int pipex_ppp_enqueue > void pipex_timer_start (void); > void pipex_timer_stop (void); > void pipex_timer (void *); > +int pipex_if_enqueue(struct ifnet *, struct mbuf *);
