On Tue, Mar 22, 2022 at 02:56:43PM +0100, Alexander Bluhm wrote:
> On Tue, Mar 22, 2022 at 02:25:08PM +0100, Claudio Jeker wrote:
> > On Tue, Mar 22, 2022 at 02:09:51PM +0100, Alexander Bluhm wrote:
> > > Hi,
> > >
> > > syzkaller and witness found the same bug I introduced in UDP also
> > > for Raw IP. Fix it the same was for rip and rip6.
> > >
> > > https://syzkaller.appspot.com/bug?extid=9bac6356a881dc644265
> > > https://syzkaller.appspot.com/bug?extid=5b2679ee9be0895d26f9
> > >
> > > ok?
> >
> > Absolutly not a fan of this "fix". It just moves the landmine that is
> > about to explode a bit further to the left for the next person to step on.
> > The moment someone tries to run these input handlers in parallel all of
> > this will blow up. It is a workaround for now but how will we get out of
> > this in the future when the code runs in parallel up to the socket
> > layer?
>
> Moving the problem around is the only way to make any progress.
>
> The bug with MP forwarding I try to solve is this one.
> https://marc.info/?l=openbsd-tech&m=163857624429253&w=2
>
> After 4 months of ideas that were denied by different people, I
> came to this solution. Put a mutex around PCB tables. I think
> this is necessary anyway if we want to reach parallel protocol
> processing. Unfortunately I missed 3 of 4 places where I hold the
> mutex too long. I am trying to fix the last 2 of them.
>
> I do not want to delay parallel forwaring until parallel protocol
> layer is finished. Then neither will happen. If someone is working
> on parallel protocols, this code will blow up due to NET_ASSERT_WLOCKED().
> It has to be fixed then. My change is delaying work to make progress
> elsewhere. We cannot solve everything in a big commit.
>
> Do you have a better idea?
No but you push this layer into a specifc direction and by that make it
harder to fix the PCB tables in a different way. I just see people
changing the NET_ASSERT_WLOCKED() without realizing the actual reason for
the exclusive netlock use.
Looking at the pcb hash problem, I have to wonder if this reinserting of
PCBs is actually resulting in a measurable performance difference. The
hash table should be large enough to keep the number of PCB per bucket low.
One comment below.
--
:wq Claudio
> > > Index: netinet/raw_ip.c
> > > ===================================================================
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
> > > retrieving revision 1.125
> > > diff -u -p -r1.125 raw_ip.c
> > > --- netinet/raw_ip.c 21 Mar 2022 09:12:34 -0000 1.125
> > > +++ netinet/raw_ip.c 22 Mar 2022 12:59:05 -0000
> > > @@ -122,9 +122,9 @@ rip_input(struct mbuf **mp, int *offp, i
> > > {
> > > struct mbuf *m = *mp;
> > > struct ip *ip = mtod(m, struct ip *);
> > > - struct inpcb *inp, *last = NULL;
> > > + struct inpcb *inp;
> > > + SIMPLEQ_HEAD(, inpcb) inpcblist;
> > > struct in_addr *key;
> > > - struct mbuf *opts = NULL;
> > > struct counters_ref ref;
> > > uint64_t *counters;
> > >
> > > @@ -150,7 +150,8 @@ rip_input(struct mbuf **mp, int *offp, i
> > > }
> > > }
> > > #endif
> > > - NET_ASSERT_LOCKED();
> > > + NET_ASSERT_WLOCKED();
> > > + SIMPLEQ_INIT(&inpcblist);
> > > mtx_enter(&rawcbtable.inpt_mtx);
> > > TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> > > if (inp->inp_socket->so_state & SS_CANTRCVMORE)
> > > @@ -171,41 +172,16 @@ rip_input(struct mbuf **mp, int *offp, i
> > > if (inp->inp_faddr.s_addr &&
> > > inp->inp_faddr.s_addr != ip->ip_src.s_addr)
> > > continue;
> > > - if (last) {
> > > - struct mbuf *n;
> > >
> > > - if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
> > > - if (last->inp_flags & INP_CONTROLOPTS ||
> > > - last->inp_socket->so_options & SO_TIMESTAMP)
> > > - ip_savecontrol(last, &opts, ip, n);
> > > - if (sbappendaddr(last->inp_socket,
> > > - &last->inp_socket->so_rcv,
> > > - sintosa(&ripsrc), n, opts) == 0) {
> > > - /* should notify about lost packet */
> > > - m_freem(n);
> > > - m_freem(opts);
> > > - } else
> > > - sorwakeup(last->inp_socket);
> > > - opts = NULL;
> > > - }
> > > - }
> > > - last = inp;
> > > + in_pcbref(inp);
> > > + SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
> > > }
> > > mtx_leave(&rawcbtable.inpt_mtx);
> > >
> > > - if (last) {
> > > - if (last->inp_flags & INP_CONTROLOPTS ||
> > > - last->inp_socket->so_options & SO_TIMESTAMP)
> > > - ip_savecontrol(last, &opts, ip, m);
> > > - if (sbappendaddr(last->inp_socket, &last->inp_socket->so_rcv,
> > > - sintosa(&ripsrc), m, opts) == 0) {
> > > - m_freem(m);
> > > - m_freem(opts);
> > > - } else
> > > - sorwakeup(last->inp_socket);
> > > - } else {
> > > + if (SIMPLEQ_EMPTY(&inpcblist)) {
> > > if (ip->ip_p != IPPROTO_ICMP)
> > > - icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0,
> > > 0);
> > > + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
> > > + 0, 0);
> > > else
> > > m_freem(m);
> > >
> > > @@ -213,6 +189,30 @@ rip_input(struct mbuf **mp, int *offp, i
> > > counters[ips_noproto]++;
> > > counters[ips_delivered]--;
> > > counters_leave(&ref, ipcounters);
> > > + }
> > > +
> > > + while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> > > + struct mbuf *n, *opts = NULL;
> > > +
> > > + SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> > > + if (SIMPLEQ_EMPTY(&inpcblist))
> > > + n = m;
> > > + else
> > > + n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> > > + if (n != NULL) {
> > > + if (inp->inp_flags & INP_CONTROLOPTS ||
> > > + inp->inp_socket->so_options & SO_TIMESTAMP)
> > > + ip_savecontrol(inp, &opts, ip, n);
> > > + if (sbappendaddr(inp->inp_socket,
> > > + &inp->inp_socket->so_rcv,
> > > + sintosa(&ripsrc), n, opts) == 0) {
> > > + /* should notify about lost packet */
> > > + m_freem(n);
> > > + m_freem(opts);
> > > + } else
> > > + sorwakeup(inp->inp_socket);
> > > + }
> > > + in_pcbunref(inp);
> > > }
> > > return IPPROTO_DONE;
> > > }
> > > Index: netinet6/raw_ip6.c
> > > ===================================================================
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
> > > retrieving revision 1.145
> > > diff -u -p -r1.145 raw_ip6.c
> > > --- netinet6/raw_ip6.c 21 Mar 2022 09:12:34 -0000 1.145
> > > +++ netinet6/raw_ip6.c 22 Mar 2022 12:59:05 -0000
> > > @@ -121,10 +121,9 @@ rip6_input(struct mbuf **mp, int *offp,
> > > struct mbuf *m = *mp;
> > > struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
> > > struct inpcb *in6p;
> > > - struct inpcb *last = NULL;
> > > + SIMPLEQ_HEAD(, inpcb) inpcblist;
> > > struct in6_addr *key;
> > > struct sockaddr_in6 rip6src;
> > > - struct mbuf *opts = NULL;
> > >
> > > KASSERT(af == AF_INET6);
> > >
> > > @@ -156,7 +155,8 @@ rip6_input(struct mbuf **mp, int *offp,
> > > }
> > > }
> > > #endif
> > > - NET_ASSERT_LOCKED();
> > > + NET_ASSERT_WLOCKED();
> > > + SIMPLEQ_INIT(&inpcblist);
> > > mtx_enter(&rawin6pcbtable.inpt_mtx);
> > > TAILQ_FOREACH(in6p, &rawin6pcbtable.inpt_queue, inp_queue) {
> > > if (in6p->inp_socket->so_state & SS_CANTRCVMORE)
> > > @@ -183,7 +183,7 @@ rip6_input(struct mbuf **mp, int *offp,
> > > sizeof(*icmp6));
> > > if (icmp6 == NULL) {
> > > mtx_leave(&rawin6pcbtable.inpt_mtx);
> > > - return IPPROTO_DONE;
> > > + goto bad;
> > > }
> > > if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
> > > in6p->inp_icmp6filt))
This part of the code should be split up:
if (proto == IPPROTO_ICMPV6 && in6p->inp_icmp6filt) {
struct icmp6_hdr *icmp6;
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp,
sizeof(*icmp6));
if (icmp6 == NULL)
return IPPROTO_DONE;
if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
in6p->inp_icmp6filt))
continue;
}
At least the one bit that can fail can be moved outside of this loop:
if (proto == IPPROTO_ICMPV6) {
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp,
sizeof(*icmp6));
if (icmp6 == NULL)
return IPPROTO_DONE;
}
and then in the loop:
if (proto == IPPROTO_ICMPV6 && icmp6 && in6p->inp_icmp6filt)
if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
in6p->inp_icmp6filt))
continue;
With that there is no need for the goto bad case that needs to unroll the
list.
Apart from that OK claudio@
> > > @@ -206,42 +206,13 @@ rip6_input(struct mbuf **mp, int *offp,
> > > continue;
> > > }
> > > }
> > > - if (last) {
> > > - struct mbuf *n;
> > > - if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
> > > - if (last->inp_flags & IN6P_CONTROLOPTS)
> > > - ip6_savecontrol(last, n, &opts);
> > > - /* strip intermediate headers */
> > > - m_adj(n, *offp);
> > > - if (sbappendaddr(last->inp_socket,
> > > - &last->inp_socket->so_rcv,
> > > - sin6tosa(&rip6src), n, opts) == 0) {
> > > - /* should notify about lost packet */
> > > - m_freem(n);
> > > - m_freem(opts);
> > > - rip6stat_inc(rip6s_fullsock);
> > > - } else
> > > - sorwakeup(last->inp_socket);
> > > - opts = NULL;
> > > - }
> > > - }
> > > - last = in6p;
> > > +
> > > + in_pcbref(in6p);
> > > + SIMPLEQ_INSERT_TAIL(&inpcblist, in6p, inp_notify);
> > > }
> > > mtx_leave(&rawin6pcbtable.inpt_mtx);
> > >
> > > - if (last) {
> > > - if (last->inp_flags & IN6P_CONTROLOPTS)
> > > - ip6_savecontrol(last, m, &opts);
> > > - /* strip intermediate headers */
> > > - m_adj(m, *offp);
> > > - if (sbappendaddr(last->inp_socket, &last->inp_socket->so_rcv,
> > > - sin6tosa(&rip6src), m, opts) == 0) {
> > > - m_freem(m);
> > > - m_freem(opts);
> > > - rip6stat_inc(rip6s_fullsock);
> > > - } else
> > > - sorwakeup(last->inp_socket);
> > > - } else {
> > > + if (SIMPLEQ_EMPTY(&inpcblist)) {
> > > struct counters_ref ref;
> > > uint64_t *counters;
> > >
> > > @@ -261,6 +232,39 @@ rip6_input(struct mbuf **mp, int *offp,
> > > counters = counters_enter(&ref, ip6counters);
> > > counters[ip6s_delivered]--;
> > > counters_leave(&ref, ip6counters);
> > > + }
> > > +
> > > + while ((in6p = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> > > + struct mbuf *n, *opts = NULL;
> > > +
> > > + SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> > > + if (SIMPLEQ_EMPTY(&inpcblist))
> > > + n = m;
> > > + else
> > > + n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> > > + if (n != NULL) {
> > > + if (in6p->inp_flags & IN6P_CONTROLOPTS)
> > > + ip6_savecontrol(in6p, n, &opts);
> > > + /* strip intermediate headers */
> > > + m_adj(n, *offp);
> > > + if (sbappendaddr(in6p->inp_socket,
> > > + &in6p->inp_socket->so_rcv,
> > > + sin6tosa(&rip6src), n, opts) == 0) {
> > > + /* should notify about lost packet */
> > > + m_freem(n);
> > > + m_freem(opts);
> > > + rip6stat_inc(rip6s_fullsock);
> > > + } else
> > > + sorwakeup(in6p->inp_socket);
> > > + }
> > > + in_pcbunref(in6p);
> > > + }
> > > + return IPPROTO_DONE;
> > > +
> > > + bad:
> > > + while ((in6p = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> > > + SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> > > + in_pcbunref(in6p);
> > > }
> > > return IPPROTO_DONE;
> > > }
> > >
> >
> > --
> > :wq Claudio
>