On Tue, Mar 22, 2022 at 02:56:43PM +0100, Alexander Bluhm wrote:
> On Tue, Mar 22, 2022 at 02:25:08PM +0100, Claudio Jeker wrote:
> > On Tue, Mar 22, 2022 at 02:09:51PM +0100, Alexander Bluhm wrote:
> > > Hi,
> > > 
> > > syzkaller and witness found the same bug I introduced in UDP also
> > > for Raw IP.  Fix it the same was for rip and rip6.
> > > 
> > > https://syzkaller.appspot.com/bug?extid=9bac6356a881dc644265
> > > https://syzkaller.appspot.com/bug?extid=5b2679ee9be0895d26f9
> > > 
> > > ok?
> > 
> > Absolutly not a fan of this "fix". It just moves the landmine that is
> > about to explode a bit further to the left for the next person to step on.
> > The moment someone tries to run these input handlers in parallel all of
> > this will blow up. It is a workaround for now but how will we get out of
> > this in the future when the code runs in parallel up to the socket
> > layer?
> 
> Moving the problem around is the only way to make any progress.
> 
> The bug with MP forwarding I try to solve is this one.
> https://marc.info/?l=openbsd-tech&m=163857624429253&w=2
> 
> After 4 months of ideas that were denied by different people, I
> came to this solution.  Put a mutex around PCB tables.  I think
> this is necessary anyway if we want to reach parallel protocol
> processing.  Unfortunately I missed 3 of 4 places where I hold the
> mutex too long.  I am trying to fix the last 2 of them.
> 
> I do not want to delay parallel forwaring until parallel protocol
> layer is finished.  Then neither will happen.  If someone is working
> on parallel protocols, this code will blow up due to NET_ASSERT_WLOCKED().
> It has to be fixed then.  My change is delaying work to make progress
> elsewhere.  We cannot solve everything in a big commit.
> 
> Do you have a better idea?
 
No but you push this layer into a specifc direction and by that make it
harder to fix the PCB tables in a different way. I just see people
changing the NET_ASSERT_WLOCKED() without realizing the actual reason for
the exclusive netlock use.

Looking at the pcb hash problem, I have to wonder if this reinserting of
PCBs is actually resulting in a measurable performance difference. The
hash table should be large enough to keep the number of PCB per bucket low. 

One comment below.
-- 
:wq Claudio

> > > Index: netinet/raw_ip.c
> > > ===================================================================
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
> > > retrieving revision 1.125
> > > diff -u -p -r1.125 raw_ip.c
> > > --- netinet/raw_ip.c      21 Mar 2022 09:12:34 -0000      1.125
> > > +++ netinet/raw_ip.c      22 Mar 2022 12:59:05 -0000
> > > @@ -122,9 +122,9 @@ rip_input(struct mbuf **mp, int *offp, i
> > >  {
> > >   struct mbuf *m = *mp;
> > >   struct ip *ip = mtod(m, struct ip *);
> > > - struct inpcb *inp, *last = NULL;
> > > + struct inpcb *inp;
> > > + SIMPLEQ_HEAD(, inpcb) inpcblist;
> > >   struct in_addr *key;
> > > - struct mbuf *opts = NULL;
> > >   struct counters_ref ref;
> > >   uint64_t *counters;
> > >  
> > > @@ -150,7 +150,8 @@ rip_input(struct mbuf **mp, int *offp, i
> > >           }
> > >   }
> > >  #endif
> > > - NET_ASSERT_LOCKED();
> > > + NET_ASSERT_WLOCKED();
> > > + SIMPLEQ_INIT(&inpcblist);
> > >   mtx_enter(&rawcbtable.inpt_mtx);
> > >   TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
> > >           if (inp->inp_socket->so_state & SS_CANTRCVMORE)
> > > @@ -171,41 +172,16 @@ rip_input(struct mbuf **mp, int *offp, i
> > >           if (inp->inp_faddr.s_addr &&
> > >               inp->inp_faddr.s_addr != ip->ip_src.s_addr)
> > >                   continue;
> > > -         if (last) {
> > > -                 struct mbuf *n;
> > >  
> > > -                 if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
> > > -                         if (last->inp_flags & INP_CONTROLOPTS ||
> > > -                             last->inp_socket->so_options & SO_TIMESTAMP)
> > > -                                 ip_savecontrol(last, &opts, ip, n);
> > > -                         if (sbappendaddr(last->inp_socket,
> > > -                             &last->inp_socket->so_rcv,
> > > -                             sintosa(&ripsrc), n, opts) == 0) {
> > > -                                 /* should notify about lost packet */
> > > -                                 m_freem(n);
> > > -                                 m_freem(opts);
> > > -                         } else
> > > -                                 sorwakeup(last->inp_socket);
> > > -                         opts = NULL;
> > > -                 }
> > > -         }
> > > -         last = inp;
> > > +         in_pcbref(inp);
> > > +         SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
> > >   }
> > >   mtx_leave(&rawcbtable.inpt_mtx);
> > >  
> > > - if (last) {
> > > -         if (last->inp_flags & INP_CONTROLOPTS ||
> > > -             last->inp_socket->so_options & SO_TIMESTAMP)
> > > -                 ip_savecontrol(last, &opts, ip, m);
> > > -         if (sbappendaddr(last->inp_socket, &last->inp_socket->so_rcv,
> > > -             sintosa(&ripsrc), m, opts) == 0) {
> > > -                 m_freem(m);
> > > -                 m_freem(opts);
> > > -         } else
> > > -                 sorwakeup(last->inp_socket);
> > > - } else {
> > > + if (SIMPLEQ_EMPTY(&inpcblist)) {
> > >           if (ip->ip_p != IPPROTO_ICMP)
> > > -                 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 
> > > 0);
> > > +                 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
> > > +                     0, 0);
> > >           else
> > >                   m_freem(m);
> > >  
> > > @@ -213,6 +189,30 @@ rip_input(struct mbuf **mp, int *offp, i
> > >           counters[ips_noproto]++;
> > >           counters[ips_delivered]--;
> > >           counters_leave(&ref, ipcounters);
> > > + }
> > > +
> > > + while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> > > +         struct mbuf *n, *opts = NULL;
> > > +
> > > +         SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> > > +         if (SIMPLEQ_EMPTY(&inpcblist))
> > > +                 n = m;
> > > +         else
> > > +                 n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> > > +         if (n != NULL) {
> > > +                 if (inp->inp_flags & INP_CONTROLOPTS ||
> > > +                     inp->inp_socket->so_options & SO_TIMESTAMP)
> > > +                         ip_savecontrol(inp, &opts, ip, n);
> > > +                 if (sbappendaddr(inp->inp_socket,
> > > +                     &inp->inp_socket->so_rcv,
> > > +                     sintosa(&ripsrc), n, opts) == 0) {
> > > +                         /* should notify about lost packet */
> > > +                         m_freem(n);
> > > +                         m_freem(opts);
> > > +                 } else
> > > +                         sorwakeup(inp->inp_socket);
> > > +         }
> > > +         in_pcbunref(inp);
> > >   }
> > >   return IPPROTO_DONE;
> > >  }
> > > Index: netinet6/raw_ip6.c
> > > ===================================================================
> > > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
> > > retrieving revision 1.145
> > > diff -u -p -r1.145 raw_ip6.c
> > > --- netinet6/raw_ip6.c    21 Mar 2022 09:12:34 -0000      1.145
> > > +++ netinet6/raw_ip6.c    22 Mar 2022 12:59:05 -0000
> > > @@ -121,10 +121,9 @@ rip6_input(struct mbuf **mp, int *offp, 
> > >   struct mbuf *m = *mp;
> > >   struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
> > >   struct inpcb *in6p;
> > > - struct inpcb *last = NULL;
> > > + SIMPLEQ_HEAD(, inpcb) inpcblist;
> > >   struct in6_addr *key;
> > >   struct sockaddr_in6 rip6src;
> > > - struct mbuf *opts = NULL;
> > >  
> > >   KASSERT(af == AF_INET6);
> > >  
> > > @@ -156,7 +155,8 @@ rip6_input(struct mbuf **mp, int *offp, 
> > >           }
> > >   }
> > >  #endif
> > > - NET_ASSERT_LOCKED();
> > > + NET_ASSERT_WLOCKED();
> > > + SIMPLEQ_INIT(&inpcblist);
> > >   mtx_enter(&rawin6pcbtable.inpt_mtx);
> > >   TAILQ_FOREACH(in6p, &rawin6pcbtable.inpt_queue, inp_queue) {
> > >           if (in6p->inp_socket->so_state & SS_CANTRCVMORE)
> > > @@ -183,7 +183,7 @@ rip6_input(struct mbuf **mp, int *offp, 
> > >                       sizeof(*icmp6));
> > >                   if (icmp6 == NULL) {
> > >                           mtx_leave(&rawin6pcbtable.inpt_mtx);
> > > -                         return IPPROTO_DONE;
> > > +                         goto bad;
> > >                   }
> > >                   if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
> > >                       in6p->inp_icmp6filt))

This part of the code should be split up:
                if (proto == IPPROTO_ICMPV6 && in6p->inp_icmp6filt) {
                        struct icmp6_hdr *icmp6;

                        IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp,
                            sizeof(*icmp6));
                        if (icmp6 == NULL)
                                return IPPROTO_DONE;
                        if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
                            in6p->inp_icmp6filt))
                                continue;
                }

At least the one bit that can fail can be moved outside of this loop:

        if (proto == IPPROTO_ICMPV6) {
                IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp,
                    sizeof(*icmp6));
                if (icmp6 == NULL)
                        return IPPROTO_DONE;
        }
and then in the loop:
                if (proto == IPPROTO_ICMPV6 && icmp6 && in6p->inp_icmp6filt)
                        if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
                            in6p->inp_icmp6filt))
                                continue;

With that there is no need for the goto bad case that needs to unroll the
list.

Apart from that OK claudio@

> > > @@ -206,42 +206,13 @@ rip6_input(struct mbuf **mp, int *offp, 
> > >                           continue;
> > >                   }
> > >           }
> > > -         if (last) {
> > > -                 struct  mbuf *n;
> > > -                 if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
> > > -                         if (last->inp_flags & IN6P_CONTROLOPTS)
> > > -                                 ip6_savecontrol(last, n, &opts);
> > > -                         /* strip intermediate headers */
> > > -                         m_adj(n, *offp);
> > > -                         if (sbappendaddr(last->inp_socket,
> > > -                             &last->inp_socket->so_rcv,
> > > -                             sin6tosa(&rip6src), n, opts) == 0) {
> > > -                                 /* should notify about lost packet */
> > > -                                 m_freem(n);
> > > -                                 m_freem(opts);
> > > -                                 rip6stat_inc(rip6s_fullsock);
> > > -                         } else
> > > -                                 sorwakeup(last->inp_socket);
> > > -                         opts = NULL;
> > > -                 }
> > > -         }
> > > -         last = in6p;
> > > +
> > > +         in_pcbref(in6p);
> > > +         SIMPLEQ_INSERT_TAIL(&inpcblist, in6p, inp_notify);
> > >   }
> > >   mtx_leave(&rawin6pcbtable.inpt_mtx);
> > >  
> > > - if (last) {
> > > -         if (last->inp_flags & IN6P_CONTROLOPTS)
> > > -                 ip6_savecontrol(last, m, &opts);
> > > -         /* strip intermediate headers */
> > > -         m_adj(m, *offp);
> > > -         if (sbappendaddr(last->inp_socket, &last->inp_socket->so_rcv,
> > > -             sin6tosa(&rip6src), m, opts) == 0) {
> > > -                 m_freem(m);
> > > -                 m_freem(opts);
> > > -                 rip6stat_inc(rip6s_fullsock);
> > > -         } else
> > > -                 sorwakeup(last->inp_socket);
> > > - } else {
> > > + if (SIMPLEQ_EMPTY(&inpcblist)) {
> > >           struct counters_ref ref;
> > >           uint64_t *counters;
> > >  
> > > @@ -261,6 +232,39 @@ rip6_input(struct mbuf **mp, int *offp, 
> > >           counters = counters_enter(&ref, ip6counters);
> > >           counters[ip6s_delivered]--;
> > >           counters_leave(&ref, ip6counters);
> > > + }
> > > +
> > > + while ((in6p = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> > > +         struct mbuf *n, *opts = NULL;
> > > +
> > > +         SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> > > +         if (SIMPLEQ_EMPTY(&inpcblist))
> > > +                 n = m;
> > > +         else
> > > +                 n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
> > > +         if (n != NULL) {
> > > +                 if (in6p->inp_flags & IN6P_CONTROLOPTS)
> > > +                         ip6_savecontrol(in6p, n, &opts);
> > > +                 /* strip intermediate headers */
> > > +                 m_adj(n, *offp);
> > > +                 if (sbappendaddr(in6p->inp_socket,
> > > +                     &in6p->inp_socket->so_rcv,
> > > +                     sin6tosa(&rip6src), n, opts) == 0) {
> > > +                         /* should notify about lost packet */
> > > +                         m_freem(n);
> > > +                         m_freem(opts);
> > > +                         rip6stat_inc(rip6s_fullsock);
> > > +                 } else
> > > +                         sorwakeup(in6p->inp_socket);
> > > +         }
> > > +         in_pcbunref(in6p);
> > > + }
> > > + return IPPROTO_DONE;
> > > +
> > > + bad:
> > > + while ((in6p = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
> > > +         SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
> > > +         in_pcbunref(in6p);
> > >   }
> > >   return IPPROTO_DONE;
> > >  }
> > > 
> > 
> > -- 
> > :wq Claudio
> 

Reply via email to