> On 4 Aug 2022, at 01:36, Alexander Bluhm <[email protected]> wrote:
> 
> Hi,
> 
> Divert packet has a strange design as it calls the protocol layer
> directly from pf.  As pf runs in parallel, divert_packet() has a
> XXXSMP comment and kernel lock.  This gives the opportunity to make
> experiments.
> 
> I added a mutex in inet PCB layer.  It can be taken directly in
> protocol input functions or from socket via PRU_LOCK.  This makes
> it possible to run soreceive() in parallel.
> 
> I see a speed increase and it runs stable.  This diff is not ready
> for commit.  Next I want to work on UDP input, to see if my idea
> can be more generalized.  Just showing what I have now.
> 
> bluhm
> 

sblock()/sbunlock() relie on exclusive socket or layer lock,
which protects `sb_flags’ modification. This diff breaks them,
because nothing stops concurrent soreceive() threads to set
SB_LOCK flag and and be sure that socket buffer is locked.

sblock(struct socket *so, struct sockbuf *sb, int wait)
{       
        int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
        
        soassertlocked(so);
        
        if ((sb->sb_flags & SB_LOCK) == 0) {
                sb->sb_flags |= SB_LOCK;
                return (0);
        }
        if (wait & M_NOWAIT)
                return (EWOULDBLOCK);
        
        while (sb->sb_flags & SB_LOCK) {
                sb->sb_flags |= SB_WANT; 
                error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", INFSLP);
                if (error)
                        return (error);
        }
        sb->sb_flags |= SB_LOCK;
        return (0);
}

I started the work to allow socket buffer locking without socket
lock held, but the work is far from the diff expose.

> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.280
> diff -u -p -r1.280 uipc_socket.c
> --- kern/uipc_socket.c        25 Jul 2022 07:28:22 -0000      1.280
> +++ kern/uipc_socket.c        2 Aug 2022 13:54:03 -0000
> @@ -836,10 +836,10 @@ bad:
>       if (mp)
>               *mp = NULL;
> 
> -     solock(so);
> +     solock_shared(so);
> restart:
>       if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
> -             sounlock(so);
> +             sounlock_shared(so);
>               return (error);
>       }
> 
> @@ -907,7 +907,7 @@ restart:
>               sbunlock(so, &so->so_rcv);
>               error = sbwait(so, &so->so_rcv);
>               if (error) {
> -                     sounlock(so);
> +                     sounlock_shared(so);
>                       return (error);
>               }
>               goto restart;
> @@ -976,11 +976,11 @@ dontblock:
>                       sbsync(&so->so_rcv, nextrecord);
>                       if (controlp) {
>                               if (pr->pr_domain->dom_externalize) {
> -                                     sounlock(so);
> +                                     sounlock_shared(so);
>                                       error =
>                                           (*pr->pr_domain->dom_externalize)
>                                           (cm, controllen, flags);
> -                                     solock(so);
> +                                     solock_shared(so);
>                               }
>                               *controlp = cm;
>                       } else {
> @@ -1054,9 +1054,9 @@ dontblock:
>                       SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
>                       SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
>                       resid = uio->uio_resid;
> -                     sounlock(so);
> +                     sounlock_shared(so);
>                       uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
> -                     solock(so);
> +                     solock_shared(so);
>                       if (uio_error)
>                               uio->uio_resid = resid - len;
>               } else
> @@ -1140,7 +1140,7 @@ dontblock:
>                       error = sbwait(so, &so->so_rcv);
>                       if (error) {
>                               sbunlock(so, &so->so_rcv);
> -                             sounlock(so);
> +                             sounlock_shared(so);
>                               return (0);
>                       }
>                       if ((m = so->so_rcv.sb_mb) != NULL)
> @@ -1186,7 +1186,7 @@ dontblock:
>               *flagsp |= flags;
> release:
>       sbunlock(so, &so->so_rcv);
> -     sounlock(so);
> +     sounlock_shared(so);
>       return (error);
> }
> 
> Index: kern/uipc_socket2.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.126
> diff -u -p -r1.126 uipc_socket2.c
> --- kern/uipc_socket2.c       25 Jul 2022 07:28:22 -0000      1.126
> +++ kern/uipc_socket2.c       2 Aug 2022 19:07:22 -0000
> @@ -360,6 +360,25 @@ solock(struct socket *so)
>       }
> }
> 
> +void
> +solock_shared(struct socket *so)
> +{
> +     switch (so->so_proto->pr_domain->dom_family) {
> +     case PF_INET:
> +     case PF_INET6:
> +             if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) {
> +                     NET_RLOCK_IN_SYSCALL();
> +                     (*so->so_proto->pr_usrreq)(so, PRU_LOCK,
> +                         NULL, NULL, NULL, NULL);
> +             } else
> +                     NET_LOCK();
> +             break;
> +     default:
> +             rw_enter_write(&so->so_lock);
> +             break;
> +     }
> +}
> +
> int
> solock_persocket(struct socket *so)
> {
> @@ -403,6 +422,25 @@ sounlock(struct socket *so)
> }
> 
> void
> +sounlock_shared(struct socket *so)
> +{
> +     switch (so->so_proto->pr_domain->dom_family) {
> +     case PF_INET:
> +     case PF_INET6:
> +             if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) {
> +                     (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK,
> +                         NULL, NULL, NULL, NULL);
> +                     NET_RUNLOCK_IN_SYSCALL();
> +             } else
> +                     NET_UNLOCK();
> +             break;
> +     default:
> +             rw_exit_write(&so->so_lock);
> +             break;
> +     }
> +}
> +
> +void
> soassertlocked(struct socket *so)
> {
>       switch (so->so_proto->pr_domain->dom_family) {
> @@ -425,7 +463,17 @@ sosleep_nsec(struct socket *so, void *id
>       switch (so->so_proto->pr_domain->dom_family) {
>       case PF_INET:
>       case PF_INET6:
> +             if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) &&
> +                 rw_status(&netlock) == RW_READ) {
> +                     (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK,
> +                         NULL, NULL, NULL, NULL);
> +             }
>               ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
> +             if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) &&
> +                 rw_status(&netlock) == RW_READ) {
> +                     (*so->so_proto->pr_usrreq)(so, PRU_LOCK,
> +                         NULL, NULL, NULL, NULL);
> +             }
>               break;
>       default:
>               ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
> Index: netinet/in_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
> retrieving revision 1.268
> diff -u -p -r1.268 in_pcb.c
> --- netinet/in_pcb.c  28 Jun 2022 09:32:27 -0000      1.268
> +++ netinet/in_pcb.c  2 Aug 2022 13:32:15 -0000
> @@ -236,6 +236,7 @@ in_pcballoc(struct socket *so, struct in
>       inp->inp_table = table;
>       inp->inp_socket = so;
>       refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
> +     mtx_init(&inp->inp_mtx, IPL_SOFTNET);
>       inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT;
>       inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
>       inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> retrieving revision 1.129
> diff -u -p -r1.129 in_pcb.h
> --- netinet/in_pcb.h  15 May 2022 09:12:20 -0000      1.129
> +++ netinet/in_pcb.h  2 Aug 2022 13:32:15 -0000
> @@ -79,6 +79,7 @@
>  *    I       immutable after creation
>  *    N       net lock
>  *    t       inpt_mtx                pcb table mutex
> + *   p       inpcb_mtx               pcb mutex
>  */
> 
> struct pf_state_key;
> @@ -121,6 +122,7 @@ struct inpcb {
> #define       inp_route       inp_ru.ru_route
> #define       inp_route6      inp_ru.ru_route6
>       struct    refcnt inp_refcnt;    /* refcount PCB, delay memory free */
> +     struct    mutex inp_mtx;        /* protect PCB and socket members */
>       int       inp_flags;            /* generic IP/datagram flags */
>       union {                         /* Header prototype. */
>               struct ip hu_ip;
> Index: netinet/in_proto.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
> retrieving revision 1.98
> diff -u -p -r1.98 in_proto.c
> --- netinet/in_proto.c        25 Feb 2022 23:51:03 -0000      1.98
> +++ netinet/in_proto.c        2 Aug 2022 13:54:03 -0000
> @@ -382,7 +382,7 @@ const struct protosw inetsw[] = {
>   .pr_type    = SOCK_RAW,
>   .pr_domain  = &inetdomain,
>   .pr_protocol        = IPPROTO_DIVERT,
> -  .pr_flags  = PR_ATOMIC|PR_ADDR,
> +  .pr_flags  = PR_ATOMIC|PR_ADDR|PR_MPSAFE,
>   .pr_ctloutput       = rip_ctloutput,
>   .pr_usrreq  = divert_usrreq,
>   .pr_attach  = divert_attach,
> Index: netinet/ip_divert.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
> retrieving revision 1.68
> diff -u -p -r1.68 ip_divert.c
> --- netinet/ip_divert.c       9 May 2022 19:33:46 -0000       1.68
> +++ netinet/ip_divert.c       2 Aug 2022 19:07:22 -0000
> @@ -221,22 +221,15 @@ divert_packet(struct mbuf *m, int dir, u
>               if_put(ifp);
>       }
> 
> +     mtx_enter(&inp->inp_mtx);
>       so = inp->inp_socket;
> -     /*
> -      * XXXSMP sbappendaddr() is not MP safe and this function is called
> -      * from pf with shared netlock.  To call only one sbappendaddr() from
> -      * divert_packet(), protect it with kernel lock.  All other places
> -      * call sbappendaddr() with exclusive net lock.  This blocks
> -      * divert_packet() as we have the shared lock.
> -      */
> -     KERNEL_LOCK();
>       if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
> -             KERNEL_UNLOCK();
> +             mtx_leave(&inp->inp_mtx);
>               divstat_inc(divs_fullsock);
>               goto bad;
>       }
> +     mtx_leave(&inp->inp_mtx);
>       sorwakeup(inp->inp_socket);
> -     KERNEL_UNLOCK();
> 
>       in_pcbunref(inp);
>       return;
> @@ -293,6 +286,14 @@ divert_usrreq(struct socket *so, int req
>               break;
> 
>       case PRU_SENSE:
> +             break;
> +
> +     case PRU_LOCK:
> +             mtx_enter(&inp->inp_mtx);
> +             break;
> +
> +     case PRU_UNLOCK:
> +             mtx_leave(&inp->inp_mtx);
>               break;
> 
>       case PRU_LISTEN:
> Index: sys/protosw.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
> retrieving revision 1.35
> diff -u -p -r1.35 protosw.h
> --- sys/protosw.h     25 Feb 2022 23:51:04 -0000      1.35
> +++ sys/protosw.h     2 Aug 2022 13:54:03 -0000
> @@ -108,6 +108,7 @@ struct protosw {
> #define       PR_ABRTACPTDIS  0x20            /* abort on accept(2) to 
> disconnected
>                                          socket */
> #define       PR_SPLICE       0x40            /* socket splicing is possible 
> */
> +#define      PR_MPSAFE       0x80            /* shared net lock for 
> send/recv */
> 
> /*
>  * The arguments to usrreq are:
> @@ -144,8 +145,10 @@ struct protosw {
> #define       PRU_SLOWTIMO            19      /* 500ms timeout */
> #define       PRU_PROTORCV            20      /* receive from below */
> #define       PRU_PROTOSEND           21      /* send to below */
> +#define      PRU_LOCK                22      /* lock protocol layer */
> +#define      PRU_UNLOCK              23      /* unlock protocol layer */
> 
> -#define      PRU_NREQ                22
> +#define      PRU_NREQ                24
> 
> #ifdef PRUREQUESTS
> const char *prurequests[] = {
> @@ -154,7 +157,7 @@ const char *prurequests[] = {
>       "RCVD",         "SEND",         "ABORT",        "CONTROL",
>       "SENSE",        "RCVOOB",       "SENDOOB",      "SOCKADDR",
>       "PEERADDR",     "CONNECT2",     "FASTTIMO",     "SLOWTIMO",
> -     "PROTORCV",     "PROTOSEND",
> +     "PROTORCV",     "PROTOSEND",    "LOCK",         "UNLOCK"
> };
> #endif
> 
> Index: sys/socketvar.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
> retrieving revision 1.106
> diff -u -p -r1.106 socketvar.h
> --- sys/socketvar.h   15 Jul 2022 17:20:24 -0000      1.106
> +++ sys/socketvar.h   2 Aug 2022 13:54:03 -0000
> @@ -346,9 +346,11 @@ int      sockargs(struct mbuf **, const void 
> 
> int   sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
> void  solock(struct socket *);
> +void solock_shared(struct socket *);
> int   solock_persocket(struct socket *);
> void  solock_pair(struct socket *, struct socket *);
> void  sounlock(struct socket *);
> +void sounlock_shared(struct socket *);
> 
> int   sendit(struct proc *, int, struct msghdr *, int, register_t *);
> int   recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);
> Index: sys/systm.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/systm.h,v
> retrieving revision 1.157
> diff -u -p -r1.157 systm.h
> --- sys/systm.h       12 Jul 2022 17:12:31 -0000      1.157
> +++ sys/systm.h       2 Aug 2022 19:07:22 -0000
> @@ -341,6 +341,15 @@ extern struct rwlock netlock;
> #define       NET_RLOCK_IN_IOCTL()    do { rw_enter_read(&netlock); } while 
> (0)
> #define       NET_RUNLOCK_IN_IOCTL()  do { rw_exit_read(&netlock); } while (0)
> 
> +/*
> + * Reader version of NET_LOCK() to be used in send and receive syscall.
> + *
> + * Can be grabbed instead of the exclusive version when no field
> + * protected by the NET_LOCK() is modified by the ioctl/sysctl.
> + */
> +#define      NET_RLOCK_IN_SYSCALL()  do { rw_enter_read(&netlock); } while 
> (0)
> +#define      NET_RUNLOCK_IN_SYSCALL() do { rw_exit_read(&netlock); } while 
> (0)
> +
> #ifdef DIAGNOSTIC
> 
> #define       NET_ASSERT_UNLOCKED()                                           
> \
> 

Reply via email to