Also, I like to have exclusive layer locks like `tcp_lock’,
`udp_lock’, etc.. And take them with shared netlock held as the
first step of inet sockets unlocking.

> On 4 Aug 2022, at 02:13, Vitaliy Makkoveev <[email protected]> wrote:
> 
>> On 4 Aug 2022, at 01:36, Alexander Bluhm <[email protected]> wrote:
>> 
>> Hi,
>> 
>> Divert packet has a strange design as it calls the protocol layer
>> directly from pf.  As pf runs in parallel, divert_packet() has a
>> XXXSMP comment and kernel lock.  This gives the opportunity to make
>> experiments.
>> 
>> I added a mutex in inet PCB layer.  It can be taken directly in
>> protocol input functions or from socket via PRU_LOCK.  This makes
>> it possible to run soreceive() in parallel.
>> 
>> I see a speed increase and it runs stable.  This diff is not ready
>> for commit.  Next I want to work on UDP input, to see if my idea
>> can be more generalized.  Just showing what I have now.
>> 
>> bluhm
>> 
> 
> sblock()/sbunlock() relie on exclusive socket or layer lock,
> which protects `sb_flags’ modification. This diff breaks them,
> because nothing stops concurrent soreceive() threads to set
> SB_LOCK flag and and be sure that socket buffer is locked.
> 
> sblock(struct socket *so, struct sockbuf *sb, int wait)
> {       
>        int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
> 
>        soassertlocked(so);
> 
>        if ((sb->sb_flags & SB_LOCK) == 0) {
>                sb->sb_flags |= SB_LOCK;
>                return (0);
>        }
>        if (wait & M_NOWAIT)
>                return (EWOULDBLOCK);
> 
>        while (sb->sb_flags & SB_LOCK) {
>                sb->sb_flags |= SB_WANT; 
>                error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", 
> INFSLP);
>                if (error)
>                        return (error);
>        }
>        sb->sb_flags |= SB_LOCK;
>        return (0);
> }
> 
> I started the work to allow socket buffer locking without socket
> lock held, but the work is far from the diff expose.
> 
>> Index: kern/uipc_socket.c
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
>> retrieving revision 1.280
>> diff -u -p -r1.280 uipc_socket.c
>> --- kern/uipc_socket.c       25 Jul 2022 07:28:22 -0000      1.280
>> +++ kern/uipc_socket.c       2 Aug 2022 13:54:03 -0000
>> @@ -836,10 +836,10 @@ bad:
>>      if (mp)
>>              *mp = NULL;
>> 
>> -    solock(so);
>> +    solock_shared(so);
>> restart:
>>      if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
>> -            sounlock(so);
>> +            sounlock_shared(so);
>>              return (error);
>>      }
>> 
>> @@ -907,7 +907,7 @@ restart:
>>              sbunlock(so, &so->so_rcv);
>>              error = sbwait(so, &so->so_rcv);
>>              if (error) {
>> -                    sounlock(so);
>> +                    sounlock_shared(so);
>>                      return (error);
>>              }
>>              goto restart;
>> @@ -976,11 +976,11 @@ dontblock:
>>                      sbsync(&so->so_rcv, nextrecord);
>>                      if (controlp) {
>>                              if (pr->pr_domain->dom_externalize) {
>> -                                    sounlock(so);
>> +                                    sounlock_shared(so);
>>                                      error =
>>                                          (*pr->pr_domain->dom_externalize)
>>                                          (cm, controllen, flags);
>> -                                    solock(so);
>> +                                    solock_shared(so);
>>                              }
>>                              *controlp = cm;
>>                      } else {
>> @@ -1054,9 +1054,9 @@ dontblock:
>>                      SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
>>                      SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
>>                      resid = uio->uio_resid;
>> -                    sounlock(so);
>> +                    sounlock_shared(so);
>>                      uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
>> -                    solock(so);
>> +                    solock_shared(so);
>>                      if (uio_error)
>>                              uio->uio_resid = resid - len;
>>              } else
>> @@ -1140,7 +1140,7 @@ dontblock:
>>                      error = sbwait(so, &so->so_rcv);
>>                      if (error) {
>>                              sbunlock(so, &so->so_rcv);
>> -                            sounlock(so);
>> +                            sounlock_shared(so);
>>                              return (0);
>>                      }
>>                      if ((m = so->so_rcv.sb_mb) != NULL)
>> @@ -1186,7 +1186,7 @@ dontblock:
>>              *flagsp |= flags;
>> release:
>>      sbunlock(so, &so->so_rcv);
>> -    sounlock(so);
>> +    sounlock_shared(so);
>>      return (error);
>> }
>> 
>> Index: kern/uipc_socket2.c
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
>> retrieving revision 1.126
>> diff -u -p -r1.126 uipc_socket2.c
>> --- kern/uipc_socket2.c      25 Jul 2022 07:28:22 -0000      1.126
>> +++ kern/uipc_socket2.c      2 Aug 2022 19:07:22 -0000
>> @@ -360,6 +360,25 @@ solock(struct socket *so)
>>      }
>> }
>> 
>> +void
>> +solock_shared(struct socket *so)
>> +{
>> +    switch (so->so_proto->pr_domain->dom_family) {
>> +    case PF_INET:
>> +    case PF_INET6:
>> +            if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) {
>> +                    NET_RLOCK_IN_SYSCALL();
>> +                    (*so->so_proto->pr_usrreq)(so, PRU_LOCK,
>> +                        NULL, NULL, NULL, NULL);
>> +            } else
>> +                    NET_LOCK();
>> +            break;
>> +    default:
>> +            rw_enter_write(&so->so_lock);
>> +            break;
>> +    }
>> +}
>> +
>> int
>> solock_persocket(struct socket *so)
>> {
>> @@ -403,6 +422,25 @@ sounlock(struct socket *so)
>> }
>> 
>> void
>> +sounlock_shared(struct socket *so)
>> +{
>> +    switch (so->so_proto->pr_domain->dom_family) {
>> +    case PF_INET:
>> +    case PF_INET6:
>> +            if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) {
>> +                    (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK,
>> +                        NULL, NULL, NULL, NULL);
>> +                    NET_RUNLOCK_IN_SYSCALL();
>> +            } else
>> +                    NET_UNLOCK();
>> +            break;
>> +    default:
>> +            rw_exit_write(&so->so_lock);
>> +            break;
>> +    }
>> +}
>> +
>> +void
>> soassertlocked(struct socket *so)
>> {
>>      switch (so->so_proto->pr_domain->dom_family) {
>> @@ -425,7 +463,17 @@ sosleep_nsec(struct socket *so, void *id
>>      switch (so->so_proto->pr_domain->dom_family) {
>>      case PF_INET:
>>      case PF_INET6:
>> +            if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) &&
>> +                rw_status(&netlock) == RW_READ) {
>> +                    (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK,
>> +                        NULL, NULL, NULL, NULL);
>> +            }
>>              ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
>> +            if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) &&
>> +                rw_status(&netlock) == RW_READ) {
>> +                    (*so->so_proto->pr_usrreq)(so, PRU_LOCK,
>> +                        NULL, NULL, NULL, NULL);
>> +            }
>>              break;
>>      default:
>>              ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
>> Index: netinet/in_pcb.c
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
>> retrieving revision 1.268
>> diff -u -p -r1.268 in_pcb.c
>> --- netinet/in_pcb.c 28 Jun 2022 09:32:27 -0000      1.268
>> +++ netinet/in_pcb.c 2 Aug 2022 13:32:15 -0000
>> @@ -236,6 +236,7 @@ in_pcballoc(struct socket *so, struct in
>>      inp->inp_table = table;
>>      inp->inp_socket = so;
>>      refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
>> +    mtx_init(&inp->inp_mtx, IPL_SOFTNET);
>>      inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT;
>>      inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
>>      inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
>> Index: netinet/in_pcb.h
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
>> retrieving revision 1.129
>> diff -u -p -r1.129 in_pcb.h
>> --- netinet/in_pcb.h 15 May 2022 09:12:20 -0000      1.129
>> +++ netinet/in_pcb.h 2 Aug 2022 13:32:15 -0000
>> @@ -79,6 +79,7 @@
>> *    I       immutable after creation
>> *    N       net lock
>> *    t       inpt_mtx                pcb table mutex
>> + *  p       inpcb_mtx               pcb mutex
>> */
>> 
>> struct pf_state_key;
>> @@ -121,6 +122,7 @@ struct inpcb {
>> #define      inp_route       inp_ru.ru_route
>> #define      inp_route6      inp_ru.ru_route6
>>      struct    refcnt inp_refcnt;    /* refcount PCB, delay memory free */
>> +    struct    mutex inp_mtx;        /* protect PCB and socket members */
>>      int       inp_flags;            /* generic IP/datagram flags */
>>      union {                         /* Header prototype. */
>>              struct ip hu_ip;
>> Index: netinet/in_proto.c
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
>> retrieving revision 1.98
>> diff -u -p -r1.98 in_proto.c
>> --- netinet/in_proto.c       25 Feb 2022 23:51:03 -0000      1.98
>> +++ netinet/in_proto.c       2 Aug 2022 13:54:03 -0000
>> @@ -382,7 +382,7 @@ const struct protosw inetsw[] = {
>>  .pr_type    = SOCK_RAW,
>>  .pr_domain  = &inetdomain,
>>  .pr_protocol        = IPPROTO_DIVERT,
>> -  .pr_flags = PR_ATOMIC|PR_ADDR,
>> +  .pr_flags = PR_ATOMIC|PR_ADDR|PR_MPSAFE,
>>  .pr_ctloutput       = rip_ctloutput,
>>  .pr_usrreq  = divert_usrreq,
>>  .pr_attach  = divert_attach,
>> Index: netinet/ip_divert.c
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
>> retrieving revision 1.68
>> diff -u -p -r1.68 ip_divert.c
>> --- netinet/ip_divert.c      9 May 2022 19:33:46 -0000       1.68
>> +++ netinet/ip_divert.c      2 Aug 2022 19:07:22 -0000
>> @@ -221,22 +221,15 @@ divert_packet(struct mbuf *m, int dir, u
>>              if_put(ifp);
>>      }
>> 
>> +    mtx_enter(&inp->inp_mtx);
>>      so = inp->inp_socket;
>> -    /*
>> -     * XXXSMP sbappendaddr() is not MP safe and this function is called
>> -     * from pf with shared netlock.  To call only one sbappendaddr() from
>> -     * divert_packet(), protect it with kernel lock.  All other places
>> -     * call sbappendaddr() with exclusive net lock.  This blocks
>> -     * divert_packet() as we have the shared lock.
>> -     */
>> -    KERNEL_LOCK();
>>      if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
>> -            KERNEL_UNLOCK();
>> +            mtx_leave(&inp->inp_mtx);
>>              divstat_inc(divs_fullsock);
>>              goto bad;
>>      }
>> +    mtx_leave(&inp->inp_mtx);
>>      sorwakeup(inp->inp_socket);
>> -    KERNEL_UNLOCK();
>> 
>>      in_pcbunref(inp);
>>      return;
>> @@ -293,6 +286,14 @@ divert_usrreq(struct socket *so, int req
>>              break;
>> 
>>      case PRU_SENSE:
>> +            break;
>> +
>> +    case PRU_LOCK:
>> +            mtx_enter(&inp->inp_mtx);
>> +            break;
>> +
>> +    case PRU_UNLOCK:
>> +            mtx_leave(&inp->inp_mtx);
>>              break;
>> 
>>      case PRU_LISTEN:
>> Index: sys/protosw.h
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
>> retrieving revision 1.35
>> diff -u -p -r1.35 protosw.h
>> --- sys/protosw.h    25 Feb 2022 23:51:04 -0000      1.35
>> +++ sys/protosw.h    2 Aug 2022 13:54:03 -0000
>> @@ -108,6 +108,7 @@ struct protosw {
>> #define      PR_ABRTACPTDIS  0x20            /* abort on accept(2) to 
>> disconnected
>>                                         socket */
>> #define      PR_SPLICE       0x40            /* socket splicing is possible 
>> */
>> +#define     PR_MPSAFE       0x80            /* shared net lock for 
>> send/recv */
>> 
>> /*
>> * The arguments to usrreq are:
>> @@ -144,8 +145,10 @@ struct protosw {
>> #define      PRU_SLOWTIMO            19      /* 500ms timeout */
>> #define      PRU_PROTORCV            20      /* receive from below */
>> #define      PRU_PROTOSEND           21      /* send to below */
>> +#define     PRU_LOCK                22      /* lock protocol layer */
>> +#define     PRU_UNLOCK              23      /* unlock protocol layer */
>> 
>> -#define     PRU_NREQ                22
>> +#define     PRU_NREQ                24
>> 
>> #ifdef PRUREQUESTS
>> const char *prurequests[] = {
>> @@ -154,7 +157,7 @@ const char *prurequests[] = {
>>      "RCVD",         "SEND",         "ABORT",        "CONTROL",
>>      "SENSE",        "RCVOOB",       "SENDOOB",      "SOCKADDR",
>>      "PEERADDR",     "CONNECT2",     "FASTTIMO",     "SLOWTIMO",
>> -    "PROTORCV",     "PROTOSEND",
>> +    "PROTORCV",     "PROTOSEND",    "LOCK",         "UNLOCK"
>> };
>> #endif
>> 
>> Index: sys/socketvar.h
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
>> retrieving revision 1.106
>> diff -u -p -r1.106 socketvar.h
>> --- sys/socketvar.h  15 Jul 2022 17:20:24 -0000      1.106
>> +++ sys/socketvar.h  2 Aug 2022 13:54:03 -0000
>> @@ -346,9 +346,11 @@ int     sockargs(struct mbuf **, const void 
>> 
>> int  sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
>> void solock(struct socket *);
>> +void        solock_shared(struct socket *);
>> int  solock_persocket(struct socket *);
>> void solock_pair(struct socket *, struct socket *);
>> void sounlock(struct socket *);
>> +void        sounlock_shared(struct socket *);
>> 
>> int  sendit(struct proc *, int, struct msghdr *, int, register_t *);
>> int  recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);
>> Index: sys/systm.h
>> ===================================================================
>> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/systm.h,v
>> retrieving revision 1.157
>> diff -u -p -r1.157 systm.h
>> --- sys/systm.h      12 Jul 2022 17:12:31 -0000      1.157
>> +++ sys/systm.h      2 Aug 2022 19:07:22 -0000
>> @@ -341,6 +341,15 @@ extern struct rwlock netlock;
>> #define      NET_RLOCK_IN_IOCTL()    do { rw_enter_read(&netlock); } while 
>> (0)
>> #define      NET_RUNLOCK_IN_IOCTL()  do { rw_exit_read(&netlock); } while (0)
>> 
>> +/*
>> + * Reader version of NET_LOCK() to be used in send and receive syscall.
>> + *
>> + * Can be grabbed instead of the exclusive version when no field
>> + * protected by the NET_LOCK() is modified by the ioctl/sysctl.
>> + */
>> +#define     NET_RLOCK_IN_SYSCALL()  do { rw_enter_read(&netlock); } while 
>> (0)
>> +#define     NET_RUNLOCK_IN_SYSCALL() do { rw_exit_read(&netlock); } while 
>> (0)
>> +
>> #ifdef DIAGNOSTIC
>> 
>> #define      NET_ASSERT_UNLOCKED()                                           
>> \

Reply via email to