Hi,

Divert packet has a strange design, as it calls the protocol layer
directly from pf.  As pf runs in parallel, divert_packet() has an
XXXSMP comment and takes the kernel lock.  This gives an opportunity
to experiment.

I added a mutex to the inet PCB layer.  It can be taken directly in
the protocol input functions or from the socket layer via PRU_LOCK.
This makes it possible to run soreceive() in parallel.

I see a speed increase and it runs stably.  This diff is not ready
for commit.  Next I want to work on UDP input, to see whether my idea
can be generalized further.  I am just showing what I have now.

bluhm

Index: kern/uipc_socket.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.280
diff -u -p -r1.280 uipc_socket.c
--- kern/uipc_socket.c  25 Jul 2022 07:28:22 -0000      1.280
+++ kern/uipc_socket.c  2 Aug 2022 13:54:03 -0000
@@ -836,10 +836,10 @@ bad:
        if (mp)
                *mp = NULL;
 
-       solock(so);
+       solock_shared(so);
 restart:
        if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
-               sounlock(so);
+               sounlock_shared(so);
                return (error);
        }
 
@@ -907,7 +907,7 @@ restart:
                sbunlock(so, &so->so_rcv);
                error = sbwait(so, &so->so_rcv);
                if (error) {
-                       sounlock(so);
+                       sounlock_shared(so);
                        return (error);
                }
                goto restart;
@@ -976,11 +976,11 @@ dontblock:
                        sbsync(&so->so_rcv, nextrecord);
                        if (controlp) {
                                if (pr->pr_domain->dom_externalize) {
-                                       sounlock(so);
+                                       sounlock_shared(so);
                                        error =
                                            (*pr->pr_domain->dom_externalize)
                                            (cm, controllen, flags);
-                                       solock(so);
+                                       solock_shared(so);
                                }
                                *controlp = cm;
                        } else {
@@ -1054,9 +1054,9 @@ dontblock:
                        SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
                        SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
                        resid = uio->uio_resid;
-                       sounlock(so);
+                       sounlock_shared(so);
                        uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
-                       solock(so);
+                       solock_shared(so);
                        if (uio_error)
                                uio->uio_resid = resid - len;
                } else
@@ -1140,7 +1140,7 @@ dontblock:
                        error = sbwait(so, &so->so_rcv);
                        if (error) {
                                sbunlock(so, &so->so_rcv);
-                               sounlock(so);
+                               sounlock_shared(so);
                                return (0);
                        }
                        if ((m = so->so_rcv.sb_mb) != NULL)
@@ -1186,7 +1186,7 @@ dontblock:
                *flagsp |= flags;
 release:
        sbunlock(so, &so->so_rcv);
-       sounlock(so);
+       sounlock_shared(so);
        return (error);
 }
 
Index: kern/uipc_socket2.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.126
diff -u -p -r1.126 uipc_socket2.c
--- kern/uipc_socket2.c 25 Jul 2022 07:28:22 -0000      1.126
+++ kern/uipc_socket2.c 2 Aug 2022 19:07:22 -0000
@@ -360,6 +360,25 @@ solock(struct socket *so)
        }
 }
 
+void
+solock_shared(struct socket *so)
+{
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) {
+                       NET_RLOCK_IN_SYSCALL();
+                       (*so->so_proto->pr_usrreq)(so, PRU_LOCK,
+                           NULL, NULL, NULL, NULL);
+               } else
+                       NET_LOCK();
+               break;
+       default:
+               rw_enter_write(&so->so_lock);
+               break;
+       }
+}
+
 int
 solock_persocket(struct socket *so)
 {
@@ -403,6 +422,25 @@ sounlock(struct socket *so)
 }
 
 void
+sounlock_shared(struct socket *so)
+{
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               if (ISSET(so->so_proto->pr_flags, PR_MPSAFE)) {
+                       (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK,
+                           NULL, NULL, NULL, NULL);
+                       NET_RUNLOCK_IN_SYSCALL();
+               } else
+                       NET_UNLOCK();
+               break;
+       default:
+               rw_exit_write(&so->so_lock);
+               break;
+       }
+}
+
+void
 soassertlocked(struct socket *so)
 {
        switch (so->so_proto->pr_domain->dom_family) {
@@ -425,7 +463,17 @@ sosleep_nsec(struct socket *so, void *id
        switch (so->so_proto->pr_domain->dom_family) {
        case PF_INET:
        case PF_INET6:
+               if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) &&
+                   rw_status(&netlock) == RW_READ) {
+                       (*so->so_proto->pr_usrreq)(so, PRU_UNLOCK,
+                           NULL, NULL, NULL, NULL);
+               }
                ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
+               if (ISSET(so->so_proto->pr_flags, PR_MPSAFE) &&
+                   rw_status(&netlock) == RW_READ) {
+                       (*so->so_proto->pr_usrreq)(so, PRU_LOCK,
+                           NULL, NULL, NULL, NULL);
+               }
                break;
        default:
                ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
retrieving revision 1.268
diff -u -p -r1.268 in_pcb.c
--- netinet/in_pcb.c    28 Jun 2022 09:32:27 -0000      1.268
+++ netinet/in_pcb.c    2 Aug 2022 13:32:15 -0000
@@ -236,6 +236,7 @@ in_pcballoc(struct socket *so, struct in
        inp->inp_table = table;
        inp->inp_socket = so;
        refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
+       mtx_init(&inp->inp_mtx, IPL_SOFTNET);
        inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT;
        inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
        inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.129
diff -u -p -r1.129 in_pcb.h
--- netinet/in_pcb.h    15 May 2022 09:12:20 -0000      1.129
+++ netinet/in_pcb.h    2 Aug 2022 13:32:15 -0000
@@ -79,6 +79,7 @@
  *     I       immutable after creation
  *     N       net lock
  *     t       inpt_mtx                pcb table mutex
+ *     p       inp_mtx                 pcb mutex
  */
 
 struct pf_state_key;
@@ -121,6 +122,7 @@ struct inpcb {
 #define        inp_route       inp_ru.ru_route
 #define        inp_route6      inp_ru.ru_route6
        struct    refcnt inp_refcnt;    /* refcount PCB, delay memory free */
+       struct    mutex inp_mtx;        /* protect PCB and socket members */
        int       inp_flags;            /* generic IP/datagram flags */
        union {                         /* Header prototype. */
                struct ip hu_ip;
Index: netinet/in_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
retrieving revision 1.98
diff -u -p -r1.98 in_proto.c
--- netinet/in_proto.c  25 Feb 2022 23:51:03 -0000      1.98
+++ netinet/in_proto.c  2 Aug 2022 13:54:03 -0000
@@ -382,7 +382,7 @@ const struct protosw inetsw[] = {
   .pr_type     = SOCK_RAW,
   .pr_domain   = &inetdomain,
   .pr_protocol = IPPROTO_DIVERT,
-  .pr_flags    = PR_ATOMIC|PR_ADDR,
+  .pr_flags    = PR_ATOMIC|PR_ADDR|PR_MPSAFE,
   .pr_ctloutput        = rip_ctloutput,
   .pr_usrreq   = divert_usrreq,
   .pr_attach   = divert_attach,
Index: netinet/ip_divert.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
retrieving revision 1.68
diff -u -p -r1.68 ip_divert.c
--- netinet/ip_divert.c 9 May 2022 19:33:46 -0000       1.68
+++ netinet/ip_divert.c 2 Aug 2022 19:07:22 -0000
@@ -221,22 +221,15 @@ divert_packet(struct mbuf *m, int dir, u
                if_put(ifp);
        }
 
+       mtx_enter(&inp->inp_mtx);
        so = inp->inp_socket;
-       /*
-        * XXXSMP sbappendaddr() is not MP safe and this function is called
-        * from pf with shared netlock.  To call only one sbappendaddr() from
-        * divert_packet(), protect it with kernel lock.  All other places
-        * call sbappendaddr() with exclusive net lock.  This blocks
-        * divert_packet() as we have the shared lock.
-        */
-       KERNEL_LOCK();
        if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
-               KERNEL_UNLOCK();
+               mtx_leave(&inp->inp_mtx);
                divstat_inc(divs_fullsock);
                goto bad;
        }
+       mtx_leave(&inp->inp_mtx);
        sorwakeup(inp->inp_socket);
-       KERNEL_UNLOCK();
 
        in_pcbunref(inp);
        return;
@@ -293,6 +286,14 @@ divert_usrreq(struct socket *so, int req
                break;
 
        case PRU_SENSE:
+               break;
+
+       case PRU_LOCK:
+               mtx_enter(&inp->inp_mtx);
+               break;
+
+       case PRU_UNLOCK:
+               mtx_leave(&inp->inp_mtx);
                break;
 
        case PRU_LISTEN:
Index: sys/protosw.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
retrieving revision 1.35
diff -u -p -r1.35 protosw.h
--- sys/protosw.h       25 Feb 2022 23:51:04 -0000      1.35
+++ sys/protosw.h       2 Aug 2022 13:54:03 -0000
@@ -108,6 +108,7 @@ struct protosw {
 #define        PR_ABRTACPTDIS  0x20            /* abort on accept(2) to disconnected
                                           socket */
 #define        PR_SPLICE       0x40            /* socket splicing is possible */
+#define        PR_MPSAFE       0x80            /* shared net lock for send/recv */
 
 /*
  * The arguments to usrreq are:
@@ -144,8 +145,10 @@ struct protosw {
 #define        PRU_SLOWTIMO            19      /* 500ms timeout */
 #define        PRU_PROTORCV            20      /* receive from below */
 #define        PRU_PROTOSEND           21      /* send to below */
+#define        PRU_LOCK                22      /* lock protocol layer */
+#define        PRU_UNLOCK              23      /* unlock protocol layer */
 
-#define        PRU_NREQ                22
+#define        PRU_NREQ                24
 
 #ifdef PRUREQUESTS
 const char *prurequests[] = {
@@ -154,7 +157,7 @@ const char *prurequests[] = {
        "RCVD",         "SEND",         "ABORT",        "CONTROL",
        "SENSE",        "RCVOOB",       "SENDOOB",      "SOCKADDR",
        "PEERADDR",     "CONNECT2",     "FASTTIMO",     "SLOWTIMO",
-       "PROTORCV",     "PROTOSEND",
+       "PROTORCV",     "PROTOSEND",    "LOCK",         "UNLOCK"
 };
 #endif
 
Index: sys/socketvar.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.106
diff -u -p -r1.106 socketvar.h
--- sys/socketvar.h     15 Jul 2022 17:20:24 -0000      1.106
+++ sys/socketvar.h     2 Aug 2022 13:54:03 -0000
@@ -346,9 +346,11 @@ int        sockargs(struct mbuf **, const void 
 
 int    sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
 void   solock(struct socket *);
+void   solock_shared(struct socket *);
 int    solock_persocket(struct socket *);
 void   solock_pair(struct socket *, struct socket *);
 void   sounlock(struct socket *);
+void   sounlock_shared(struct socket *);
 
 int    sendit(struct proc *, int, struct msghdr *, int, register_t *);
 int    recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);
Index: sys/systm.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/systm.h,v
retrieving revision 1.157
diff -u -p -r1.157 systm.h
--- sys/systm.h 12 Jul 2022 17:12:31 -0000      1.157
+++ sys/systm.h 2 Aug 2022 19:07:22 -0000
@@ -341,6 +341,15 @@ extern struct rwlock netlock;
 #define        NET_RLOCK_IN_IOCTL()    do { rw_enter_read(&netlock); } while (0)
 #define        NET_RUNLOCK_IN_IOCTL()  do { rw_exit_read(&netlock); } while (0)
 
+/*
+ * Reader version of NET_LOCK() to be used in send and receive syscalls.
+ *
+ * Can be grabbed instead of the exclusive version when no field
+ * protected by the NET_LOCK() is modified by the syscall.
+ */
+#define        NET_RLOCK_IN_SYSCALL()  do { rw_enter_read(&netlock); } while (0)
+#define        NET_RUNLOCK_IN_SYSCALL() do { rw_exit_read(&netlock); } while (0)
+
 #ifdef DIAGNOSTIC
 
 #define        NET_ASSERT_UNLOCKED()                                           \

Reply via email to