Author: jhb
Date: Wed Jul 27 18:29:35 2016
New Revision: 303405
URL: https://svnweb.freebsd.org/changeset/base/303405

Log:
  Add support for zero-copy aio_write() on TOE sockets.
  
  AIO write requests for a TOE socket on a Chelsio T4+ adapter can now
  DMA directly from the user-supplied buffer.  This is implemented by
  wiring the pages backing the user-supplied buffer and queueing special
  mbufs backed by raw VM pages to the socket buffer.  The TOE code
  recognizes these special mbufs and builds a sglist from the VM page
  array associated with the mbuf when queueing a work request to the TOE.
  
  Because these mbufs do not have an associated virtual address, m_data
  is not valid.  Thus, the AIO handler does not invoke sosend() directly
  for these mbufs but instead inlines portions of sosend_generic() and
  tcp_usr_send().
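
  A minimal sketch of how the inlined send path builds one of these
  mbufs (this mirrors t4_aiotx_process_job() in the diff below; 'ab' is
  the aiotx_buffer described in the next paragraph, and 'job' and 'len'
  are assumed from the surrounding code):

	struct mbuf *m;

	m = m_get(M_WAITOK, MT_DATA);
	m->m_len = len;				/* bytes this mbuf covers */
	refcount_acquire(&ab->refcount);
	m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
	    (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
	m->m_ext.ext_flags |= EXT_FLAG_AIOTX;	/* no valid m_data/KVA */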
  
  An aiotx_buffer structure is used to describe the user buffer (e.g.
  it holds the array of VM pages and a reference to the AIO job).  The
  special mbufs reference this structure via m_ext.  Note that a single
  job might be split across multiple mbufs (e.g. if it is larger than
  the socket buffer size).  The 'ext_arg2' member of each mbuf gives an
  offset relative to the backing aiotx_buffer.  The AIO job associated
  with an aiotx_buffer structure is completed when the last reference to
  the structure is released.
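
  For example (sizes hypothetical): a 100 KB job written through a
  64 KB socket buffer yields a second mbuf with ext_arg2 = 65536, so
  its first page is ps.pages[(ps.offset + 65536) / PAGE_SIZE] and its
  starting offset within that page is (ps.offset + 65536) % PAGE_SIZE,
  matching the aiotx_mbuf_pages() and aiotx_mbuf_pgoff() helpers below.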
  
  Zero-copy aio_write() requests for connections associated with a
  given adapter can be enabled/disabled at runtime via the
  'dev.t[45]nex.N.toe.tx_zcopy' sysctl.
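
  For example, 'sysctl dev.t5nex.0.toe.tx_zcopy=1' would enable it for
  connections on the first T5 adapter (instance number hypothetical).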
  
  MFC after:    1 month
  Relnotes:     yes
  Sponsored by: Chelsio Communications

Modified:
  head/sys/dev/cxgbe/offload.h
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/tom/t4_cpl_io.c
  head/sys/dev/cxgbe/tom/t4_tom.c
  head/sys/dev/cxgbe/tom/t4_tom.h

Modified: head/sys/dev/cxgbe/offload.h
==============================================================================
--- head/sys/dev/cxgbe/offload.h	Wed Jul 27 18:12:36 2016	(r303404)
+++ head/sys/dev/cxgbe/offload.h	Wed Jul 27 18:29:35 2016	(r303405)
@@ -147,6 +147,7 @@ struct tom_tunables {
        int ddp;
        int rx_coalesce;
        int tx_align;
+       int tx_zcopy;
 };
 
 #ifdef TCP_OFFLOAD

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c	Wed Jul 27 18:12:36 2016	(r303404)
+++ head/sys/dev/cxgbe/t4_main.c	Wed Jul 27 18:29:35 2016	(r303405)
@@ -4864,6 +4864,11 @@ t4_sysctls(struct adapter *sc)
                SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align",
                    CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload");
 
+               sc->tt.tx_zcopy = 0;
+               SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy",
+                   CTLFLAG_RW, &sc->tt.tx_zcopy, 0,
+                   "Enable zero-copy aio_write(2)");
+
                SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick",
                    CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A",
                    "TP timer tick (us)");

Modified: head/sys/dev/cxgbe/tom/t4_cpl_io.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_cpl_io.c	Wed Jul 27 18:12:36 2016	(r303404)
+++ head/sys/dev/cxgbe/tom/t4_cpl_io.c	Wed Jul 27 18:29:35 2016	(r303405)
@@ -32,15 +32,18 @@ __FBSDID("$FreeBSD$");
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
-#include <sys/types.h>
+#include <sys/aio.h>
+#include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
+#include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sglist.h>
+#include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
@@ -51,6 +54,14 @@ __FBSDID("$FreeBSD$");
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
@@ -71,6 +82,34 @@ VNET_DECLARE(int, tcp_autorcvbuf_inc);
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
 
+#define	IS_AIOTX_MBUF(m)						\
+       ((m)->m_flags & M_EXT && (m)->m_ext.ext_flags & EXT_FLAG_AIOTX)
+
+static void    t4_aiotx_cancel(struct kaiocb *job);
+static void    t4_aiotx_queue_toep(struct toepcb *toep);
+
+static size_t
+aiotx_mbuf_pgoff(struct mbuf *m)
+{
+       struct aiotx_buffer *ab;
+
+       MPASS(IS_AIOTX_MBUF(m));
+       ab = m->m_ext.ext_arg1;
+       return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
+}
+
+static vm_page_t *
+aiotx_mbuf_pages(struct mbuf *m)
+{
+       struct aiotx_buffer *ab;
+       int npages;
+
+       MPASS(IS_AIOTX_MBUF(m));
+       ab = m->m_ext.ext_arg1;
+       npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
+       return (ab->ps.pages + npages);
+}
+
 void
 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
 {
@@ -519,7 +558,11 @@ write_tx_sgl(void *dst, struct mbuf *sta
 
        i = -1;
        for (m = start; m != stop; m = m->m_next) {
-               rc = sglist_append(&sg, mtod(m, void *), m->m_len);
+               if (IS_AIOTX_MBUF(m))
+                       rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
+                           aiotx_mbuf_pgoff(m), m->m_len);
+               else
+                       rc = sglist_append(&sg, mtod(m, void *), m->m_len);
                if (__predict_false(rc != 0))
                        panic("%s: sglist_append %d", __func__, rc);
 
@@ -579,6 +622,7 @@ t4_push_frames(struct adapter *sc, struc
        struct sockbuf *sb = &so->so_snd;
        int tx_credits, shove, compl, sowwakeup;
        struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
+       bool aiotx_mbuf_seen;
 
        INP_WLOCK_ASSERT(inp);
        KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
@@ -589,6 +633,10 @@ t4_push_frames(struct adapter *sc, struc
            toep->ulp_mode == ULP_MODE_RDMA,
            ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
 
+#ifdef VERBOSE_TRACES
+       CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
+           __func__, toep->tid, toep->flags, tp->t_flags, drop);
+#endif
        if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
                return;
 
@@ -618,8 +666,15 @@ t4_push_frames(struct adapter *sc, struc
                plen = 0;
                nsegs = 0;
                max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
+               aiotx_mbuf_seen = false;
                for (m = sndptr; m != NULL; m = m->m_next) {
-                       int n = sglist_count(mtod(m, void *), m->m_len);
+                       int n;
+
+                       if (IS_AIOTX_MBUF(m))
+                               n = sglist_count_vmpages(aiotx_mbuf_pages(m),
+                                   aiotx_mbuf_pgoff(m), m->m_len);
+                       else
+                               n = sglist_count(mtod(m, void *), m->m_len);
 
                        nsegs += n;
                        plen += m->m_len;
@@ -631,9 +686,13 @@ t4_push_frames(struct adapter *sc, struc
                                if (plen == 0) {
                                        /* Too few credits */
                                        toep->flags |= TPF_TX_SUSPENDED;
-                                       if (sowwakeup)
+                                       if (sowwakeup) {
+                                               if (!TAILQ_EMPTY(
+                                                   &toep->aiotx_jobq))
+                                                       t4_aiotx_queue_toep(
+                                                           toep);
                                                sowwakeup_locked(so);
-                                       else
+                                       } else
                                                SOCKBUF_UNLOCK(sb);
                                        SOCKBUF_UNLOCK_ASSERT(sb);
                                        return;
@@ -641,6 +700,8 @@ t4_push_frames(struct adapter *sc, struc
                                break;
                        }
 
+                       if (IS_AIOTX_MBUF(m))
+                               aiotx_mbuf_seen = true;
                        if (max_nsegs_1mbuf < n)
                                max_nsegs_1mbuf = n;
                        sb_sndptr = m;  /* new sb->sb_sndptr if all goes well */
@@ -670,9 +731,11 @@ t4_push_frames(struct adapter *sc, struc
                        else
                                sowwakeup = 1;  /* room available */
                }
-               if (sowwakeup)
+               if (sowwakeup) {
+                       if (!TAILQ_EMPTY(&toep->aiotx_jobq))
+                               t4_aiotx_queue_toep(toep);
                        sowwakeup_locked(so);
-               else
+               } else
                        SOCKBUF_UNLOCK(sb);
                SOCKBUF_UNLOCK_ASSERT(sb);
 
@@ -687,7 +750,7 @@ t4_push_frames(struct adapter *sc, struc
                        panic("%s: excess tx.", __func__);
 
                shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
-               if (plen <= max_imm) {
+               if (plen <= max_imm && !aiotx_mbuf_seen) {
 
                        /* Immediate data tx */
 
@@ -1616,6 +1679,9 @@ do_fw4_ack(struct sge_iq *iq, const stru
                }
        }
 
+#ifdef VERBOSE_TRACES
+       CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
+#endif
        so = inp->inp_socket;
        txsd = &toep->txsd[toep->txsd_cidx];
        plen = 0;
@@ -1642,6 +1708,10 @@ do_fw4_ack(struct sge_iq *iq, const stru
 
        if (toep->flags & TPF_TX_SUSPENDED &&
            toep->tx_credits >= toep->tx_total / 4) {
+#ifdef VERBOSE_TRACES
+               CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
+                   tid);
+#endif
                toep->flags &= ~TPF_TX_SUSPENDED;
                if (toep->ulp_mode == ULP_MODE_ISCSI)
                        t4_push_pdus(sc, toep, plen);
@@ -1668,7 +1738,13 @@ do_fw4_ack(struct sge_iq *iq, const stru
                        sowwakeup_locked(so);   /* unlocks so_snd */
                        rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
                } else {
+#ifdef VERBOSE_TRACES
+                       CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
+                           tid, plen);
+#endif
                        sbdrop_locked(sb, plen);
+                       if (!TAILQ_EMPTY(&toep->aiotx_jobq))
+                               t4_aiotx_queue_toep(toep);
                        sowwakeup_locked(so);   /* unlocks so_snd */
                }
                SOCKBUF_UNLOCK_ASSERT(sb);
@@ -1768,4 +1844,397 @@ t4_uninit_cpl_io_handlers(void)
        t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
        t4_register_cpl_handler(CPL_FW4_ACK, do_fw4_ack);
 }
+
+/*
+ * Use the 'backend3' field in AIO jobs to store the amount of data
+ * sent by the AIO job so far and the 'backend4' field to hold an
+ * error that should be reported when the job is completed.
+ */
+#define        aio_sent        backend3
+#define        aio_error       backend4
+
+#define	jobtotid(job)							\
+       (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
+       
+static void
+free_aiotx_buffer(struct aiotx_buffer *ab)
+{
+       struct kaiocb *job;
+       long status;
+       int error;
+
+       if (refcount_release(&ab->refcount) == 0)
+               return;
+
+       job = ab->job;
+       error = job->aio_error;
+       status = job->aio_sent;
+       vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
+       free(ab, M_CXGBE);
+#ifdef VERBOSE_TRACES
+       CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
+           jobtotid(job), job, status, error);
+#endif
+       if (error == ECANCELED && status != 0)
+               error = 0;
+       if (error == ECANCELED)
+               aio_cancel(job);
+       else if (error)
+               aio_complete(job, -1, error);
+       else
+               aio_complete(job, status, 0);
+}
+
+static void
+t4_aiotx_mbuf_free(struct mbuf *m, void *buffer, void *arg)
+{
+       struct aiotx_buffer *ab = buffer;
+
+#ifdef VERBOSE_TRACES
+       CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
+           m->m_len, jobtotid(ab->job));
+#endif
+       free_aiotx_buffer(ab);
+}
+
+/*
+ * Hold the buffer backing an AIO request and return an AIO transmit
+ * buffer.
+ */
+static int
+hold_aio(struct kaiocb *job)
+{
+       struct aiotx_buffer *ab;
+       struct vmspace *vm;
+       vm_map_t map;
+       vm_offset_t start, end, pgoff;
+       int n;
+
+       MPASS(job->backend1 == NULL);
+
+       /*
+        * The AIO subsystem will cancel and drain all requests before
+        * permitting a process to exit or exec, so p_vmspace should
+        * be stable here.
+        */
+       vm = job->userproc->p_vmspace;
+       map = &vm->vm_map;
+       start = (uintptr_t)job->uaiocb.aio_buf;
+       pgoff = start & PAGE_MASK;
+       end = round_page(start + job->uaiocb.aio_nbytes);
+       start = trunc_page(start);
+       n = atop(end - start);
+
+       ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
+           M_ZERO);
+       refcount_init(&ab->refcount, 1);
+       ab->ps.pages = (vm_page_t *)(ab + 1);
+       ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
+           VM_PROT_WRITE, ab->ps.pages, n);
+       if (ab->ps.npages < 0) {
+               free(ab, M_CXGBE);
+               return (EFAULT);
+       }
+
+       KASSERT(ab->ps.npages == n,
+           ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));
+
+       ab->ps.offset = pgoff;
+       ab->ps.len = job->uaiocb.aio_nbytes;
+       ab->job = job;
+       job->backend1 = ab;
+#ifdef VERBOSE_TRACES
+       CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
+           __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
+#endif
+       return (0);
+}
+
+static void
+t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
+{
+       struct adapter *sc;
+       struct sockbuf *sb;
+       struct file *fp;
+       struct aiotx_buffer *ab;
+       struct inpcb *inp;
+       struct tcpcb *tp;
+       struct mbuf *m;
+       int error;
+       bool moretocome, sendmore;
+
+       sc = td_adapter(toep->td);
+       sb = &so->so_snd;
+       SOCKBUF_UNLOCK(sb);
+       fp = job->fd_file;
+       ab = job->backend1;
+       m = NULL;
+
+#ifdef MAC
+       error = mac_socket_check_send(fp->f_cred, so);
+       if (error != 0)
+               goto out;
+#endif
+
+       if (ab == NULL) {
+               error = hold_aio(job);
+               if (error != 0)
+                       goto out;
+               ab = job->backend1;
+       }
+
+       /* Inline sosend_generic(). */
+
+       job->msgsnd = 1;
+
+       error = sblock(sb, SBL_WAIT);
+       MPASS(error == 0);
+
+sendanother:
+       m = m_get(M_WAITOK, MT_DATA);
+
+       SOCKBUF_LOCK(sb);
+       if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+               SOCKBUF_UNLOCK(sb);
+               sbunlock(sb);
+               if ((so->so_options & SO_NOSIGPIPE) == 0) {
+                       PROC_LOCK(job->userproc);
+                       kern_psignal(job->userproc, SIGPIPE);
+                       PROC_UNLOCK(job->userproc);
+               }
+               error = EPIPE;
+               goto out;
+       }
+       if (so->so_error) {
+               error = so->so_error;
+               so->so_error = 0;
+               SOCKBUF_UNLOCK(sb);
+               sbunlock(sb);
+               goto out;
+       }
+       if ((so->so_state & SS_ISCONNECTED) == 0) {
+               SOCKBUF_UNLOCK(sb);
+               sbunlock(sb);
+               error = ENOTCONN;
+               goto out;
+       }
+       if (sbspace(sb) < sb->sb_lowat) {
+               MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
+
+               /*
+                * Don't block if there is too little room in the socket
+                * buffer.  Instead, requeue the request.
+                */
+               if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
+                       SOCKBUF_UNLOCK(sb);
+                       sbunlock(sb);
+                       error = ECANCELED;
+                       goto out;
+               }
+               TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
+               SOCKBUF_UNLOCK(sb);
+               sbunlock(sb);
+               goto out;
+       }
+
+       /*
+        * Write as much data as the socket permits, but no more than
+        * a single sndbuf at a time.
+        */
+       m->m_len = sbspace(sb);
+       if (m->m_len > ab->ps.len - job->aio_sent) {
+               m->m_len = ab->ps.len - job->aio_sent;
+               moretocome = false;
+       } else
+               moretocome = true;
+       if (m->m_len > sc->tt.sndbuf) {
+               m->m_len = sc->tt.sndbuf;
+               sendmore = true;
+       } else
+               sendmore = false;
+
+       if (!TAILQ_EMPTY(&toep->aiotx_jobq))
+               moretocome = true;
+       SOCKBUF_UNLOCK(sb);
+       MPASS(m->m_len != 0);
+
+       /* Inlined tcp_usr_send(). */
+
+       inp = toep->inp;
+       INP_WLOCK(inp);
+       if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+               INP_WUNLOCK(inp);
+               sbunlock(sb);
+               error = ECONNRESET;
+               goto out;
+       }
+
+       refcount_acquire(&ab->refcount);
+       m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
+           (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
+       m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
+       job->aio_sent += m->m_len;
+       
+       sbappendstream(sb, m, 0);
+       m = NULL;
+
+       if (!(inp->inp_flags & INP_DROPPED)) {
+               tp = intotcpcb(inp);
+               if (moretocome)
+                       tp->t_flags |= TF_MORETOCOME;
+               error = tp->t_fb->tfb_tcp_output(tp);
+               if (moretocome)
+                       tp->t_flags &= ~TF_MORETOCOME;
+       }
+
+       INP_WUNLOCK(inp);
+       if (sendmore)
+               goto sendanother;
+       sbunlock(sb);
+
+       if (error)
+               goto out;
+
+       /*
+        * If this is a non-blocking socket and the request has not
+        * been fully completed, requeue it until the socket is ready
+        * again.
+        */
+       if (job->aio_sent < job->uaiocb.aio_nbytes &&
+           !(so->so_state & SS_NBIO)) {
+               SOCKBUF_LOCK(sb);
+               if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
+                       SOCKBUF_UNLOCK(sb);
+                       error = ECANCELED;
+                       goto out;
+               }
+               TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
+               return;
+       }
+
+       /*
+        * If the request will not be requeued, drop a reference on
+        * the aiotx buffer.  Any mbufs in flight should still
+        * contain a reference, but this drops the reference that the
+        * job owns while it is waiting to queue mbufs to the socket.
+        */
+       free_aiotx_buffer(ab);
+
+out:
+       if (error) {
+               if (ab != NULL) {
+                       job->aio_error = error;
+                       free_aiotx_buffer(ab);
+               } else {
+                       MPASS(job->aio_sent == 0);
+                       aio_complete(job, -1, error);
+               }
+       }
+       if (m != NULL)
+               m_free(m);
+       SOCKBUF_LOCK(sb);
+}
+
+static void
+t4_aiotx_task(void *context, int pending)
+{
+       struct toepcb *toep = context;
+       struct inpcb *inp = toep->inp;
+       struct socket *so = inp->inp_socket;
+       struct kaiocb *job;
+
+       CURVNET_SET(so->so_vnet);
+       SOCKBUF_LOCK(&so->so_snd);
+       while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
+               job = TAILQ_FIRST(&toep->aiotx_jobq);
+               TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
+               if (!aio_clear_cancel_function(job))
+                       continue;
+
+               t4_aiotx_process_job(toep, so, job);
+       }
+       toep->aiotx_task_active = false;
+       SOCKBUF_UNLOCK(&so->so_snd);
+       CURVNET_RESTORE();
+
+       free_toepcb(toep);
+}
+
+static void
+t4_aiotx_queue_toep(struct toepcb *toep)
+{
+
+       SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
+#ifdef VERBOSE_TRACES
+       CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
+           __func__, toep->tid, toep->aiotx_task_active ? "true" : "false");
+#endif
+       if (toep->aiotx_task_active)
+               return;
+       toep->aiotx_task_active = true;
+       hold_toepcb(toep);
+       soaio_enqueue(&toep->aiotx_task);
+}
+
+static void
+t4_aiotx_cancel(struct kaiocb *job)
+{
+       struct aiotx_buffer *ab;
+       struct socket *so;
+       struct sockbuf *sb;
+       struct tcpcb *tp;
+       struct toepcb *toep;
+
+       so = job->fd_file->f_data;
+       tp = so_sototcpcb(so);
+       toep = tp->t_toe;
+       MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
+       sb = &so->so_snd;
+
+       SOCKBUF_LOCK(sb);
+       if (!aio_cancel_cleared(job))
+               TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
+       SOCKBUF_UNLOCK(sb);
+
+       ab = job->backend1;
+       if (ab != NULL)
+               free_aiotx_buffer(ab);
+       else
+               aio_cancel(job);
+}
+
+int
+t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
+{
+       struct tcpcb *tp = so_sototcpcb(so);
+       struct toepcb *toep = tp->t_toe;
+       struct adapter *sc = td_adapter(toep->td);
+
+       /* This only handles writes. */
+       if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
+               return (EOPNOTSUPP);
+
+       if (!sc->tt.tx_zcopy)
+               return (EOPNOTSUPP);
+
+       SOCKBUF_LOCK(&so->so_snd);
+#ifdef VERBOSE_TRACES
+       CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job);
+#endif
+       if (!aio_set_cancel_function(job, t4_aiotx_cancel))
+               panic("new job was cancelled");
+       TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
+       if (sowriteable(so))
+               t4_aiotx_queue_toep(toep);
+       SOCKBUF_UNLOCK(&so->so_snd);
+       return (0);
+}
+
+void
+aiotx_init_toep(struct toepcb *toep)
+{
+
+       TAILQ_INIT(&toep->aiotx_jobq);
+       TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
+}
 #endif

Modified: head/sys/dev/cxgbe/tom/t4_tom.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.c	Wed Jul 27 18:12:36 2016	(r303404)
+++ head/sys/dev/cxgbe/tom/t4_tom.c	Wed Jul 27 18:29:35 2016	(r303405)
@@ -68,11 +68,11 @@ __FBSDID("$FreeBSD$");
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
-static struct protosw ddp_protosw;
-static struct pr_usrreqs ddp_usrreqs;
+static struct protosw toe_protosw;
+static struct pr_usrreqs toe_usrreqs;
 
-static struct protosw ddp6_protosw;
-static struct pr_usrreqs ddp6_usrreqs;
+static struct protosw toe6_protosw;
+static struct pr_usrreqs toe6_usrreqs;
 
 /* Module ops */
 static int t4_tom_mod_load(void);
@@ -167,6 +167,7 @@ alloc_toepcb(struct vi_info *vi, int txq
        toep->txsd_avail = txsd_total;
        toep->txsd_pidx = 0;
        toep->txsd_cidx = 0;
+       aiotx_init_toep(toep);
        ddp_init_toep(toep);
 
        return (toep);
@@ -217,12 +218,10 @@ offload_socket(struct socket *so, struct
        sb = &so->so_rcv;
        SOCKBUF_LOCK(sb);
        sb->sb_flags |= SB_NOCOALESCE;
-       if (toep->ulp_mode == ULP_MODE_TCPDDP) {
-               if (inp->inp_vflag & INP_IPV6)
-                       so->so_proto = &ddp6_protosw;
-               else
-                       so->so_proto = &ddp_protosw;
-       }
+       if (inp->inp_vflag & INP_IPV6)
+               so->so_proto = &toe6_protosw;
+       else
+               so->so_proto = &toe_protosw;
        SOCKBUF_UNLOCK(sb);
 
        /* Update TCP PCB */
@@ -1120,6 +1119,22 @@ t4_tom_ifaddr_event(void *arg __unused, 
 }
 
 static int
+t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
+{
+       struct tcpcb *tp = so_sototcpcb(so);
+       struct toepcb *toep = tp->t_toe;
+       int error;
+
+       if (toep->ulp_mode == ULP_MODE_TCPDDP) {
+               error = t4_aio_queue_ddp(so, job);
+               if (error != EOPNOTSUPP)
+                       return (error);
+       }
+
+       return (t4_aio_queue_aiotx(so, job));
+}
+
+static int
 t4_tom_mod_load(void)
 {
        int rc;
@@ -1137,18 +1152,18 @@ t4_tom_mod_load(void)
        tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
        if (tcp_protosw == NULL)
                return (ENOPROTOOPT);
-       bcopy(tcp_protosw, &ddp_protosw, sizeof(ddp_protosw));
-       bcopy(tcp_protosw->pr_usrreqs, &ddp_usrreqs, sizeof(ddp_usrreqs));
-       ddp_usrreqs.pru_aio_queue = t4_aio_queue_ddp;
-       ddp_protosw.pr_usrreqs = &ddp_usrreqs;
+       bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw));
+       bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs));
+       toe_usrreqs.pru_aio_queue = t4_aio_queue_tom;
+       toe_protosw.pr_usrreqs = &toe_usrreqs;
 
        tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM);
        if (tcp6_protosw == NULL)
                return (ENOPROTOOPT);
-       bcopy(tcp6_protosw, &ddp6_protosw, sizeof(ddp6_protosw));
-       bcopy(tcp6_protosw->pr_usrreqs, &ddp6_usrreqs, sizeof(ddp6_usrreqs));
-       ddp6_usrreqs.pru_aio_queue = t4_aio_queue_ddp;
-       ddp6_protosw.pr_usrreqs = &ddp6_usrreqs;
+       bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
+       bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs));
+       toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom;
+       toe6_protosw.pr_usrreqs = &toe6_usrreqs;
 
        TIMEOUT_TASK_INIT(taskqueue_thread, &clip_task, 0, t4_clip_task, NULL);
        ifaddr_evhandler = EVENTHANDLER_REGISTER(ifaddr_event,

Modified: head/sys/dev/cxgbe/tom/t4_tom.h
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.h	Wed Jul 27 18:12:36 2016	(r303404)
+++ head/sys/dev/cxgbe/tom/t4_tom.h	Wed Jul 27 18:29:35 2016	(r303405)
@@ -102,6 +102,8 @@ TAILQ_HEAD(pagesetq, pageset);
 #define	PS_WIRED		0x0001	/* Pages wired rather than held. */
 #define	PS_PPODS_WRITTEN	0x0002	/* Page pods written to the card. */
 
+#define        EXT_FLAG_AIOTX          EXT_FLAG_VENDOR1
+
 struct ddp_buffer {
        struct pageset *ps;
 
@@ -109,6 +111,12 @@ struct ddp_buffer {
        int cancel_pending;
 };
 
+struct aiotx_buffer {
+       struct pageset ps;
+       struct kaiocb *job;
+       int refcount;
+};
+
 struct toepcb {
        TAILQ_ENTRY(toepcb) link; /* toep_list */
        u_int flags;            /* miscellaneous flags */
@@ -151,6 +159,10 @@ struct toepcb {
        struct kaiocb *ddp_queueing;
        struct mtx ddp_lock;
 
+       TAILQ_HEAD(, kaiocb) aiotx_jobq;
+       struct task aiotx_task;
+       bool aiotx_task_active;
+
        /* Tx software descriptor */
        uint8_t txsd_total;
        uint8_t txsd_pidx;
@@ -313,6 +325,8 @@ int do_abort_rpl_synqe(struct sge_iq *, 
 void t4_offload_socket(struct toedev *, void *, struct socket *);
 
 /* t4_cpl_io.c */
+void aiotx_init_toep(struct toepcb *);
+int t4_aio_queue_aiotx(struct socket *, struct kaiocb *);
 void t4_init_cpl_io_handlers(void);
 void t4_uninit_cpl_io_handlers(void);
 void send_abort_rpl(struct adapter *, struct sge_wrq *, int , int);