Author: trasz
Date: Thu May 26 09:49:29 2016
New Revision: 300723
URL: https://svnweb.freebsd.org/changeset/base/300723

Log:
  Bring in the Mellanox implementation of iSER (iSCSI over RDMA) initiator,
  written by Sagi Grimberg <sagig at mellanox.com> and Max Gurtovoy
  <maxg at mellanox.com>.
  
  This code comes from https://github.com/sagigrimberg/iser-freebsd, branch
  iser-rebase-11-current-r291993.  It's not connected to the build just yet;
  it still needs some tweaks to adapt to my changes to iSCSI infrastructure.
  
  Big thanks to Mellanox for their support for FreeBSD!
  
  Obtained from:        Mellanox Technologies
  MFC after:    1 month
  Relnotes:     yes

Added:
  head/sys/dev/iser/
  head/sys/dev/iser/icl_iser.c   (contents, props changed)
  head/sys/dev/iser/icl_iser.h   (contents, props changed)
  head/sys/dev/iser/iser_initiator.c   (contents, props changed)
  head/sys/dev/iser/iser_memory.c   (contents, props changed)
  head/sys/dev/iser/iser_verbs.c   (contents, props changed)
  head/sys/modules/iser/
  head/sys/modules/iser/Makefile   (contents, props changed)

Added: head/sys/dev/iser/icl_iser.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/sys/dev/iser/icl_iser.c        Thu May 26 09:49:29 2016        
(r300723)
@@ -0,0 +1,582 @@
+/* $FreeBSD$ */
+/*-
+ * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "icl_iser.h"
+
+/* sysctl tree kern.iser.*; "debug" tunes logging verbosity (see ISER_* macros). */
+SYSCTL_NODE(_kern, OID_AUTO, iser, CTLFLAG_RW, 0, "iSER module");
+int iser_debug = 0;
+SYSCTL_INT(_kern_iser, OID_AUTO, debug, CTLFLAG_RWTUN,
+    &iser_debug, 0, "Enable iser debug messages");
+
+static MALLOC_DEFINE(M_ICL_ISER, "icl_iser", "iSCSI iser backend");
+/* UMA zone for struct icl_iser_pdu; created in icl_iser_load(). */
+static uma_zone_t icl_pdu_zone;
+
+/* Live connection count; module unload is refused while nonzero. */
+static volatile u_int  icl_iser_ncons;
+/* Module-global state: device list, connection list and their locks. */
+struct iser_global ig;
+
+/* Forward declarations of the icl(4) connection-interface methods. */
+static icl_conn_new_pdu_t      iser_conn_new_pdu;
+static icl_conn_pdu_free_t     iser_conn_pdu_free;
+static icl_conn_pdu_data_segment_length_t iser_conn_pdu_data_segment_length;
+static icl_conn_pdu_append_data_t      iser_conn_pdu_append_data;
+static icl_conn_pdu_queue_t    iser_conn_pdu_queue;
+static icl_conn_handoff_t      iser_conn_handoff;
+static icl_conn_free_t         iser_conn_free;
+static icl_conn_close_t                iser_conn_close;
+static icl_conn_release_t      iser_conn_release;
+static icl_conn_connect_t      iser_conn_connect;
+static icl_conn_connected_t    iser_conn_connected;
+static icl_conn_task_setup_t   iser_conn_task_setup;
+static icl_conn_task_done_t    iser_conn_task_done;
+static icl_conn_pdu_get_data_t iser_conn_pdu_get_data;
+/*
+ * kobj dispatch table binding the generic icl_conn interface to the
+ * iSER implementations in this file; consumed by DEFINE_CLASS() below.
+ */
+static kobj_method_t icl_iser_methods[] = {
+       KOBJMETHOD(icl_conn_new_pdu, iser_conn_new_pdu),
+       KOBJMETHOD(icl_conn_pdu_free, iser_conn_pdu_free),
+       KOBJMETHOD(icl_conn_pdu_data_segment_length, 
iser_conn_pdu_data_segment_length),
+       KOBJMETHOD(icl_conn_pdu_append_data, iser_conn_pdu_append_data),
+       KOBJMETHOD(icl_conn_pdu_queue, iser_conn_pdu_queue),
+       KOBJMETHOD(icl_conn_handoff, iser_conn_handoff),
+       KOBJMETHOD(icl_conn_free, iser_conn_free),
+       KOBJMETHOD(icl_conn_close, iser_conn_close),
+       KOBJMETHOD(icl_conn_release, iser_conn_release),
+       KOBJMETHOD(icl_conn_connect, iser_conn_connect),
+       KOBJMETHOD(icl_conn_connected, iser_conn_connected),
+       KOBJMETHOD(icl_conn_task_setup, iser_conn_task_setup),
+       KOBJMETHOD(icl_conn_task_done, iser_conn_task_done),
+       KOBJMETHOD(icl_conn_pdu_get_data, iser_conn_pdu_get_data),
+       { 0, 0 }
+};
+
+DEFINE_CLASS(icl_iser, icl_iser_methods, sizeof(struct iser_conn));
+
+/**
+ * iser_initialize_headers() - Initialize task headers
+ * @pdu:       iser pdu
+ * @iser_conn:    iser connection
+ *
+ * Returns 0 on success, or -ENOMEM if the header buffer cannot be
+ * DMA-mapped.  (Unlike most functions in this file, the error is
+ * reported Linux-style as a negative errno.)
+ *
+ * Notes:
+ * This routine may race with iser teardown flow for scsi
+ * error handling TMFs. So for TMF we should acquire the
+ * state mutex to avoid dereferencing the IB device which
+ * may have already been terminated (racing teardown sequence).
+ */
+int
+iser_initialize_headers(struct icl_iser_pdu *pdu, struct iser_conn *iser_conn)
+{
+       struct iser_tx_desc *tx_desc = &pdu->desc;
+       struct iser_device *device = iser_conn->ib_conn.device;
+       u64 dma_addr;
+       int ret = 0;
+
+       /* Map the iSER + iSCSI headers for transmission to the HCA. */
+       dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
+                               ISER_HEADERS_LEN, DMA_TO_DEVICE);
+       if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /* sg[0] always covers the headers; "mapped" gates the later unmap. */
+       tx_desc->mapped = true;
+       tx_desc->dma_addr = dma_addr;
+       tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
+       tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+       tx_desc->tx_sg[0].lkey   = device->mr->lkey;
+
+out:
+
+       return (ret);
+}
+
+int
+iser_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
+                         const void *addr, size_t len, int flags)
+{
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+       if (request->ip_bhs->bhs_opcode & ISCSI_BHS_OPCODE_LOGIN_REQUEST ||
+           request->ip_bhs->bhs_opcode & ISCSI_BHS_OPCODE_TEXT_REQUEST) {
+               ISER_DBG("copy to login buff");
+               memcpy(iser_conn->login_req_buf, addr, len);
+               request->ip_data_len = len;
+       }
+
+       return (0);
+}
+
+void
+iser_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
+                      size_t off, void *addr, size_t len)
+{
+       /* If we have a receive data, copy it to upper layer buffer */
+       if (ip->ip_data_mbuf)
+               memcpy(addr, ip->ip_data_mbuf + off, len);
+}
+
+/*
+ * Allocate icl_pdu with empty BHS to fill up by the caller.
+ *
+ * Returns NULL if the zone allocation fails (possible when "flags"
+ * contains M_NOWAIT); M_ZERO guarantees the embedded BHS starts cleared.
+ */
+struct icl_pdu *
+iser_new_pdu(struct icl_conn *ic, int flags)
+{
+       struct icl_iser_pdu *iser_pdu;
+       struct icl_pdu *ip;
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+       iser_pdu = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
+       if (iser_pdu == NULL) {
+               ISER_WARN("failed to allocate %zd bytes", sizeof(*iser_pdu));
+               return (NULL);
+       }
+
+       iser_pdu->iser_conn = iser_conn;
+       ip = &iser_pdu->icl_pdu;
+       ip->ip_conn = ic;
+       /* The BHS lives inside the TX descriptor so it can be DMA-mapped. */
+       ip->ip_bhs = &iser_pdu->desc.iscsi_header;
+
+       return (ip);
+}
+
+/* icl_conn_new_pdu method: thin wrapper around iser_new_pdu(). */
+struct icl_pdu *
+iser_conn_new_pdu(struct icl_conn *ic, int flags)
+{
+       return (iser_new_pdu(ic, flags));
+}
+
+/* Return a PDU (and its containing icl_iser_pdu) to the UMA zone. */
+void
+iser_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
+{
+       struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
+
+       uma_zfree(icl_pdu_zone, iser_pdu);
+}
+
+size_t
+iser_conn_pdu_data_segment_length(struct icl_conn *ic,
+                                 const struct icl_pdu *request)
+{
+       uint32_t len = 0;
+
+       len += request->ip_bhs->bhs_data_segment_len[0];
+       len <<= 8;
+       len += request->ip_bhs->bhs_data_segment_len[1];
+       len <<= 8;
+       len += request->ip_bhs->bhs_data_segment_len[2];
+
+       return (len);
+}
+
+/* icl_conn_pdu_free method: thin wrapper around iser_pdu_free(). */
+void
+iser_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
+{
+       iser_pdu_free(ic, ip);
+}
+
+/*
+ * Classify a BHS opcode: "control" PDUs (NOP-Out, login, logout, text)
+ * go through iser_send_control(), SCSI commands through
+ * iser_send_command().  Unknown opcodes are logged and treated as
+ * non-control (falls through with is_control == false).
+ */
+static bool
+is_control_opcode(uint8_t opcode)
+{
+       bool is_control = false;
+
+       switch (opcode & ISCSI_OPCODE_MASK) {
+               case ISCSI_BHS_OPCODE_NOP_OUT:
+               case ISCSI_BHS_OPCODE_LOGIN_REQUEST:
+               case ISCSI_BHS_OPCODE_LOGOUT_REQUEST:
+               case ISCSI_BHS_OPCODE_TEXT_REQUEST:
+                       is_control = true;
+                       break;
+               case ISCSI_BHS_OPCODE_SCSI_COMMAND:
+                       is_control = false;
+                       break;
+               default:
+                       ISER_ERR("unknown opcode %d", opcode);
+       }
+
+       return (is_control);
+}
+
+/*
+ * icl_conn_pdu_queue method: DMA-map the PDU headers and hand the PDU
+ * to the control or command send path.  Failures are only logged; the
+ * icl interface gives this method no way to report an error.
+ */
+void
+iser_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
+{
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+       struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
+       int ret;
+
+       ret = iser_initialize_headers(iser_pdu, iser_conn);
+       if (ret) {
+               ISER_ERR("Failed to map TX descriptor pdu %p", iser_pdu);
+               return;
+       }
+
+       if (is_control_opcode(ip->ip_bhs->bhs_opcode)) {
+               ret = iser_send_control(iser_conn, iser_pdu);
+               if (unlikely(ret))
+                       ISER_ERR("Failed to send control pdu %p", iser_pdu);
+       } else {
+               ret = iser_send_command(iser_conn, iser_pdu);
+               if (unlikely(ret))
+                       ISER_ERR("Failed to send command pdu %p", iser_pdu);
+       }
+}
+
+/*
+ * Allocate and initialize a connection object; registered with icl via
+ * icl_register() as this backend's connection constructor.
+ *
+ * "name" is stored by reference and "lock" is borrowed from the caller;
+ * both must stay valid for the lifetime of the connection.
+ */
+static struct icl_conn *
+iser_new_conn(const char *name, struct mtx *lock)
+{
+       struct iser_conn *iser_conn;
+       struct icl_conn *ic;
+
+       refcount_acquire(&icl_iser_ncons);
+
+       iser_conn = (struct iser_conn *)kobj_create(&icl_iser_class, 
M_ICL_ISER, M_WAITOK | M_ZERO);
+       /*
+        * NOTE(review): kobj_create() with M_WAITOK sleeps until memory is
+        * available and cannot return NULL, so this error path is dead code.
+        */
+       if (!iser_conn) {
+               ISER_ERR("failed to allocate iser conn");
+               refcount_release(&icl_iser_ncons);
+               return (NULL);
+       }
+
+       /* Counterparts are destroyed in iser_conn_free(). */
+       cv_init(&iser_conn->up_cv, "iser_cv");
+       sx_init(&iser_conn->state_mutex, "iser_conn_state_mutex");
+       mtx_init(&iser_conn->ib_conn.beacon.flush_lock, "flush_lock", NULL, 
MTX_DEF);
+       cv_init(&iser_conn->ib_conn.beacon.flush_cv, "flush_cv");
+       mtx_init(&iser_conn->ib_conn.lock, "lock", NULL, MTX_DEF);
+
+       ic = &iser_conn->icl_conn;
+       ic->ic_lock = lock;
+       ic->ic_name = name;
+       ic->ic_driver = strdup("iser", M_TEMP);
+       ic->ic_iser = true;
+
+       return (ic);
+}
+
+void
+iser_conn_free(struct icl_conn *ic)
+{
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+       cv_destroy(&iser_conn->ib_conn.beacon.flush_cv);
+       mtx_destroy(&iser_conn->ib_conn.beacon.flush_lock);
+       sx_destroy(&iser_conn->state_mutex);
+       cv_destroy(&iser_conn->up_cv);
+       kobj_delete((struct kobj *)iser_conn, M_ICL_ISER);
+       refcount_release(&icl_iser_ncons);
+}
+
+/*
+ * icl_conn_handoff method: finish connection setup after login.
+ * Allocates RX descriptors and pre-posts receive buffers, except for
+ * discovery sessions, which exchange only login/text PDUs and need
+ * neither.  Returns 0 on success, EINVAL if teardown already started,
+ * or the error from descriptor allocation / receive posting.
+ */
+int
+iser_conn_handoff(struct icl_conn *ic, int cmds_max)
+{
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+       int error = 0;
+
+       sx_xlock(&iser_conn->state_mutex);
+       if (iser_conn->state != ISER_CONN_UP) {
+               error = EINVAL;
+               ISER_ERR("iser_conn %p state is %d, teardown started\n",
+                        iser_conn, iser_conn->state);
+               goto out;
+       }
+
+       /*
+        * In discovery session no need to allocate rx desc and posting recv
+        * work request
+        */
+       if (ic->ic_session_type_discovery(ic))
+               goto out;
+
+       error = iser_alloc_rx_descriptors(iser_conn, cmds_max);
+       if (error)
+               goto out;
+
+       error = iser_post_recvm(iser_conn, iser_conn->min_posted_rx);
+       if (error)
+               goto post_error;
+
+       sx_xunlock(&iser_conn->state_mutex);
+       return (error);
+
+post_error:
+       iser_free_rx_descriptors(iser_conn);
+out:
+       sx_xunlock(&iser_conn->state_mutex);
+       return (error);
+
+}
+
+/**
+ * Frees all conn objects
+ *
+ * icl_conn_release method: unlink the connection from the global list,
+ * release its IB resources and destroy the RDMA CM id.  Safe to call
+ * after a failed connect; each step tolerates partially-set-up state.
+ */
+void
+iser_conn_release(struct icl_conn *ic)
+{
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+       struct ib_conn *ib_conn = &iser_conn->ib_conn;
+       struct iser_conn *curr, *tmp;
+
+       mtx_lock(&ig.connlist_mutex);
+       /*
+        * Search for iser connection in global list.
+        * It may not be there in case of failure in connection establishment
+        * stage.
+        */
+       list_for_each_entry_safe(curr, tmp, &ig.connlist, conn_list) {
+               if (iser_conn == curr) {
+                       ISER_WARN("found iser_conn %p", iser_conn);
+                       list_del(&iser_conn->conn_list);
+               }
+       }
+       mtx_unlock(&ig.connlist_mutex);
+
+       /*
+        * In case we reconnecting or removing session, we need to
+        * release IB resources (which is safe to call more than once).
+        */
+       sx_xlock(&iser_conn->state_mutex);
+       iser_free_ib_conn_res(iser_conn, true);
+       sx_xunlock(&iser_conn->state_mutex);
+
+       if (ib_conn->cma_id != NULL) {
+               rdma_destroy_id(ib_conn->cma_id);
+               ib_conn->cma_id = NULL;
+       }
+
+}
+
+/*
+ * icl_conn_close method: initiate orderly connection termination under
+ * the state mutex.
+ */
+void
+iser_conn_close(struct icl_conn *ic)
+{
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+       ISER_INFO("closing conn %p", iser_conn);
+
+       sx_xlock(&iser_conn->state_mutex);
+       /*
+        * In case iser connection is waiting on conditional variable
+        * (state PENDING) and we try to close it before connection
+        * establishment, we need to signal it to continue releasing
+        * connection properly.
+        */
+       if (!iser_conn_terminate(iser_conn) && iser_conn->state == 
ISER_CONN_PENDING)
+               cv_signal(&iser_conn->up_cv);
+       sx_xunlock(&iser_conn->state_mutex);
+
+}
+
+/*
+ * icl_conn_connect method: establish the RDMA connection.
+ *
+ * Creates the RDMA CM id, kicks off address resolution and sleeps on
+ * up_cv until the CM event handler (iser_cma_handler) advances the
+ * state machine.  Returns 0 on success or a positive errno.  The
+ * domain/socktype/protocol arguments are unused by this transport.
+ */
+int
+iser_conn_connect(struct icl_conn *ic, int domain, int socktype,
+               int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa)
+{
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+       struct ib_conn *ib_conn = &iser_conn->ib_conn;
+       int err = 0;
+
+       sx_xlock(&iser_conn->state_mutex);
+        /* the device is known only --after-- address resolution */
+       ib_conn->device = NULL;
+
+       iser_conn->state = ISER_CONN_PENDING;
+
+       ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)iser_conn,
+                       RDMA_PS_TCP, IB_QPT_RC);
+       if (IS_ERR(ib_conn->cma_id)) {
+               /* PTR_ERR yields a negative errno; flip it positive. */
+               err = -PTR_ERR(ib_conn->cma_id);
+               ISER_ERR("rdma_create_id failed: %d", err);
+               goto id_failure;
+       }
+
+       err = rdma_resolve_addr(ib_conn->cma_id, from_sa, to_sa, 1000);
+       if (err) {
+               ISER_ERR("rdma_resolve_addr failed: %d", err);
+               if (err < 0)
+                       err = -err;
+               goto addr_failure;
+       }
+
+       /* Woken by iser_cma_handler once the connection is up (or failed). */
+       ISER_DBG("before cv_wait: %p", iser_conn);
+       cv_wait(&iser_conn->up_cv, &iser_conn->state_mutex);
+       ISER_DBG("after cv_wait: %p", iser_conn);
+
+       if (iser_conn->state != ISER_CONN_UP) {
+               err = EIO;
+               goto addr_failure;
+       }
+
+       err = iser_alloc_login_buf(iser_conn);
+       if (err)
+               goto addr_failure;
+       sx_xunlock(&iser_conn->state_mutex);
+
+       mtx_lock(&ig.connlist_mutex);
+       list_add(&iser_conn->conn_list, &ig.connlist);
+       mtx_unlock(&ig.connlist_mutex);
+
+       return (0);
+
+id_failure:
+       ib_conn->cma_id = NULL;
+addr_failure:
+       sx_xunlock(&iser_conn->state_mutex);
+       return (err);
+}
+
+/**
+ * Called with session spinlock held.
+ * No need to lock state mutex on an advisory check.
+ *
+ * icl_conn_connected method: report whether the connection is usable.
+ **/
+bool
+iser_conn_connected(struct icl_conn *ic)
+{
+       struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+       return (iser_conn->state == ISER_CONN_UP);
+}
+
+/*
+ * icl_conn_task_setup method: bind the CAM CCB to the PDU and return the
+ * PDU pointer as the task's private cookie via *prvp.  task_tagp is left
+ * untouched; this backend allocates no task tags here.  Always returns 0.
+ */
+int
+iser_conn_task_setup(struct icl_conn *ic, struct ccb_scsiio *csio,
+                    uint32_t *task_tagp, void **prvp, struct icl_pdu *ip)
+{
+       struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
+
+       *prvp = ip;
+       iser_pdu->csio = csio;
+
+       return (0);
+}
+
+/*
+ * icl_conn_task_done method: release per-task resources.
+ * Un-registers and DMA-unmaps any RDMA buffers mapped for either data
+ * direction, unmaps the header descriptor if it was mapped, and frees
+ * the PDU.  "prv" is the cookie stored by iser_conn_task_setup().
+ */
+void
+iser_conn_task_done(struct icl_conn *ic, void *prv)
+{
+       struct icl_pdu *ip = prv;
+       struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
+       struct iser_device *device = iser_pdu->iser_conn->ib_conn.device;
+       struct iser_tx_desc *tx_desc = &iser_pdu->desc;
+
+       if (iser_pdu->dir[ISER_DIR_IN]) {
+               iser_unreg_rdma_mem(iser_pdu, ISER_DIR_IN);
+               iser_dma_unmap_task_data(iser_pdu,
+                                        &iser_pdu->data[ISER_DIR_IN],
+                                        DMA_FROM_DEVICE);
+       }
+
+       if (iser_pdu->dir[ISER_DIR_OUT]) {
+               iser_unreg_rdma_mem(iser_pdu, ISER_DIR_OUT);
+               iser_dma_unmap_task_data(iser_pdu,
+                                        &iser_pdu->data[ISER_DIR_OUT],
+                                        DMA_TO_DEVICE);
+       }
+
+       /* Undo the header mapping done in iser_initialize_headers(). */
+       if (likely(tx_desc->mapped)) {
+               ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+                                   ISER_HEADERS_LEN, DMA_TO_DEVICE);
+               tx_desc->mapped = false;
+       }
+
+       iser_pdu_free(ic, ip);
+}
+
+static u_int32_t
+iser_hba_misc()
+{
+       return (PIM_UNMAPPED);
+}
+
/*
 * iser_limits() - Report the maximum data segment length, in bytes,
 * supported by this backend (128 KiB).  Always succeeds.
 */
static int
iser_limits(size_t *limitp)
{
	const size_t max_data_segment_len = 128 * 1024;

	*limitp = max_data_segment_len;
	return (0);
}
+
+/*
+ * Module load: create the PDU zone, register with icl(4) and set up the
+ * global device/connection lists.  Counterpart of icl_iser_unload().
+ */
+static int
+icl_iser_load(void)
+{
+       int error;
+
+       ISER_DBG("Starting iSER datamover...");
+
+       icl_pdu_zone = uma_zcreate("icl_iser_pdu", sizeof(struct icl_iser_pdu),
+                                  NULL, NULL, NULL, NULL,
+                                  UMA_ALIGN_PTR, 0);
+       /* FIXME: Check rc */
+
+       refcount_init(&icl_iser_ncons, 0);
+
+       /*
+        * NOTE(review): KASSERT is a no-op in kernels built without
+        * INVARIANTS, so a registration failure would be silently ignored
+        * there.
+        */
+       error = icl_register("iser", 0, iser_limits, iser_new_conn, 
iser_hba_misc);
+       KASSERT(error == 0, ("failed to register iser"));
+
+       memset(&ig, 0, sizeof(struct iser_global));
+
+       /* device init is called only after the first addr resolution */
+       sx_init(&ig.device_list_mutex,  "global_device_lock");
+       INIT_LIST_HEAD(&ig.device_list);
+       mtx_init(&ig.connlist_mutex, "global_conn_lock", NULL, MTX_DEF);
+       INIT_LIST_HEAD(&ig.connlist);
+       sx_init(&ig.close_conns_mutex,  "global_close_conns_lock");
+
+       return (error);
+}
+
+/*
+ * Module unload: refuse while connections exist, then tear down the
+ * global locks, unregister from icl(4) and destroy the PDU zone.
+ *
+ * NOTE(review): the icl_iser_ncons check is only a snapshot; nothing
+ * visible here prevents a new connection between the check and the
+ * teardown below — confirm icl_unregister() ordering covers this.
+ */
+static int
+icl_iser_unload(void)
+{
+       ISER_DBG("Removing iSER datamover...");
+
+       if (icl_iser_ncons != 0)
+               return (EBUSY);
+
+       sx_destroy(&ig.close_conns_mutex);
+       mtx_destroy(&ig.connlist_mutex);
+       sx_destroy(&ig.device_list_mutex);
+
+       icl_unregister("iser");
+
+       uma_zdestroy(icl_pdu_zone);
+
+       return (0);
+}
+
+/* Module event handler: dispatch load/unload; other events are rejected. */
+static int
+icl_iser_modevent(module_t mod, int what, void *arg)
+{
+       switch (what) {
+       case MOD_LOAD:
+               return (icl_iser_load());
+       case MOD_UNLOAD:
+               return (icl_iser_unload());
+       default:
+               return (EINVAL);
+       }
+}
+
+/* Kernel module glue; depends on icl, iscsi, ibcore and linuxkpi. */
+moduledata_t icl_iser_data = {
+       .name = "icl_iser",
+       .evhand = icl_iser_modevent,
+       .priv = 0
+};
+
+DECLARE_MODULE(icl_iser, icl_iser_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+MODULE_DEPEND(icl_iser, icl, 1, 1, 1);
+MODULE_DEPEND(icl_iser, iscsi, 1, 1, 1);
+MODULE_DEPEND(icl_iser, ibcore, 1, 1, 1);
+MODULE_DEPEND(icl_iser, linuxkpi, 1, 1, 1);
+MODULE_VERSION(icl_iser, 1);
+

Added: head/sys/dev/iser/icl_iser.h
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/sys/dev/iser/icl_iser.h        Thu May 26 09:49:29 2016        
(r300723)
@@ -0,0 +1,547 @@
+/* $FreeBSD$ */
+/*-
+ * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef ICL_ISER_H
+#define ICL_ISER_H
+
+/*
+ * iSCSI Common Layer for RDMA.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/capsicum.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/uio.h>
+#include <sys/taskqueue.h>
+#include <sys/bio.h>
+#include <vm/uma.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <dev/iscsi/icl.h>
+#include <dev/iscsi/iscsi_proto.h>
+#include <icl_conn_if.h>
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+#include <rdma/rdma_cm.h>
+
+
+/*
+ * Verbosity-gated logging helpers; the level comes from the
+ * kern.iser.debug sysctl (0 = errors only, >0 warnings, >1 info,
+ * >2 debug).
+ */
+#define        ISER_DBG(X, ...)                                                
\
+       do {                                                            \
+               if (unlikely(iser_debug > 2))                           \
+                       printf("DEBUG: %s: " X "\n",                    \
+                               __func__, ## __VA_ARGS__);              \
+       } while (0)
+
+#define        ISER_INFO(X, ...)                                               
\
+       do {                                                            \
+               if (unlikely(iser_debug > 1))                           \
+                       printf("INFO: %s: " X "\n",                     \
+                               __func__, ## __VA_ARGS__);              \
+       } while (0)
+
+#define        ISER_WARN(X, ...)                                               
\
+       do {                                                            \
+               if (unlikely(iser_debug > 0)) {                         \
+                       printf("WARNING: %s: " X "\n",                  \
+                               __func__, ## __VA_ARGS__);              \
+               }                                                       \
+       } while (0)
+
+#define        ISER_ERR(X, ...)                                                
\
+       printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)
+
+/* iSER header version and read/write stag-valid flag bits. */
+#define ISER_VER                       0x10
+#define ISER_WSV                       0x08
+#define ISER_RSV                       0x04
+
+/* Sentinel wr_id values for fast-reg invalidate and drain-beacon WRs. */
+#define ISER_FASTREG_LI_WRID           0xffffffffffffffffULL
+#define ISER_BEACON_WRID               0xfffffffffffffffeULL
+
+#define SHIFT_4K       12
+#define SIZE_4K        (1ULL << SHIFT_4K)
+#define MASK_4K        (~(SIZE_4K-1))
+
+/* support up to 512KB in one RDMA */
+#define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K)
+#define ISER_DEF_XMIT_CMDS_MAX 256
+
+/*
+ * The max RX (recv) WR supported by the iSER QP is defined by
+ * max_recv_wr = commands_max + recv_beacon.
+ */
+#define ISER_QP_MAX_RECV_DTOS  (ISER_DEF_XMIT_CMDS_MAX + 1)
+#define ISER_MIN_POSTED_RX             (ISER_DEF_XMIT_CMDS_MAX >> 2)
+
+/* QP settings */
+/* Maximal bounds on received asynchronous PDUs */
+#define ISER_MAX_RX_MISC_PDUS           4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
+#define ISER_MAX_TX_MISC_PDUS           6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */
+
+/*
+ * The max TX (send) WR supported by the iSER QP is defined by
+ * max_send_wr = T * (1 + D) + C; D is how many inflight dataouts we
+ * expect to have at max for a SCSI command.  The tx posting & completion
+ * handling code supports the -EAGAIN scheme where tx is suspended till
+ * the QP has room for more send WRs.  D=8 comes from 64K/8K.
+ */
+
+#define ISER_INFLIGHT_DATAOUTS         8
+
+/* the send_beacon increase the max_send_wr by 1  */
+#define ISER_QP_MAX_REQ_DTOS           (ISER_DEF_XMIT_CMDS_MAX *    \
+                                       (1 + ISER_INFLIGHT_DATAOUTS) + \
+                                       ISER_MAX_TX_MISC_PDUS        + \
+                                       ISER_MAX_RX_MISC_PDUS + 1)
+
+#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr                      \
+                                        - ISER_MAX_TX_MISC_PDUS        \
+                                        - ISER_MAX_RX_MISC_PDUS - 1) / \
+                                        (1 + ISER_INFLIGHT_DATAOUTS))
+
+#define ISER_WC_BATCH_COUNT   16
+#define ISER_SIGNAL_CMD_COUNT 32
+
+/*
+ * Maximal QPs recommended per CQ.  In case we use more QPs per CQ we
+ * might encounter a CQ overrun state.
+ */
+#define ISCSI_ISER_MAX_CONN    8
+#define ISER_MAX_RX_LEN                (ISER_QP_MAX_RECV_DTOS * 
ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_TX_LEN                (ISER_QP_MAX_REQ_DTOS  * 
ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_CQ_LEN                (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
+                                ISCSI_ISER_MAX_CONN)
+
+/* Capability flags carried in iser_cm_hdr.flags. */
+#define ISER_ZBVA_NOT_SUPPORTED                0x80
+#define ISER_SEND_W_INV_NOT_SUPPORTED  0x40
+
+/* container_of helpers between icl objects and their iSER wrappers. */
+#define icl_to_iser_conn(ic) \
+       container_of(ic, struct iser_conn, icl_conn)
+#define icl_to_iser_pdu(ip) \
+       container_of(ip, struct icl_iser_pdu, icl_pdu)
+
+/**
+ * struct iser_hdr - iSER header
+ *
+ * @flags:        flags support (zbva, remote_inv)
+ * @rsvd:         reserved
+ * @write_stag:   write rkey
+ * @write_va:     write virtual address
+ * @read_stag:    read rkey
+ * @read_va:      read virtual address
+ */
+struct iser_hdr {
+       u8      flags;
+       u8      rsvd[3];
+       __be32  write_stag;
+       __be64  write_va;
+       __be32  read_stag;
+       __be64  read_va;
+} __attribute__((packed));
+
+/* iSER connection-manager private-data header (capability flags). */
+struct iser_cm_hdr {
+       u8      flags;
+       u8      rsvd[3];
+} __packed;
+
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)
+
+#define ISER_RECV_DATA_SEG_LEN 128
+#define ISER_RX_PAYLOAD_SIZE   (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+
+#define ISER_RX_LOGIN_SIZE     (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
+
+/* Connection state machine states. */
+enum iser_conn_state {
+       ISER_CONN_INIT,            /* descriptor allocd, no conn          */
+       ISER_CONN_PENDING,         /* in the process of being established */
+       ISER_CONN_UP,              /* up and running                      */
+       ISER_CONN_TERMINATING,     /* in the process of being terminated  */
+       ISER_CONN_DOWN,            /* shut down                           */
+       ISER_CONN_STATES_NUM
+};
+
+/* Per-task lifecycle state. */
+enum iser_task_status {
+       ISER_TASK_STATUS_INIT = 0,
+       ISER_TASK_STATUS_STARTED,
+       ISER_TASK_STATUS_COMPLETED
+};
+
+/* Data transfer direction, used to index per-direction task arrays. */
+enum iser_data_dir {
+       ISER_DIR_IN = 0,           /* to initiator */
+       ISER_DIR_OUT,              /* from initiator */
+       ISER_DIRS_NUM
+};
+
+/**
+ * struct iser_mem_reg - iSER memory registration info
+ *
+ * @sge:          memory region sg element
+ * @rkey:         memory region remote key
+ * @mem_h:        pointer to registration context (FMR/Fastreg)
+ */
+struct iser_mem_reg {
+       struct ib_sge    sge;
+       u32              rkey;
+       void            *mem_h;
+};
+
+/* Kind of payload carried by a TX descriptor. */
+enum iser_desc_type {
+       ISCSI_TX_CONTROL ,
+       ISCSI_TX_SCSI_COMMAND,
+       ISCSI_TX_DATAOUT
+};
+
+/**
+ * struct iser_data_buf - iSER data buffer
+ *
+ * @sg:           pointer to the sg list
+ * @size:         num entries of this sg
+ * @data_len:     total buffer byte len
+ * @dma_nents:    returned by dma_map_sg
+ * @copy_buf:     allocated copy buf for SGs unaligned
+ *                for rdma which are copied
+ * @orig_sg:      pointer to the original sg list (in case
+ *                we used a copy)
+ * @sg_single:    SG-ified clone of a non SG SC or
+ *                unaligned SG
+ */
+struct iser_data_buf {
+       struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
+       void               *sg;
+       unsigned int       size;
+       unsigned long      data_len;
+       unsigned int       dma_nents;
+       char               *copy_buf;
+       struct scatterlist *orig_sg;
+       struct scatterlist sg_single;
+  };
+
+/* fwd declarations */
+struct iser_conn;
+struct ib_conn;
+struct iser_device;
+
+/**
+ * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
+ *
+ * @iser_header:   iser header
+ * @iscsi_header:  iscsi header (bhs)
+ * @type:          command/control/dataout
+ * @dma_addr:      header buffer dma_address
+ * @tx_sg:         sg[0] points to iser/iscsi headers
+ *                 sg[1] optionally points to either of immediate data
+ *                 unsolicited data-out or control
+ * @num_sge:       number sges used on this TX task
+ * @mapped:        indicates if the descriptor is dma mapped
+ */
+struct iser_tx_desc {
+       struct iser_hdr              iser_header;
+       struct iscsi_bhs             iscsi_header __attribute__((packed));
+       enum   iser_desc_type        type;
+       u64                          dma_addr;
+       struct ib_sge                tx_sg[2];
+       int                          num_sge;
+       bool                         mapped;
+};
+
+/* Pad so each RX descriptor occupies a fixed 256-byte-aligned footprint. */
+#define ISER_RX_PAD_SIZE       (256 - (ISER_RX_PAYLOAD_SIZE + \
+                                       sizeof(u64) + sizeof(struct ib_sge)))
+/**
+ * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
+ *
+ * @iser_header:   iser header
+ * @iscsi_header:  iscsi header
+ * @data:          received data segment
+ * @dma_addr:      receive buffer dma address
+ * @rx_sg:         ib_sge of receive buffer
+ * @pad:           for sense data TODO: Modify to maximum sense length 
supported
+ */
+struct iser_rx_desc {
+       struct iser_hdr              iser_header;
+       struct iscsi_bhs             iscsi_header;
+       char                         data[ISER_RECV_DATA_SEG_LEN];
+       u64                          dma_addr;
+       struct ib_sge                rx_sg;
+       char                         pad[ISER_RX_PAD_SIZE];
+} __attribute__((packed));
+
+/*
+ * Per-task PDU context: embeds the generic icl PDU plus the TX
+ * descriptor, the CAM CCB, and per-direction RDMA registration state.
+ */
+struct icl_iser_pdu {
+       struct icl_pdu               icl_pdu;
+       struct iser_tx_desc          desc;
+       struct iser_conn             *iser_conn;
+       enum iser_task_status        status;
+       struct ccb_scsiio                        *csio;
+       int                          command_sent;
+       int                          dir[ISER_DIRS_NUM];
+       struct iser_mem_reg          rdma_reg[ISER_DIRS_NUM];
+       struct iser_data_buf         data[ISER_DIRS_NUM];
+};
+
+/**
+ * struct iser_comp - iSER completion context
+ *
+ * @device:     pointer to device handle
+ * @cq:         completion queue
+ * @wcs:        work completion array
+ * @tq:         taskqueue handle
+ * @task:       task to run task_fn
+ * @active_qps: Number of active QPs attached
+ *              to completion context
+ */
+struct iser_comp {
+       struct iser_device      *device;
+       struct ib_cq            *cq;
+       struct ib_wc             wcs[ISER_WC_BATCH_COUNT];
+       struct taskqueue        *tq;
+       struct task             task;
+       int                      active_qps;
+};
+
+/**
+ * struct iser_device - iSER device handle
+ *
+ * @ib_device:     RDMA device
+ * @pd:            Protection Domain for this device
+ * @dev_attr:      Device attributes container
+ * @mr:            Global DMA memory region
+ * @event_handler: IB events handle routine
+ * @ig_list:       entry in devices list
+ * @refcount:      Reference counter, dominated by open iser connections
+ * @comps_used:    Number of completion contexts used, Min between online
+ *                 cpus and device max completion vectors
+ * @comps:         Dynamically allocated array of completion handlers
+ */
+struct iser_device {
+       struct ib_device             *ib_device;
+       struct ib_pd                 *pd;
+       struct ib_device_attr        dev_attr;
+       struct ib_mr                 *mr;
+       struct ib_event_handler      event_handler;
+       struct list_head             ig_list;
+       int                          refcount;
+       int                          comps_used;
+       struct iser_comp             *comps;
+};
+
+/**
+ * struct iser_reg_resources - Fast registration resources
+ *
+ * @mr:         memory region
+ * @frpl:       fast reg page list
+ * @mr_valid:   is mr valid indicator
+ */
+struct iser_reg_resources {
+       struct ib_mr                     *mr;
+       struct ib_fast_reg_page_list     *frpl;
+       u8                                mr_valid:1;
+};
+
+/**
+ * struct fast_reg_descriptor - Fast registration descriptor
+ *
+ * @list:           entry in connection fastreg pool
+ * @rsc:            data buffer registration resources
+ */
+struct fast_reg_descriptor {
+       struct list_head                  list;
+       struct iser_reg_resources         rsc;
+};
+
+
+/**
+ * struct iser_beacon - beacon to signal all flush errors were drained
+ *
+ * @send:           send wr
+ * @recv:           recv wr
+ * @flush_lock:     protects flush_cv
+ * @flush_cv:       condition variable for beacon flush
+ */
+struct iser_beacon {
+       union {
+               struct ib_send_wr       send;
+               struct ib_recv_wr       recv;
+       };
+       struct mtx                   flush_lock;
+       struct cv                    flush_cv;
+};
+
+/**
+ * struct ib_conn - Infiniband related objects
+ *
+ * @cma_id:              rdma_cm connection manager handle
+ * @qp:                  Connection Queue-pair
+ * @device:              reference to iser device
+ * @comp:                iser completion context
+  */
+struct ib_conn {
+       struct rdma_cm_id           *cma_id;
+       struct ib_qp                *qp;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to