Hi Roman,

Some comments below.

On 02/02/2018 04:08 PM, Roman Pen wrote:
This is the main functionality of the ibtrs-server module, which accepts
a set of RDMA connections (a so-called IBTRS session), creates/destroys
sysfs entries associated with an IBTRS session and notifies the upper layer
(the user of the IBTRS API) about RDMA requests or link events.

Signed-off-by: Roman Pen <[email protected]>
Signed-off-by: Danil Kipnis <[email protected]>
Cc: Jack Wang <[email protected]>
---
  drivers/infiniband/ulp/ibtrs/ibtrs-srv.c | 1811 ++++++++++++++++++++++++++++++
  1 file changed, 1811 insertions(+)

diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c b/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c
new file mode 100644
index 000000000000..0d1fc08bd821
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c
@@ -0,0 +1,1811 @@
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <[email protected]>
+ *          Jack Wang <[email protected]>
+ *          Kleber Souza <[email protected]>
+ *          Danil Kipnis <[email protected]>
+ *          Roman Penyaev <[email protected]>
+ *          Milind Dumbare <[email protected]>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <[email protected]>
+ *          Roman Penyaev <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/module.h>
+#include <linux/mempool.h>
+
+#include "ibtrs-srv.h"
+#include "ibtrs-log.h"
+
+MODULE_AUTHOR("[email protected]");
+MODULE_DESCRIPTION("IBTRS Server");
+MODULE_VERSION(IBTRS_VER_STRING);
+MODULE_LICENSE("GPL");
+
+#define DEFAULT_MAX_IO_SIZE_KB 128
+#define DEFAULT_MAX_IO_SIZE (DEFAULT_MAX_IO_SIZE_KB * 1024)
+#define MAX_REQ_SIZE PAGE_SIZE
+#define MAX_SG_COUNT ((MAX_REQ_SIZE - sizeof(struct ibtrs_msg_rdma_read)) \
+                     / sizeof(struct ibtrs_sg_desc))
+
+static int max_io_size = DEFAULT_MAX_IO_SIZE;
+static int rcv_buf_size = DEFAULT_MAX_IO_SIZE + MAX_REQ_SIZE;
+
+/*
+ * Setter for the 'max_io_size' module parameter.
+ *
+ * Parses @val as an integer and accepts it only if it is at least 4096,
+ * if value + MAX_REQ_SIZE does not exceed 4096 * 1024 and if
+ * value + MAX_REQ_SIZE is a multiple of 512.  On success both max_io_size
+ * and the derived rcv_buf_size are updated.
+ *
+ * Returns 0 on success, the kstrtoint() error if @val cannot be parsed,
+ * or -EINVAL if the value is rejected.
+ */
+static int max_io_size_set(const char *val, const struct kernel_param *kp)
+{
+       int err, ival;
+
+       err = kstrtoint(val, 0, &ival);
+       if (err)
+               return err;
+
+       if (ival < 4096 || ival + MAX_REQ_SIZE > (4096 * 1024) ||
+           (ival + MAX_REQ_SIZE) % 512 != 0) {
+               /* Report the limits exactly as they are checked above */
+               pr_err("Invalid max io size value %d, has to be >= %d,"
+                      " value + %lu has to be <= %d and a multiple of 512\n",
+                      ival, 4096, (unsigned long)MAX_REQ_SIZE, 4096 * 1024);
+               return -EINVAL;
+       }
+
+       max_io_size = ival;
+       /* Receive buffer holds the IO payload plus one request message */
+       rcv_buf_size = max_io_size + MAX_REQ_SIZE;
+       pr_info("max io size changed to %d\n", ival);
+
+       return 0;
+}
+
+static const struct kernel_param_ops max_io_size_ops = {
+       .set            = max_io_size_set,
+       .get            = param_get_int,
+};
+module_param_cb(max_io_size, &max_io_size_ops, &max_io_size, 0444);
+MODULE_PARM_DESC(max_io_size,
+                "Max size for each IO request, when change the unit is in byte"
+                " (default: " __stringify(DEFAULT_MAX_IO_SIZE_KB) "KB)");
+
+#define DEFAULT_SESS_QUEUE_DEPTH 512
+static int sess_queue_depth = DEFAULT_SESS_QUEUE_DEPTH;
+module_param_named(sess_queue_depth, sess_queue_depth, int, 0444);
+MODULE_PARM_DESC(sess_queue_depth,
+                "Number of buffers for pending I/O requests to allocate"
+                " per session. Maximum: " __stringify(MAX_SESS_QUEUE_DEPTH)
+                " (default: " __stringify(DEFAULT_SESS_QUEUE_DEPTH) ")");
+
+/* We guarantee to serve 10 paths at least */
+#define CHUNK_POOL_SIZE (DEFAULT_SESS_QUEUE_DEPTH * 10)
+static mempool_t *chunk_pool;
+
+static int retry_count = 7;
+
+/*
+ * Setter for the 'retry_count' module parameter.
+ *
+ * Parses @val as an integer and accepts it only if it lies in the
+ * inclusive range [MIN_RTR_CNT, MAX_RTR_CNT].
+ *
+ * Returns 0 on success, the kstrtoint() error if @val cannot be parsed,
+ * or -EINVAL if the value is out of range.
+ */
+static int retry_count_set(const char *val, const struct kernel_param *kp)
+{
+       int err, ival;
+
+       err = kstrtoint(val, 0, &ival);
+       if (err)
+               return err;
+
+       if (ival < MIN_RTR_CNT || ival > MAX_RTR_CNT) {
+               /* Both bounds are themselves valid values */
+               pr_err("Invalid retry count value %d, has to be"
+                      " >= %d, <= %d\n", ival, MIN_RTR_CNT, MAX_RTR_CNT);
+               return -EINVAL;
+       }
+
+       retry_count = ival;
+       pr_info("QP retry count changed to %d\n", ival);
+
+       return 0;
+}
+
+static const struct kernel_param_ops retry_count_ops = {
+       .set            = retry_count_set,
+       .get            = param_get_int,
+};
+module_param_cb(retry_count, &retry_count_ops, &retry_count, 0644);
+
+/* The default quoted here must match the initial value of retry_count */
+MODULE_PARM_DESC(retry_count, "Number of times to send the message if the"
+                " remote side didn't respond with Ack or Nack (default: 7,"
+                " min: " __stringify(MIN_RTR_CNT) ", max: "
+                __stringify(MAX_RTR_CNT) ")");
+
+static char cq_affinity_list[256] = "";
+static cpumask_t cq_affinity_mask = { CPU_BITS_ALL };
+
+static void init_cq_affinity(void)
+{
+       sprintf(cq_affinity_list, "0-%d", nr_cpu_ids - 1);
+}
+
+static int cq_affinity_list_set(const char *val, const struct kernel_param *kp)
+{
+       int ret = 0, len = strlen(val);
+       cpumask_var_t new_value;
+
+       if (!strlen(cq_affinity_list))
+               init_cq_affinity();
+
+       if (len >= sizeof(cq_affinity_list))
+               return -EINVAL;
+       if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+               return -ENOMEM;
+
+       ret = cpulist_parse(val, new_value);
+       if (ret) {
+               pr_err("Can't set cq_affinity_list \"%s\": %d\n", val,
+                      ret);
+               goto free_cpumask;
+       }
+
+       strlcpy(cq_affinity_list, val, sizeof(cq_affinity_list));
+       *strchrnul(cq_affinity_list, '\n') = '\0';
+       cpumask_copy(&cq_affinity_mask, new_value);
+
+       pr_info("cq_affinity_list changed to %*pbl\n",
+               cpumask_pr_args(&cq_affinity_mask));
+free_cpumask:
+       free_cpumask_var(new_value);
+       return ret;
+}
+
+static struct kparam_string cq_affinity_list_kparam_str = {
+       .maxlen = sizeof(cq_affinity_list),
+       .string = cq_affinity_list
+};
+
+static const struct kernel_param_ops cq_affinity_list_ops = {
+       .set    = cq_affinity_list_set,
+       .get    = param_get_string,
+};
+
+/* Expose cq_affinity_list as a string parameter with a custom setter */
+module_param_cb(cq_affinity_list, &cq_affinity_list_ops,
+               &cq_affinity_list_kparam_str, 0644);
+MODULE_PARM_DESC(cq_affinity_list,
+                "Sets the list of cpus to use as cq vectors."
+                " (default: use all possible CPUs)");
+

Can you explain why you are not using configfs?

+/*
+ * Work handler that tears down the session embedding @work (as its
+ * close_work member) and frees it.
+ *
+ * In order: removes the session's sysfs files and stops the heartbeat,
+ * disconnects and drains every connection, waits for inflight operations,
+ * notifies the upper layer, releases buffers and ops ids, destroys each
+ * connection, drops the device and server references and finally frees
+ * the session memory.
+ */
+static void ibtrs_srv_close_work(struct work_struct *work)
+{
+       struct ibtrs_srv_sess *sess;
+       struct ibtrs_srv_con *con;
+       int i;
+
+       sess = container_of(work, typeof(*sess), close_work);
+
+       ibtrs_srv_destroy_sess_files(sess);
+       ibtrs_srv_stop_hb(sess);
+
+       for (i = 0; i < sess->s.con_num; i++) {
+               con = to_srv_con(sess->s.con[i]);
+               /* Slots without a connection are skipped */
+               if (!con)
+                       continue;
+
+               rdma_disconnect(con->c.cm_id);
+               ib_drain_qp(con->c.qp);
+       }
+       /* Wait for all inflights */
+       ibtrs_srv_wait_ops_ids(sess);
+
+       /* Notify upper layer if we are the last path */
+       ibtrs_srv_sess_down(sess);
+
+       unmap_cont_bufs(sess);
+       ibtrs_srv_free_ops_ids(sess);
+
+       for (i = 0; i < sess->s.con_num; i++) {
+               con = to_srv_con(sess->s.con[i]);
+               if (!con)
+                       continue;
+
+               ibtrs_cq_qp_destroy(&con->c);
+               rdma_destroy_id(con->c.cm_id);
+               kfree(con);
+       }
+       ibtrs_ib_dev_put(sess->s.ib_dev);
+
+       del_path_from_srv(sess);
+       put_srv(sess->srv);
+       sess->srv = NULL;
+       ibtrs_srv_change_state(sess, IBTRS_SRV_CLOSED);
+
+       kfree(sess->rdma_addr);
+       kfree(sess->s.con);
+       kfree(sess);
+}
+
+static int ibtrs_rdma_do_accept(struct ibtrs_srv_sess *sess,
+                               struct rdma_cm_id *cm_id)
+{
+       struct ibtrs_srv *srv = sess->srv;
+       struct ibtrs_msg_conn_rsp msg;
+       struct rdma_conn_param param;
+       int err;
+
+       memset(&param, 0, sizeof(param));
+       param.retry_count = retry_count;
+       param.rnr_retry_count = 7;
+       param.private_data = &msg;
+       param.private_data_len = sizeof(msg);
+
+       memset(&msg, 0, sizeof(msg));
+       msg.magic = cpu_to_le16(IBTRS_MAGIC);
+       msg.version = cpu_to_le16(IBTRS_VERSION);
+       msg.errno = 0;
+       msg.queue_depth = cpu_to_le16(srv->queue_depth);
+       msg.rkey = cpu_to_le32(sess->s.ib_dev->rkey);

As said, this cannot happen anymore...

+static struct rdma_cm_id *ibtrs_srv_cm_init(struct ibtrs_srv_ctx *ctx,
+                                           struct sockaddr *addr,
+                                           enum rdma_port_space ps)
+{
+       struct rdma_cm_id *cm_id;
+       int ret;
+
+       cm_id = rdma_create_id(&init_net, ibtrs_srv_rdma_cm_handler,
+                              ctx, ps, IB_QPT_RC);
+       if (IS_ERR(cm_id)) {
+               ret = PTR_ERR(cm_id);
+               pr_err("Creating id for RDMA connection failed, err: %d\n",
+                      ret);
+               goto err_out;
+       }
+       ret = rdma_bind_addr(cm_id, addr);
+       if (ret) {
+               pr_err("Binding RDMA address failed, err: %d\n", ret);
+               goto err_cm;
+       }
+       ret = rdma_listen(cm_id, 64);
+       if (ret) {
+               pr_err("Listening on RDMA connection failed, err: %d\n",
+                      ret);
+               goto err_cm;
+       }
+
+       switch (addr->sa_family) {
+       case AF_INET:
+               pr_debug("listening on port %u\n",
+                        ntohs(((struct sockaddr_in *)addr)->sin_port));
+               break;
+       case AF_INET6:
+               pr_debug("listening on port %u\n",
+                        ntohs(((struct sockaddr_in6 *)addr)->sin6_port));
+               break;
+       case AF_IB:
+               pr_debug("listening on service id 0x%016llx\n",
+                        be64_to_cpu(rdma_get_service_id(cm_id, addr)));
+               break;
+       default:
+               pr_debug("listening on address family %u\n", addr->sa_family);
+       }

We already have printk format specifiers that accept a socket address
(see %pIS in printk-formats)...

+
+       return cm_id;
+
+err_cm:
+       rdma_destroy_id(cm_id);
+err_out:
+
+       return ERR_PTR(ret);
+}
+
+static int ibtrs_srv_rdma_init(struct ibtrs_srv_ctx *ctx, unsigned int port)
+{
+       struct sockaddr_in6 sin = {
+               .sin6_family    = AF_INET6,
+               .sin6_addr      = IN6ADDR_ANY_INIT,
+               .sin6_port      = htons(port),
+       };
+       struct sockaddr_ib sib = {
+               .sib_family                     = AF_IB,
+               .sib_addr.sib_subnet_prefix     = 0ULL,
+               .sib_addr.sib_interface_id      = 0ULL,
+               .sib_sid        = cpu_to_be64(RDMA_IB_IP_PS_IB | port),
+               .sib_sid_mask   = cpu_to_be64(0xffffffffffffffffULL),
+               .sib_pkey       = cpu_to_be16(0xffff),
+       };

What about IPv4? Only AF_INET6 and AF_IB addresses are set up here.

+       struct rdma_cm_id *cm_ip, *cm_ib;
+       int ret;
+
+       /*
+        * We accept both IPoIB and IB connections, so we need to keep
+        * two cm id's, one for each socket type and port space.
+        * If the cm initialization of one of the id's fails, we abort
+        * everything.
+        */
+       cm_ip = ibtrs_srv_cm_init(ctx, (struct sockaddr *)&sin, RDMA_PS_TCP);
+       if (unlikely(IS_ERR(cm_ip)))
+               return PTR_ERR(cm_ip);
+
+       cm_ib = ibtrs_srv_cm_init(ctx, (struct sockaddr *)&sib, RDMA_PS_IB);
+       if (unlikely(IS_ERR(cm_ib))) {
+               ret = PTR_ERR(cm_ib);
+               goto free_cm_ip;
+       }
+
+       ctx->cm_id_ip = cm_ip;
+       ctx->cm_id_ib = cm_ib;
+
+       return 0;
+
+free_cm_ip:
+       rdma_destroy_id(cm_ip);
+
+       return ret;
+}
+
+static struct ibtrs_srv_ctx *alloc_srv_ctx(rdma_ev_fn *rdma_ev,
+                                          link_ev_fn *link_ev)
+{
+       struct ibtrs_srv_ctx *ctx;
+
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return NULL;
+
+       ctx->rdma_ev = rdma_ev;
+       ctx->link_ev = link_ev;
+       mutex_init(&ctx->srv_mutex);
+       INIT_LIST_HEAD(&ctx->srv_list);
+
+       return ctx;
+}
+
+static void free_srv_ctx(struct ibtrs_srv_ctx *ctx)
+{
+       WARN_ON(!list_empty(&ctx->srv_list));
+       kfree(ctx);
+}
+
+struct ibtrs_srv_ctx *ibtrs_srv_open(rdma_ev_fn *rdma_ev, link_ev_fn *link_ev,
+                                    unsigned int port)
+{
+       struct ibtrs_srv_ctx *ctx;
+       int err;
+
+       ctx = alloc_srv_ctx(rdma_ev, link_ev);
+       if (unlikely(!ctx))
+               return ERR_PTR(-ENOMEM);
+
+       err = ibtrs_srv_rdma_init(ctx, port);
+       if (unlikely(err)) {
+               free_srv_ctx(ctx);
+               return ERR_PTR(err);
+       }
+       /* Do not let module be unloaded if server context is alive */
+       __module_get(THIS_MODULE);
+
+       return ctx;
+}
+EXPORT_SYMBOL(ibtrs_srv_open);
+
+void ibtrs_srv_queue_close(struct ibtrs_srv_sess *sess)
+{
+       close_sess(sess);
+}
+
+static void close_sess(struct ibtrs_srv_sess *sess)
+{
+       enum ibtrs_srv_state old_state;
+
+       if (ibtrs_srv_change_state_get_old(sess, IBTRS_SRV_CLOSING,
+                                          &old_state))
+               queue_work(ibtrs_wq, &sess->close_work);
+       WARN_ON(sess->state != IBTRS_SRV_CLOSING);
+}
+
+static void close_sessions(struct ibtrs_srv *srv)
+{
+       struct ibtrs_srv_sess *sess;
+
+       mutex_lock(&srv->paths_mutex);
+       list_for_each_entry(sess, &srv->paths_list, s.entry)
+               close_sess(sess);
+       mutex_unlock(&srv->paths_mutex);
+}
+
+static void close_ctx(struct ibtrs_srv_ctx *ctx)
+{
+       struct ibtrs_srv *srv;
+
+       mutex_lock(&ctx->srv_mutex);
+       list_for_each_entry(srv, &ctx->srv_list, ctx_list)
+               close_sessions(srv);
+       mutex_unlock(&ctx->srv_mutex);
+       flush_workqueue(ibtrs_wq);
+}
+
+void ibtrs_srv_close(struct ibtrs_srv_ctx *ctx)
+{
+       rdma_destroy_id(ctx->cm_id_ip);
+       rdma_destroy_id(ctx->cm_id_ib);
+       close_ctx(ctx);
+       free_srv_ctx(ctx);
+       module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL(ibtrs_srv_close);
+
+static int check_module_params(void)
+{
+       if (sess_queue_depth < 1 || sess_queue_depth > MAX_SESS_QUEUE_DEPTH) {
+               pr_err("Invalid sess_queue_depth parameter value\n");
+               return -EINVAL;
+       }
+
+       /* check if IB immediate data size is enough to hold the mem_id and the
+        * offset inside the memory chunk
+        */
+       if (ilog2(sess_queue_depth - 1) + ilog2(rcv_buf_size - 1) >
+           MAX_IMM_PAYL_BITS) {
+               pr_err("RDMA immediate size (%db) not enough to encode "
+                      "%d buffers of size %dB. Reduce 'sess_queue_depth' "
+                      "or 'max_io_size' parameters.\n", MAX_IMM_PAYL_BITS,
+                      sess_queue_depth, rcv_buf_size);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Module entry point.
+ *
+ * Fills in the default cq_affinity_list if none was given, validates the
+ * module parameters, preallocates the pool of receive chunks, creates the
+ * ibtrs workqueue and registers the module-level sysfs files.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything that
+ * was set up before the failing step is torn down again.
+ */
+static int __init ibtrs_server_init(void)
+{
+       int err;
+
+       if (!strlen(cq_affinity_list))
+               init_cq_affinity();
+
+       pr_info("Loading module %s, version: %s "
+               "(retry_count: %d, cq_affinity_list: %s, "
+               "max_io_size: %d, sess_queue_depth: %d)\n",
+               KBUILD_MODNAME, IBTRS_VER_STRING, retry_count,
+               cq_affinity_list, max_io_size, sess_queue_depth);
+
+       err = check_module_params();
+       if (err) {
+               pr_err("Failed to load module, invalid module parameters,"
+                      " err: %d\n", err);
+               return err;
+       }
+       chunk_pool = mempool_create_page_pool(CHUNK_POOL_SIZE,
+                                             get_order(rcv_buf_size));
+       if (unlikely(!chunk_pool)) {
+               pr_err("Failed preallocate pool of chunks\n");
+               return -ENOMEM;
+       }
+       ibtrs_wq = alloc_workqueue("ibtrs_server_wq", WQ_MEM_RECLAIM, 0);
+       if (!ibtrs_wq) {
+               pr_err("Failed to load module, alloc ibtrs_server_wq failed\n");
+               /*
+                * err is still 0 from check_module_params() here; without
+                * setting it the module load would be reported as successful.
+                */
+               err = -ENOMEM;
+               goto out_chunk_pool;
+       }
+       err = ibtrs_srv_create_sysfs_module_files();
+       if (err) {
+               pr_err("Failed to load module, can't create sysfs files,"
+                      " err: %d\n", err);
+               goto out_ibtrs_wq;
+       }
+
+       return 0;
+
+out_ibtrs_wq:
+       destroy_workqueue(ibtrs_wq);
+out_chunk_pool:
+       mempool_destroy(chunk_pool);
+
+       return err;
+}
+
+static void __exit ibtrs_server_exit(void)
+{
+       ibtrs_srv_destroy_sysfs_module_files();
+       destroy_workqueue(ibtrs_wq);
+       mempool_destroy(chunk_pool);
+}
+
+module_init(ibtrs_server_init);
+module_exit(ibtrs_server_exit);

Reply via email to