Rick, have you had a chance to test out this patch?

- Sean


>If a user allocates a QP on an rdma_cm_id, the rdma_cm will automatically
>transition the QP through its states (RTR, RTS, error, etc.)  While the
>QP state transitions are occurring, the QP itself must remain valid.
>Provide locking around the QP pointer to prevent its destruction while
>accessing the pointer.
>
>This fixes an issue reported by Olaf Kirch from Oracle that resulted in
>a system crash:
>
>"An incoming connection arrives and we decide to tear down the nascent
> connection.  The remote ends decides to do the same.  We start to shut
> down the connection, and call rdma_destroy_qp on our cm_id. ... Now
> apparently a 'connect reject' message comes in from the other host,
> and cma_ib_handler() is called with an event of IB_CM_REJ_RECEIVED.
> It calls cma_modify_qp_err, which for some odd reason tries to modify
> the exact same QP we just destroyed."
>
>Signed-off-by: Sean Hefty <[EMAIL PROTECTED]>
>---
>Rick, can you please test this patch and let me know if it fixes your problem?
>
> drivers/infiniband/core/cma.c |   90 +++++++++++++++++++++++++++--------------
> 1 files changed, 60 insertions(+), 30 deletions(-)
>
>diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
>index 9ffb998..c6a6dba 100644
>--- a/drivers/infiniband/core/cma.c
>+++ b/drivers/infiniband/core/cma.c
>@@ -120,6 +120,8 @@ struct rdma_id_private {
>
>       enum cma_state          state;
>       spinlock_t              lock;
>+      struct mutex            qp_mutex;
>+
>       struct completion       comp;
>       atomic_t                refcount;
>       wait_queue_head_t       wait_remove;
>@@ -387,6 +389,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler
>event_handler,
>       id_priv->id.event_handler = event_handler;
>       id_priv->id.ps = ps;
>       spin_lock_init(&id_priv->lock);
>+      mutex_init(&id_priv->qp_mutex);
>       init_completion(&id_priv->comp);
>       atomic_set(&id_priv->refcount, 1);
>       init_waitqueue_head(&id_priv->wait_remove);
>@@ -472,61 +475,86 @@ EXPORT_SYMBOL(rdma_create_qp);
>
> void rdma_destroy_qp(struct rdma_cm_id *id)
> {
>-      ib_destroy_qp(id->qp);
>+      struct rdma_id_private *id_priv;
>+
>+      id_priv = container_of(id, struct rdma_id_private, id);
>+      mutex_lock(&id_priv->qp_mutex);
>+      ib_destroy_qp(id_priv->id.qp);
>+      id_priv->id.qp = NULL;
>+      mutex_unlock(&id_priv->qp_mutex);
> }
> EXPORT_SYMBOL(rdma_destroy_qp);
>
>-static int cma_modify_qp_rtr(struct rdma_cm_id *id)
>+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv)
> {
>       struct ib_qp_attr qp_attr;
>       int qp_attr_mask, ret;
>
>-      if (!id->qp)
>-              return 0;
>+      mutex_lock(&id_priv->qp_mutex);
>+      if (!id_priv->id.qp) {
>+              ret = 0;
>+              goto out;
>+      }
>
>       /* Need to update QP attributes from default values. */
>       qp_attr.qp_state = IB_QPS_INIT;
>-      ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+      ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
>       if (ret)
>-              return ret;
>+              goto out;
>
>-      ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+      ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
>       if (ret)
>-              return ret;
>+              goto out;
>
>       qp_attr.qp_state = IB_QPS_RTR;
>-      ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+      ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
>       if (ret)
>-              return ret;
>+              goto out;
>
>-      return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+      ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
>+out:
>+      mutex_unlock(&id_priv->qp_mutex);
>+      return ret;
> }
>
>-static int cma_modify_qp_rts(struct rdma_cm_id *id)
>+static int cma_modify_qp_rts(struct rdma_id_private *id_priv)
> {
>       struct ib_qp_attr qp_attr;
>       int qp_attr_mask, ret;
>
>-      if (!id->qp)
>-              return 0;
>+      mutex_lock(&id_priv->qp_mutex);
>+      if (!id_priv->id.qp) {
>+              ret = 0;
>+              goto out;
>+      }
>
>       qp_attr.qp_state = IB_QPS_RTS;
>-      ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+      ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
>       if (ret)
>-              return ret;
>+              goto out;
>
>-      return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+      ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
>+out:
>+      mutex_unlock(&id_priv->qp_mutex);
>+      return ret;
> }
>
>-static int cma_modify_qp_err(struct rdma_cm_id *id)
>+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
> {
>       struct ib_qp_attr qp_attr;
>+      int ret;
>
>-      if (!id->qp)
>-              return 0;
>+      mutex_lock(&id_priv->qp_mutex);
>+      if (!id_priv->id.qp) {
>+              ret = 0;
>+              goto out;
>+      }
>
>       qp_attr.qp_state = IB_QPS_ERR;
>-      return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE);
>+      ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
>+out:
>+      mutex_unlock(&id_priv->qp_mutex);
>+      return ret;
> }
>
> static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
>@@ -855,11 +883,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
> {
>       int ret;
>
>-      ret = cma_modify_qp_rtr(&id_priv->id);
>+      ret = cma_modify_qp_rtr(id_priv);
>       if (ret)
>               goto reject;
>
>-      ret = cma_modify_qp_rts(&id_priv->id);
>+      ret = cma_modify_qp_rts(id_priv);
>       if (ret)
>               goto reject;
>
>@@ -869,7 +897,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
>
>       return 0;
> reject:
>-      cma_modify_qp_err(&id_priv->id);
>+      cma_modify_qp_err(id_priv);
>       ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
>                      NULL, 0, NULL, 0);
>       return ret;
>@@ -945,7 +973,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct
>ib_cm_event *ib_event)
>               /* ignore event */
>               goto out;
>       case IB_CM_REJ_RECEIVED:
>-              cma_modify_qp_err(&id_priv->id);
>+              cma_modify_qp_err(id_priv);
>               event.status = ib_event->param.rej_rcvd.reason;
>               event.event = RDMA_CM_EVENT_REJECTED;
>               event.param.conn.private_data = ib_event->private_data;
>@@ -2236,7 +2264,7 @@ static int cma_connect_iw(struct rdma_id_private
>*id_priv,
>       sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
>       cm_id->remote_addr = *sin;
>
>-      ret = cma_modify_qp_rtr(&id_priv->id);
>+      ret = cma_modify_qp_rtr(id_priv);
>       if (ret)
>               goto out;
>
>@@ -2303,7 +2331,7 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
>       int qp_attr_mask, ret;
>
>       if (id_priv->id.qp) {
>-              ret = cma_modify_qp_rtr(&id_priv->id);
>+              ret = cma_modify_qp_rtr(id_priv);
>               if (ret)
>                       goto out;
>
>@@ -2342,7 +2370,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
>       struct iw_cm_conn_param iw_param;
>       int ret;
>
>-      ret = cma_modify_qp_rtr(&id_priv->id);
>+      ret = cma_modify_qp_rtr(id_priv);
>       if (ret)
>               return ret;
>
>@@ -2414,7 +2442,7 @@ int rdma_accept(struct rdma_cm_id *id, struct
>rdma_conn_param *conn_param)
>
>       return 0;
> reject:
>-      cma_modify_qp_err(id);
>+      cma_modify_qp_err(id_priv);
>       rdma_reject(id, NULL, 0);
>       return ret;
> }
>@@ -2484,7 +2512,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
>
>       switch (rdma_node_get_transport(id->device->node_type)) {
>       case RDMA_TRANSPORT_IB:
>-              ret = cma_modify_qp_err(id);
>+              ret = cma_modify_qp_err(id_priv);
>               if (ret)
>                       goto out;
>               /* Initiate or respond to a disconnect. */
>@@ -2515,9 +2543,11 @@ static int cma_ib_mc_handler(int status, struct
>ib_sa_multicast *multicast)
>           cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
>               return 0;
>
>+      mutex_lock(&id_priv->qp_mutex);
>       if (!status && id_priv->id.qp)
>               status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
>                                        multicast->rec.mlid);
>+      mutex_unlock(&id_priv->qp_mutex);
>
>       memset(&event, 0, sizeof event);
>       event.status = status;
>
>_______________________________________________
>general mailing list
>[email protected]
>http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
>To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to