On Fri, 12 Nov 2004 12:18:32 -0500
Hal Rosenstock <[EMAIL PROTECTED]> wrote:

> On Fri, 2004-11-12 at 12:13, Sean Hefty wrote:
> > Not sure what the issue is.  Let me make sure that I've pulled the latest 
> > code and 
> > resubmit the patch.
> 
> It looks right to me. Does it work for you? Can you send a normal
> rather than unified diff?

Can you try this version?  I'll also revert back to the original code and see if
I can apply the patch.

- Sean


Index: include/ib_mad.h
===================================================================
--- include/ib_mad.h    (revision 1221)
+++ include/ib_mad.h    (working copy)
@@ -250,6 +250,8 @@
  * @mad_agent - Specifies the associated registration to post the send to.
  * @send_wr - Specifies the information needed to send the MAD(s).
  * @bad_send_wr - Specifies the MAD on which an error was encountered.
+ *
+ * Sent MADs are not guaranteed to complete in the order that they were posted.
  */
 int ib_post_send_mad(struct ib_mad_agent *mad_agent,
                     struct ib_send_wr *send_wr,
Index: core/mad.c
===================================================================
--- core/mad.c  (revision 1221)
+++ core/mad.c  (working copy)
@@ -90,6 +90,8 @@
                                    struct ib_mad_send_wc *mad_send_wc);
 static void timeout_sends(void *data);
 static int solicited_mad(struct ib_mad *mad);
+static int ib_mad_change_qp_state_to_rts(struct ib_qp *qp,
+                                        enum ib_qp_state cur_state);
 
 /*
  * Returns a ib_mad_port_private structure or NULL for a device/port.
@@ -591,6 +593,7 @@
                /* Timeout will be updated after send completes */
                mad_send_wr->timeout = msecs_to_jiffies(send_wr->wr.
                                                        ud.timeout_ms);
+               mad_send_wr->retry = 0;
                /* One reference for each work request to QP + response */
                mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
                mad_send_wr->status = IB_WC_SUCCESS;
@@ -1339,6 +1342,70 @@
        }
 }
 
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+       struct ib_mad_send_wr_private *mad_send_wr;
+       struct ib_mad_list_head *mad_list;
+       int flags;
+
+       spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+       list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+               mad_send_wr = container_of(mad_list,
+                                          struct ib_mad_send_wr_private,
+                                          mad_list);
+               mad_send_wr->retry = 1;
+       }
+       spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+                             struct ib_wc *wc)
+{
+       struct ib_mad_list_head *mad_list;
+       struct ib_mad_qp_info *qp_info;
+       struct ib_mad_send_wr_private *mad_send_wr;
+       int ret;
+
+       /* Determine if failure was a send or receive */
+       mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+       qp_info = mad_list->mad_queue->qp_info;
+       if (mad_list->mad_queue == &qp_info->recv_queue) {
+               /*
+                * Receive errors indicate that the QP has entered the error
+                * state - error handling/shutdown code will clean up.
+                */
+               return;
+       }
+
+       /*
+        * Send errors will transition the QP to SQE - move
+        * QP to RTS and repost flushed work requests.
+        */
+       mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+                                  mad_list);
+       if (wc->status == IB_WC_WR_FLUSH_ERR) {
+               if (mad_send_wr->retry) {
+                       /* Repost send. */
+                       struct ib_send_wr *bad_send_wr;
+
+                       mad_send_wr->retry = 0;
+                       ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+                                       &bad_send_wr);
+                       if (ret)
+                               ib_mad_send_done_handler(port_priv, wc);
+               } else
+                       ib_mad_send_done_handler(port_priv, wc);
+       } else {
+               /* Transition QP to RTS and fail offending send. */
+               ret = ib_mad_change_qp_state_to_rts(qp_info->qp, IB_QPS_SQE);
+               if (ret)
+                       printk(KERN_ERR PFX "mad_error_handler - unable to "
+                              "transition QP to RTS : %d\n", ret);
+               ib_mad_send_done_handler(port_priv, wc);
+               mark_sends_for_retry(qp_info);
+       }
+}
+
 /*
  * IB MAD completion callback
  */
@@ -1346,34 +1413,25 @@
 {
        struct ib_mad_port_private *port_priv;
        struct ib_wc wc;
-       struct ib_mad_list_head *mad_list;
-       struct ib_mad_qp_info *qp_info;
 
        port_priv = (struct ib_mad_port_private*)data;
        ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
        
        while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
-               if (wc.status != IB_WC_SUCCESS) {
-                       /* Determine if failure was a send or receive */
-                       mad_list = (struct ib_mad_list_head *)
-                                  (unsigned long)wc.wr_id;
-                       qp_info = mad_list->mad_queue->qp_info;
-                       if (mad_list->mad_queue == &qp_info->send_queue)
-                               wc.opcode = IB_WC_SEND;
-                       else
-                               wc.opcode = IB_WC_RECV;
-               }
-               switch (wc.opcode) {
-               case IB_WC_SEND:
-                       ib_mad_send_done_handler(port_priv, &wc);
-                       break;
-               case IB_WC_RECV:
-                       ib_mad_recv_done_handler(port_priv, &wc);
-                       break;
-               default:
-                       BUG_ON(1);
-                       break;
-               }
+               if (wc.status == IB_WC_SUCCESS) {
+                       switch (wc.opcode) {
+                       case IB_WC_SEND:
+                               ib_mad_send_done_handler(port_priv, &wc);
+                               break;
+                       case IB_WC_RECV:
+                               ib_mad_recv_done_handler(port_priv, &wc);
+                               break;
+                       default:
+                               BUG_ON(1);
+                               break;
+                       }
+               } else
+                       mad_error_handler(port_priv, &wc);
        }
 }
 
@@ -1717,7 +1775,8 @@
 /*
  * Modify QP into Ready-To-Send state
  */
-static inline int ib_mad_change_qp_state_to_rts(struct ib_qp *qp)
+static int ib_mad_change_qp_state_to_rts(struct ib_qp *qp,
+                                        enum ib_qp_state cur_state)
 {
        int ret;
        struct ib_qp_attr *attr;
@@ -1729,11 +1788,12 @@
                       "ib_qp_attr\n");
                return -ENOMEM;
        }
-
        attr->qp_state = IB_QPS_RTS;
-       attr->sq_psn = IB_MAD_SEND_Q_PSN;
-       attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
-
+       attr_mask = IB_QP_STATE;
+       if (cur_state == IB_QPS_RTR) {
+               attr->sq_psn = IB_MAD_SEND_Q_PSN;
+               attr_mask |= IB_QP_SQ_PSN;
+       }
        ret = ib_modify_qp(qp, attr, attr_mask);
        kfree(attr);
 
@@ -1793,7 +1853,8 @@
                        goto error;
                }
 
-               ret = ib_mad_change_qp_state_to_rts(port_priv->qp_info[i].qp);
+               ret = ib_mad_change_qp_state_to_rts(port_priv->qp_info[i].qp,
+                                                   IB_QPS_RTR);
                if (ret) {
                        printk(KERN_ERR PFX "Couldn't change QP%d state to "
                               "RTS\n", i);
@@ -1852,6 +1913,15 @@
        }
 }
 
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+       struct ib_mad_qp_info   *qp_info = qp_context;
+
+       /* It's worse than that! He's dead, Jim! */
+       printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n",
+               event->event, qp_info->qp->qp_num);
+}
+
 static void init_mad_queue(struct ib_mad_qp_info *qp_info,
                           struct ib_mad_queue *mad_queue)
 {
@@ -1884,6 +1954,8 @@
        qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
        qp_init_attr.qp_type = qp_type;
        qp_init_attr.port_num = port_priv->port_num;
+       qp_init_attr.qp_context = qp_info;
+       qp_init_attr.event_handler = qp_event_handler;
        qp_info->qp = ib_create_qp(port_priv->pd, &qp_init_attr);
        if (IS_ERR(qp_info->qp)) {
                printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
Index: core/mad_priv.h
===================================================================
--- core/mad_priv.h     (revision 1221)
+++ core/mad_priv.h     (working copy)
@@ -127,6 +127,7 @@
        u64 wr_id;                      /* client WR ID */
        u64 tid;
        unsigned long timeout;
+       int retry;
        int refcount;
        enum ib_wc_status status;
 };
_______________________________________________
openib-general mailing list
[EMAIL PROTECTED]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to