> I prefer the original patch.  The reason is: drain_list only ever has
> one element, which is the first element of the flush list.  Whenever its
> drain WR is processed, it is put on the reap_list, so why do we need
> drain_list here?

Maybe I'm missing something.  If we get rid of drain_list, how does
ipoib_cm_start_rx_drain() know if a drain WQE is pending or not?

I guess we can look at how many entries the flush_list has, something
like this?  (compile tested only)
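
Just to spell out the invariant the check relies on: rx_flush_list having
exactly one entry right after the event handler queues a context means no
drain WR can be outstanding, because a drain WR is only ever posted for the
current head of that list.  Here is a throwaway userspace toy that replays
that check -- the list helpers are my own minimal reimplementations (named
after the <linux/list.h> ones, but not the real thing), and the QP numbers
are just the ones from the changelog below:

/*
 * Standalone illustration only, not IPoIB code: a tiny circular list with
 * a list_is_singular()-style check, showing why "exactly one entry on the
 * flush list" means no drain WR is currently in flight.
 */
#include <stdio.h>
#include <stdbool.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

static bool list_empty(const struct list_head *h) { return h->next == h; }

/* True iff the list has exactly one entry. */
static bool list_is_singular(const struct list_head *h)
{
        return !list_empty(h) && h->next == h->prev;
}

int main(void)
{
        struct list_head flush_list, qp100, qp200, qp300;

        INIT_LIST_HEAD(&flush_list);

        /* Last WQE reached for QP100: the list becomes singular, so the
         * event handler posts the one and only drain WR now. */
        list_add_tail(&qp100, &flush_list);
        printf("after QP100: singular=%d -> post drain WR\n",
               list_is_singular(&flush_list));

        /* QP200 and QP300 arrive while QP100's drain is still in flight:
         * the list is no longer singular, so nothing extra is posted. */
        list_add_tail(&qp200, &flush_list);
        list_add_tail(&qp300, &flush_list);
        printf("after QP300: singular=%d -> drain already pending\n",
               list_is_singular(&flush_list));

        return 0;
}

Anyway, the patch itself: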

commit 17120d826ec4c12f71d075b69a836f0d4f5ac499
Author: Roland Dreier <[EMAIL PROTECTED]>
Date:   Wed Jun 25 14:05:17 2008 -0700

    IPoIB/cm: Drain receive QPs one at a time
    
    The current handling of last WQE reached events is broken when
    multiple QPs are drained at once -- for example:
    
    1. Last WQE reached event for QP100: the QP100 context is added to
       flush_list and then moved to drain_list, and a drain WR is posted
       on QP100.
    
    2. Last WQE reached event for QP200: QP200 context is added into
       flush_list, but not drain_list since only one drain WR will be
       posted.
    
    3. Last WQE reached event for QP300: QP300 context is added into
       flush_list, but not drain_list since only one drain WR will be
       posted.
    
    So at this point, QP100 is on drain_list and QP200 and QP300 are on
    flush_list.
    
    4. QP100 drain WR CQE is handled: put QP100 into reap_list then call
       ipoib_cm_start_rx_drain(), which does post_send of QP200 drain WR,
       and moves both QP200 and QP300 from flush_list to drain_list.
    
    5. QP200 drain WR CQE is polled, which will move both QP200 and QP300
       from drain_list to reap_list.
    
    6. QP200 and QP300 are both freed from reap_list.
    
    7. A CQE for QP300 is seen, but QP300 context has been freed ---> crash.
    
    The fundamental problem is that a last WQE reached event does not mean
    all pending completions have been queued.
    
    Fix this by flushing QPs from the flush_list one at a time, so that we
    never free QPs until we really know that we are done with them.  This
    allows us to simplify the code by removing the drain_list completely
    (since we flush one at a time, the context to move to the reap_list when
    the drain WQE completes is always just the first entry in flush_list).
    
    Debugged-by: Shirley Ma <[EMAIL PROTECTED]>
    Fix-suggested-by: Eli Cohen <[EMAIL PROTECTED]>
    Signed-off-by: Roland Dreier <[EMAIL PROTECTED]>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 8754b36..c02e3b0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -224,8 +224,7 @@ struct ipoib_cm_dev_priv {
        struct ib_cm_id        *id;
        struct list_head        passive_ids;   /* state: LIVE */
        struct list_head        rx_error_list; /* state: ERROR */
-       struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
-       struct list_head        rx_drain_list; /* state: FLUSH, drain started */
+       struct list_head        rx_flush_list; /* state: FLUSHING or waiting for FLUSH */
        struct list_head        rx_reap_list;  /* state: FLUSH, drain done */
        struct work_struct      start_task;
        struct work_struct      reap_task;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 6223fc3..07939ed 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -208,21 +208,13 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
        struct ib_send_wr *bad_wr;
        struct ipoib_cm_rx *p;
 
-       /* We only reserved 1 extra slot in CQ for drain WRs, so
-        * make sure we have at most 1 outstanding WR. */
-       if (list_empty(&priv->cm.rx_flush_list) ||
-           !list_empty(&priv->cm.rx_drain_list))
-               return;
-
        /*
         * QPs on flush list are error state.  This way, a "flush
         * error" WC will be immediately generated for each WR we post.
         */
-       p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+       p = list_first_entry(&priv->cm.rx_flush_list, typeof(*p), list);
        if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
                ipoib_warn(priv, "failed to post drain wr\n");
-
-       list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
 }
 
 static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
@@ -235,9 +227,15 @@ static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
                return;
 
        spin_lock_irqsave(&priv->lock, flags);
-       list_move(&p->list, &priv->cm.rx_flush_list);
+
+       list_move_tail(&p->list, &priv->cm.rx_flush_list);
        p->state = IPOIB_CM_RX_FLUSH;
-       ipoib_cm_start_rx_drain(priv);
+
+       /* We only reserved 1 extra slot in CQ for drain WRs, so
+        * make sure we have at most 1 outstanding WR. */
+       if (list_is_singular(&priv->cm.rx_flush_list))
+               ipoib_cm_start_rx_drain(priv);
+
        spin_unlock_irqrestore(&priv->lock, flags);
 }
 
@@ -531,8 +529,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
        if (unlikely(wr_id >= ipoib_recvq_size)) {
                if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
                        spin_lock_irqsave(&priv->lock, flags);
-                       list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
-                       ipoib_cm_start_rx_drain(priv);
+                       list_move(priv->cm.rx_flush_list.next, &priv->cm.rx_reap_list);
+                       if (!list_empty(&priv->cm.rx_flush_list))
+                               ipoib_cm_start_rx_drain(priv);
                        queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
                        spin_unlock_irqrestore(&priv->lock, flags);
                } else
@@ -867,8 +866,7 @@ void ipoib_cm_dev_stop(struct net_device *dev)
        begin = jiffies;
 
        while (!list_empty(&priv->cm.rx_error_list) ||
-              !list_empty(&priv->cm.rx_flush_list) ||
-              !list_empty(&priv->cm.rx_drain_list)) {
+              !list_empty(&priv->cm.rx_flush_list)) {
                if (time_after(jiffies, begin + 5 * HZ)) {
                        ipoib_warn(priv, "RX drain timing out\n");
 
@@ -879,8 +877,6 @@ void ipoib_cm_dev_stop(struct net_device *dev)
                                         &priv->cm.rx_reap_list);
                        list_splice_init(&priv->cm.rx_error_list,
                                         &priv->cm.rx_reap_list);
-                       list_splice_init(&priv->cm.rx_drain_list,
-                                        &priv->cm.rx_reap_list);
                        break;
                }
                spin_unlock_irq(&priv->lock);
@@ -1472,7 +1468,6 @@ int ipoib_cm_dev_init(struct net_device *dev)
        INIT_LIST_HEAD(&priv->cm.start_list);
        INIT_LIST_HEAD(&priv->cm.rx_error_list);
        INIT_LIST_HEAD(&priv->cm.rx_flush_list);
-       INIT_LIST_HEAD(&priv->cm.rx_drain_list);
        INIT_LIST_HEAD(&priv->cm.rx_reap_list);
        INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
        INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
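
And just as a sanity check of the ordering, a small userspace model of the
scheme -- nothing in it is IPoIB or verbs code, the names
(last_wqe_reached(), drain_cqe(), start_rx_drain()) are made up to mirror
the patch, the flush list is a plain FIFO of QP numbers, and posting a
drain WR / seeing its completion are just printf()s.  It replays the
QP100/QP200/QP300 sequence from the changelog:

/* Toy userspace model of draining receive QPs one at a time. */
#include <stdio.h>

#define MAX_QPS 8

static int flush_list[MAX_QPS];
static int flush_len;

/* Post the single drain WR on the QP at the head of the flush list. */
static void start_rx_drain(void)
{
        printf("  post drain WR on QP%d\n", flush_list[0]);
}

/* Last WQE reached event: queue the QP; only post a drain WR if it is
 * the only queued QP, i.e. no drain WR is currently outstanding. */
static void last_wqe_reached(int qpn)
{
        if (flush_len >= MAX_QPS)
                return;
        printf("last WQE reached on QP%d\n", qpn);
        flush_list[flush_len++] = qpn;
        if (flush_len == 1)
                start_rx_drain();
}

/* Drain WR completion: the head QP is now fully drained, so it can be
 * reaped; if more QPs are waiting, drain the next one. */
static void drain_cqe(void)
{
        int qpn, i;

        if (!flush_len)
                return;
        qpn = flush_list[0];
        printf("drain CQE seen, reap QP%d\n", qpn);
        for (i = 1; i < flush_len; i++)
                flush_list[i - 1] = flush_list[i];
        flush_len--;

        if (flush_len)
                start_rx_drain();
}

int main(void)
{
        /* Replay the scenario from the changelog. */
        last_wqe_reached(100);
        last_wqe_reached(200);
        last_wqe_reached(300);
        drain_cqe();    /* QP100 reaped, QP200 drain posted */
        drain_cqe();    /* QP200 reaped, QP300 drain posted */
        drain_cqe();    /* QP300 reaped only after its own drain completes */
        return 0;
}

Each context only reaches the reap step after its own drain completion has
been seen, which is the whole point of draining the QPs one at a time.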