[ewg] Re: [ofa-general] Re: Continue of "defer skb_orphan() until irqs enabled"

2008-09-26 Thread Roland Dreier
 > We'll give it a test ASAP. (Actually, we'll need to port the patch
 > to OFED 1.3.1 first, since that's what we're using/shipping.)

Thanks!  Let me know how it goes.

 - R.


Re: [ewg] Dependencies in OpenSM libraries

2008-09-26 Thread Sasha Khapyorsky
Hi John,

On 14:12 Mon 22 Sep, John Russo wrote:
> A while back we discussed various dependencies on opensm libraries which
> were going to become "non-public".
> 
> Was that intended to occur in 1.4 or later versions of OFED?  In 1.4 I
> noticed that ibutils and infiniband-diags still have a dependency on
> libopensm.so and other opensm-libs-supplied files.  Is this a mistake or
> something planned to be resolved in a later release of OFED (1.5, etc.)?

I don't think it is a mistake - what is the issue with infiniband-diags
using opensm-libs?

Sasha


[ewg] Re: Continue of "defer skb_orphan() until irqs enabled"

2008-09-26 Thread akepner
On Fri, Sep 26, 2008 at 01:19:00PM -0700, Roland Dreier wrote:
> How about this?  Instead of trying to rely on some complicated and
> fragile reasoning about when some race might occur, let's just do what
> we want to do anyway and get rid of LLTX.  We change from priv->tx_lock
> (taken with IRQs disabled) to netif_tx_lock (taken with BHs disabled).
> And then we can keep the skb_orphan() call where it is, since our xmit
> routine runs with IRQs enabled.
> ...

Thanks for doing this work, Roland.

We'll give it a test ASAP. (Actually, we'll need to port the patch
to OFED 1.3.1 first, since that's what we're using/shipping.)

-- 
Arthur



[ewg] Re: Continue of "defer skb_orphan() until irqs enabled"

2008-09-26 Thread Roland Dreier
How about this?  Instead of trying to rely on some complicated and
fragile reasoning about when some race might occur, let's just do what
we want to do anyway and get rid of LLTX.  We change from priv->tx_lock
(taken with IRQs disabled) to netif_tx_lock (taken with BHs disabled).
And then we can keep the skb_orphan() call where it is, since our xmit
routine runs with IRQs enabled.

Most of this patch is just compensating for the fact that the tx_lock
regions now run with IRQs enabled, so places that take priv->lock inside
them have to disable IRQs themselves.

If we could change ipoib_cm_rx_event_handler to not need priv->lock,
then we could change priv->lock to a BH-disabling lock too and simplify
things a bit further.
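
To make the shape of the change concrete, here is a minimal before/after
sketch of the TX completion path locking (illustrative only; the real
hunks are in the diff below):

    /* before: LLTX, driver-private lock taken with IRQs disabled */
    spin_lock_irqsave(&priv->tx_lock, flags);
    /* ... completion bookkeeping, possibly netif_wake_queue() ... */
    spin_unlock_irqrestore(&priv->tx_lock, flags);

    /* after: core TX lock, taken with IRQs still enabled, so the
     * skb_orphan() call in our xmit routine can stay where it is.
     * Anyone taking the inner priv->lock must now disable IRQs. */
    netif_tx_lock(dev);
    spin_lock_irqsave(&priv->lock, flags);
    /* ... */
    spin_unlock_irqrestore(&priv->lock, flags);
    netif_tx_unlock(dev);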

I've done some testing of this patch in both datagram and connected mode
with a kernel with lockdep and other debugging enabled, so it is at least
somewhat sane.  However, more stress testing would definitely be helpful
if we want to put this in 2.6.28.  It would also be interesting to see
if there are any performance effects.

Thanks,
  Roland

---

 drivers/infiniband/ulp/ipoib/ipoib.h   |8 +--
 drivers/infiniband/ulp/ipoib/ipoib_cm.c|   88 ++--
 drivers/infiniband/ulp/ipoib/ipoib_ib.c|   30 ++--
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |   68 ---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   31 -
 5 files changed, 118 insertions(+), 107 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 05eb41b..68ba5c3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -268,10 +268,9 @@ struct ipoib_lro {
 };
 
 /*
- * Device private locking: tx_lock protects members used in TX fast
- * path (and we use LLTX so upper layers don't do extra locking).
- * lock protects everything else.  lock nests inside of tx_lock (ie
- * tx_lock must be acquired first if needed).
+ * Device private locking: network stack tx_lock protects members used
+ * in TX fast path, lock protects everything else.  lock nests inside
+ * of tx_lock (ie tx_lock must be acquired first if needed).
  */
 struct ipoib_dev_priv {
spinlock_t lock;
@@ -320,7 +319,6 @@ struct ipoib_dev_priv {
 
struct ipoib_rx_buf *rx_ring;
 
-   spinlock_t   tx_lock;
struct ipoib_tx_buf *tx_ring;
unsigned tx_head;
unsigned tx_tail;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 341ffed..7b14c2c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -786,7 +786,8 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
dev_kfree_skb_any(tx_req->skb);
 
-   spin_lock_irqsave(&priv->tx_lock, flags);
+   netif_tx_lock(dev);
+
++tx->tx_tail;
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
netif_queue_stopped(dev) &&
@@ -801,7 +802,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
   "(status=%d, wrid=%d vend_err %x)\n",
   wc->status, wr_id, wc->vendor_err);
 
-   spin_lock(&priv->lock);
+   spin_lock_irqsave(&priv->lock, flags);
neigh = tx->neigh;
 
if (neigh) {
@@ -821,10 +822,10 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
 
-   spin_unlock(&priv->lock);
+   spin_unlock_irqrestore(&priv->lock, flags);
}
 
-   spin_unlock_irqrestore(&priv->tx_lock, flags);
+   netif_tx_unlock(dev);
 }
 
 int ipoib_cm_dev_open(struct net_device *dev)
@@ -1149,7 +1150,6 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
struct ipoib_dev_priv *priv = netdev_priv(p->dev);
struct ipoib_cm_tx_buf *tx_req;
-   unsigned long flags;
unsigned long begin;
 
ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
@@ -1180,12 +1180,12 @@ timeout:
DMA_TO_DEVICE);
dev_kfree_skb_any(tx_req->skb);
++p->tx_tail;
-   spin_lock_irqsave(&priv->tx_lock, flags);
+   netif_tx_lock_bh(p->dev);
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
netif_queue_stopped(p->dev) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
netif_wake_queue(p->dev);
-   spin_unlock_irqrestore(&priv->tx_lock, flags);
+   netif_tx_unlock_bh(p->dev);
}
 
if (p->qp)
@@ -1202,6 +1202,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
struct net_device *dev = priv->dev;
struct ipoib_

[ewg] [PATCH 1/1 OFED-1.4] IB/ehca: add flush cqes generation

2008-09-26 Thread Hoang-Nam Nguyen
IB/ehca: Add flush error cqe generation

Signed-off-by: Alexander Schmidt <[EMAIL PROTECTED]>
---
 .../fixes/ehca_0100_flush_error_cqe.patch  |  748 
 1 files changed, 748 insertions(+), 0 deletions(-)
 create mode 100644 kernel_patches/fixes/ehca_0100_flush_error_cqe.patch

diff --git a/kernel_patches/fixes/ehca_0100_flush_error_cqe.patch b/kernel_patches/fixes/ehca_0100_flush_error_cqe.patch
new file mode 100644
index 0000000..fff5418
--- /dev/null
+++ b/kernel_patches/fixes/ehca_0100_flush_error_cqe.patch
@@ -0,0 +1,748 @@
+When a QP goes into error state, it is required that flush CQEs are
+delivered to the application for any outstanding work requests. eHCA does not
+do this in hardware, so this patch adds software flush CQE generation to the
+ehca driver.
+
+Whenever a QP gets into error state, it is added to the QP error list of its
+respective CQ. If the error QP list of a CQ is not empty, poll_cq()
+generates flush CQEs before polling the actual CQ.
+
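+A rough sketch of the resulting poll path (illustrative pseudocode
+only; the helper names below are hypothetical, not the actual driver
+functions):
+
+	/* generate_flush_cqes() and poll_hardware_cq() are placeholder
+	 * names for this sketch.  Drain software flush CQEs first: */
+	if (!list_empty(&my_cq->sqp_err_list) ||
+	    !list_empty(&my_cq->rqp_err_list))
+		nr = generate_flush_cqes(my_cq, wc, num_entries);
+
+	if (nr < num_entries)
+		nr += poll_hardware_cq(my_cq, wc + nr, num_entries - nr);
+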
+Signed-off-by: Alexander Schmidt <[EMAIL PROTECTED]>
+---
+Applies on top of 2.6.27-rc3, please consider this for 2.6.28.
+
+ drivers/infiniband/hw/ehca/ehca_classes.h |   14 +
+ drivers/infiniband/hw/ehca/ehca_cq.c  |3 
+ drivers/infiniband/hw/ehca/ehca_iverbs.h  |2 
+ drivers/infiniband/hw/ehca/ehca_qp.c  |  225 --
+ drivers/infiniband/hw/ehca/ehca_reqs.c|  211 
+ 5 files changed, 412 insertions(+), 43 deletions(-)
+
+--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_classes.h
++++ infiniband.git/drivers/infiniband/hw/ehca/ehca_classes.h
+@@ -164,6 +164,13 @@ struct ehca_qmap_entry {
+   u16 reported;
+ };
+ 
++struct ehca_queue_map {
++  struct ehca_qmap_entry *map;
++  unsigned int entries;
++  unsigned int tail;
++  unsigned int left_to_poll;
++};
++
+ struct ehca_qp {
+   union {
+   struct ib_qp ib_qp;
+@@ -173,8 +180,9 @@ struct ehca_qp {
+   enum ehca_ext_qp_type ext_type;
+   enum ib_qp_state state;
+   struct ipz_queue ipz_squeue;
+-  struct ehca_qmap_entry *sq_map;
++  struct ehca_queue_map sq_map;
+   struct ipz_queue ipz_rqueue;
++  struct ehca_queue_map rq_map;
+   struct h_galpas galpas;
+   u32 qkey;
+   u32 real_qp_num;
+@@ -204,6 +212,8 @@ struct ehca_qp {
+   atomic_t nr_events; /* events seen */
+   wait_queue_head_t wait_completion;
+   int mig_armed;
++  struct list_head sq_err_node;
++  struct list_head rq_err_node;
+ };
+ 
+ #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
+@@ -233,6 +243,8 @@ struct ehca_cq {
+   /* mmap counter for resources mapped into user space */
+   u32 mm_count_queue;
+   u32 mm_count_galpa;
++  struct list_head sqp_err_list;
++  struct list_head rqp_err_list;
+ };
+ 
+ enum ehca_mr_flag {
+--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_reqs.c
++++ infiniband.git/drivers/infiniband/hw/ehca/ehca_reqs.c
+@@ -53,9 +53,25 @@
+ /* in RC traffic, insert an empty RDMA READ every this many packets */
+ #define ACK_CIRC_THRESHOLD 200
+ 
++static u64 replace_wr_id(u64 wr_id, u16 idx)
++{
++  u64 ret;
++
++  ret = wr_id & ~QMAP_IDX_MASK;
++  ret |= idx & QMAP_IDX_MASK;
++
++  return ret;
++}
++
++static u16 get_app_wr_id(u64 wr_id)
++{
++  return wr_id & QMAP_IDX_MASK;
++}
++
+ static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
+ struct ehca_wqe *wqe_p,
+-struct ib_recv_wr *recv_wr)
++struct ib_recv_wr *recv_wr,
++u32 rq_map_idx)
+ {
+   u8 cnt_ds;
+   if (unlikely((recv_wr->num_sge < 0) ||
+@@ -69,7 +85,7 @@ static inline int ehca_write_rwqe(struct
+   /* clear wqe header until sglist */
+   memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+ 
+-  wqe_p->work_request_id = recv_wr->wr_id;
++  wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
+   wqe_p->nr_of_data_seg = recv_wr->num_sge;
+ 
+   for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
+@@ -146,6 +162,7 @@ static inline int ehca_write_swqe(struct
+   u64 dma_length;
+   struct ehca_av *my_av;
+   u32 remote_qkey = send_wr->wr.ud.remote_qkey;
++  struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
+ 
+   if (unlikely((send_wr->num_sge < 0) ||
+(send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
+@@ -158,11 +175,10 @@ static inline int ehca_write_swqe(struct
+   /* clear wqe header until sglist */
+   memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+ 
+-  wqe_p->work_request_id = send_wr->wr_id & ~QMAP_IDX_MASK;
+-  wqe_p->work_request_id |= sq_map_idx & QMAP_IDX_MASK;
++  wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
+ 
+-  qp->sq_map[sq_map_idx].app_wr_id = send_wr->wr_id & QMAP_IDX_

[ewg] [PATCH 0/1 OFED-1.4] IB/ehca: Bug #1128

2008-09-26 Thread Hoang-Nam Nguyen
Hi Vlad and Tziporet!
I'm sending you a patch for ehca that fixes bugzilla #1128,
"release IPOIB-CM QP resources in flushing CQE context".
This patch has been posted on LKML (http://lkml.org/lkml/2008/9/10/169)
and accepted into the mainline kernel. It should apply cleanly against
Vlad's git tree ofed-1.4.

Thanks!

Nam

