Re: [PATCH 14/24] libfc: quarantine timed out xids

2016-10-14 Thread Bart Van Assche

On 10/13/2016 06:10 AM, Hannes Reinecke wrote:

When a sequence times out we have no idea what happened to the
frame. And we do not know if we will ever receive the frame.
Hence we cannot re-use the xid as we would risk data corruption
if the xid had been re-used and the timed out frame would be
received after that.
So we need to quarantine the xid until the lport is reset.
Yes, I know this will (eventually) deplete the xid pool.
But for now it's the safest method.


Reviewed-by: Bart Van Assche 
--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 14/24] libfc: quarantine timed out xids

2016-10-13 Thread Hannes Reinecke
When a sequence times out we have no idea what happened to the
frame. And we do not know if we will ever receive the frame.
Hence we cannot re-use the xid as we would risk data corruption
if the xid had been re-used and the timed out frame would be
received after that.
So we need to quarantine the xid until the lport is reset.
Yes, I know this will (eventually) deplete the xid pool.
But for now it's the safest method.

Signed-off-by: Hannes Reinecke 
---
 drivers/scsi/libfc/fc_exch.c | 33 ++---
 drivers/scsi/libfc/fc_fcp.c  | 13 +++--
 include/scsi/libfc.h |  1 +
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
index e8784a7..5868824 100644
--- a/drivers/scsi/libfc/fc_exch.c
+++ b/drivers/scsi/libfc/fc_exch.c
@@ -94,6 +94,7 @@ struct fc_exch_pool {
 struct fc_exch_mgr {
struct fc_exch_pool __percpu *pool;
mempool_t   *ep_pool;
+   struct fc_lport *lport;
enum fc_class   class;
struct kref kref;
u16 min_xid;
@@ -408,6 +409,8 @@ static int fc_exch_done_locked(struct fc_exch *ep)
return rc;
 }
 
+static struct fc_exch fc_quarantine_exch;
+
 /**
  * fc_exch_ptr_get() - Return an exchange from an exchange pool
  * @pool:  Exchange Pool to get an exchange from
@@ -452,14 +455,17 @@ static void fc_exch_delete(struct fc_exch *ep)
 
/* update cache of free slot */
index = (ep->xid - ep->em->min_xid) >> fc_cpu_order;
-   if (pool->left == FC_XID_UNKNOWN)
-   pool->left = index;
-   else if (pool->right == FC_XID_UNKNOWN)
-   pool->right = index;
-   else
-   pool->next_index = index;
-
-   fc_exch_ptr_set(pool, index, NULL);
+   if (!(ep->state & FC_EX_QUARANTINE)) {
+   if (pool->left == FC_XID_UNKNOWN)
+   pool->left = index;
+   else if (pool->right == FC_XID_UNKNOWN)
+   pool->right = index;
+   else
+   pool->next_index = index;
+   fc_exch_ptr_set(pool, index, NULL);
+   } else {
+   fc_exch_ptr_set(pool, index, &fc_quarantine_exch);
+   }
list_del(&ep->ex_list);
spin_unlock_bh(&pool->lock);
fc_exch_release(ep);/* drop hold for exch in mp */
@@ -916,14 +922,14 @@ static inline struct fc_exch *fc_exch_alloc(struct fc_lport *lport,
  */
 static struct fc_exch *fc_exch_find(struct fc_exch_mgr *mp, u16 xid)
 {
+   struct fc_lport *lport = mp->lport;
struct fc_exch_pool *pool;
struct fc_exch *ep = NULL;
u16 cpu = xid & fc_cpu_mask;
 
if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
-   printk_ratelimited(KERN_ERR
-   "libfc: lookup request for XID = %d, "
-   "indicates invalid CPU %d\n", xid, cpu);
+   pr_err("host%u: lport %6.6x: xid %d invalid CPU %d\n:",
+  lport->host->host_no, lport->port_id, xid, cpu);
return NULL;
}
 
@@ -931,6 +937,10 @@ static struct fc_exch *fc_exch_find(struct fc_exch_mgr *mp, u16 xid)
pool = per_cpu_ptr(mp->pool, cpu);
spin_lock_bh(&pool->lock);
ep = fc_exch_ptr_get(pool, (xid - mp->min_xid) >> fc_cpu_order);
+   if (ep == &fc_quarantine_exch) {
+   FC_LPORT_DBG(lport, "xid %x quarantined\n", xid);
+   ep = NULL;
+   }
if (ep) {
WARN_ON(ep->xid != xid);
fc_exch_hold(ep);
@@ -2429,6 +2439,7 @@ struct fc_exch_mgr *fc_exch_mgr_alloc(struct fc_lport *lport,
return NULL;
 
mp->class = class;
+   mp->lport = lport;
/* adjust em exch xid range for offload */
mp->min_xid = min_xid;
 
diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c
index f7700cc..780d9f0 100644
--- a/drivers/scsi/libfc/fc_fcp.c
+++ b/drivers/scsi/libfc/fc_fcp.c
@@ -1529,13 +1529,14 @@ static void fc_fcp_rec_resp(struct fc_seq *seq, struct fc_frame *fp, void *arg)
   fsp->rport->port_id, rjt->er_reason,
   rjt->er_explan);
/*
-* If no data transfer, the command frame got dropped
-* so we just retry.  If data was transferred, we
-* lost the response but the target has no record,
-* so we abort and retry.
+* If response got lost or is stuck in the
+* queue somewhere we have no idea if and when
+* the response will be received. So quarantine
+* the xid and retry the command.
 */
-   if (rjt->er_explan == ELS_EXPL_OXID_RXID &&