Re: [PATCH v3 28/32] cxlflash: Fix to avoid state change collision

2015-09-25 Thread Brian King
On 09/24/2015 02:42 PM, Matthew R. Ochs wrote:
> diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
> index ab11ee6..325ba31 100644
> --- a/drivers/scsi/cxlflash/main.c
> +++ b/drivers/scsi/cxlflash/main.c
> @@ -496,6 +496,7 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, 
> struct scsi_cmnd *scp)
>   struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)host->hostdata;
>   struct afu *afu = cfg->afu;
>   struct device *dev = >dev->dev;
> + enum cxlflash_state state;
>   struct afu_cmd *cmd;
>   u32 port_sel = scp->device->channel + 1;
>   int nseg, i, ncount;
> @@ -525,7 +526,11 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, 
> struct scsi_cmnd *scp)
>   }
>   spin_unlock_irqrestore(>tmf_slock, lock_flags);
> 
> - switch (cfg->state) {
> + mutex_lock(>mutex);
> + state = cfg->state;
> + mutex_unlock(>mutex);

You can't grab a mutex in queuecommand, since it can sleep and queuecommand can 
be called from soft irq context.

Also, I'm not sure what to think about this patch in general. Obviously state 
can change immediately after
you drop the mutex, and according to Documentation/memory-barriers.txt, memory 
operations after the unlock can
occur before the unlock occurs. Is this a problem?

> +
> + switch (state) {
>   case STATE_RESET:
>   dev_dbg_ratelimited(dev, "%s: device is in reset!\n", __func__);
>   rc = SCSI_MLQUEUE_HOST_BUSY;


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 28/32] cxlflash: Fix to avoid state change collision

2015-09-25 Thread Matthew R. Ochs
> On Sep 25, 2015, at 4:10 PM, Brian King  wrote:
> On 09/24/2015 02:42 PM, Matthew R. Ochs wrote:
>> diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
>> index ab11ee6..325ba31 100644
>> --- a/drivers/scsi/cxlflash/main.c
>> +++ b/drivers/scsi/cxlflash/main.c
>> @@ -496,6 +496,7 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, 
>> struct scsi_cmnd *scp)
>>  struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)host->hostdata;
>>  struct afu *afu = cfg->afu;
>>  struct device *dev = >dev->dev;
>> +enum cxlflash_state state;
>>  struct afu_cmd *cmd;
>>  u32 port_sel = scp->device->channel + 1;
>>  int nseg, i, ncount;
>> @@ -525,7 +526,11 @@ static int cxlflash_queuecommand(struct Scsi_Host 
>> *host, struct scsi_cmnd *scp)
>>  }
>>  spin_unlock_irqrestore(>tmf_slock, lock_flags);
>> 
>> -switch (cfg->state) {
>> +mutex_lock(>mutex);
>> +state = cfg->state;
>> +mutex_unlock(>mutex);
> 
> You can't grab a mutex in queuecommand, since it can sleep and queuecommand 
> can be called from soft irq context.
> 
> Also, I'm not sure what to think about this patch in general. Obviously state 
> can change immediately after
> you drop the mutex, and according to Documentation/memory-barriers.txt, 
> memory operations after the unlock can
> occur before the unlock occurs. Is this a problem?

You bring up some good points.

I'm going to remove this patch from the set and rework it.


-matt
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v3 28/32] cxlflash: Fix to avoid state change collision

2015-09-24 Thread Matthew R. Ochs
The adapter state machine is susceptible to missing and/or
corrupting state updates at runtime. This can lead to a variety
of unintended issues and is due to the lack of a serialization
mechanism to protect the adapter state.

Use an adapter-wide mutex to serialize state changes.

Signed-off-by: Matthew R. Ochs 
Signed-off-by: Manoj N. Kumar 

Conflicts:
drivers/scsi/cxlflash/main.c
---
 drivers/scsi/cxlflash/common.h|  1 +
 drivers/scsi/cxlflash/main.c  | 48 ++-
 drivers/scsi/cxlflash/superpipe.c |  7 +-
 3 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
index bbfe711..89c82d2 100644
--- a/drivers/scsi/cxlflash/common.h
+++ b/drivers/scsi/cxlflash/common.h
@@ -127,6 +127,7 @@ struct cxlflash_cfg {
bool tmf_active;
wait_queue_head_t reset_waitq;
enum cxlflash_state state;
+   struct mutex mutex;
 };
 
 struct afu_cmd {
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index ab11ee6..325ba31 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -496,6 +496,7 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scp)
struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)host->hostdata;
struct afu *afu = cfg->afu;
struct device *dev = >dev->dev;
+   enum cxlflash_state state;
struct afu_cmd *cmd;
u32 port_sel = scp->device->channel + 1;
int nseg, i, ncount;
@@ -525,7 +526,11 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, 
struct scsi_cmnd *scp)
}
spin_unlock_irqrestore(>tmf_slock, lock_flags);
 
-   switch (cfg->state) {
+   mutex_lock(>mutex);
+   state = cfg->state;
+   mutex_unlock(>mutex);
+
+   switch (state) {
case STATE_RESET:
dev_dbg_ratelimited(dev, "%s: device is in reset!\n", __func__);
rc = SCSI_MLQUEUE_HOST_BUSY;
@@ -722,7 +727,9 @@ static void cxlflash_remove(struct pci_dev *pdev)
  cfg->tmf_slock);
spin_unlock_irqrestore(>tmf_slock, lock_flags);
 
+   mutex_lock(>mutex);
cfg->state = STATE_FAILTERM;
+   mutex_unlock(>mutex);
cxlflash_stop_term_user_contexts(cfg);
 
switch (cfg->init_state) {
@@ -1776,9 +1783,10 @@ err1:
  * @mode:  Type of sync to issue (lightweight, heavyweight, global).
  *
  * The AFU can only take 1 sync command at a time. This routine enforces this
- * limitation by using a mutex to provide exclusive access to the AFU during
- * the sync. This design point requires calling threads to not be on interrupt
- * context due to the possibility of sleeping during concurrent sync 
operations.
+ * limitation by holding the adapter mutex across the entirety of the function
+ * to provide exclusive access to the AFU during the sync. This design point
+ * requires calling threads to not be on interrupt context due to the
+ * possibility of sleeping during concurrent sync operations.
  *
  * AFU sync operations are only necessary and allowed when the device is
  * operating normally. When not operating normally, sync requests can occur as
@@ -1798,14 +1806,13 @@ int cxlflash_afu_sync(struct afu *afu, ctx_hndl_t 
ctx_hndl_u,
struct afu_cmd *cmd = NULL;
int rc = 0;
int retry_cnt = 0;
-   static DEFINE_MUTEX(sync_active);
 
+   mutex_lock(>mutex);
if (cfg->state != STATE_NORMAL) {
pr_debug("%s: Sync not required! (%u)\n", __func__, cfg->state);
-   return 0;
+   goto out;
}
 
-   mutex_lock(_active);
 retry:
cmd = cmd_checkout(afu);
if (unlikely(!cmd)) {
@@ -1847,7 +1854,7 @@ retry:
 (cmd->sa.host_use_b[0] & B_ERROR)))
rc = -1;
 out:
-   mutex_unlock(_active);
+   mutex_unlock(>mutex);
if (cmd)
cmd_checkin(cmd);
pr_debug("%s: returning rc=%d\n", __func__, rc);
@@ -1889,6 +1896,7 @@ static int cxlflash_eh_device_reset_handler(struct 
scsi_cmnd *scp)
struct Scsi_Host *host = scp->device->host;
struct cxlflash_cfg *cfg = (struct cxlflash_cfg *)host->hostdata;
struct afu *afu = cfg->afu;
+   enum cxlflash_state state;
int rcr = 0;
 
pr_debug("%s: (scp=%p) %d/%d/%d/%llu "
@@ -1901,7 +1909,11 @@ static int cxlflash_eh_device_reset_handler(struct 
scsi_cmnd *scp)
 get_unaligned_be32(&((u32 *)scp->cmnd)[3]));
 
 retry:
-   switch (cfg->state) {
+   mutex_lock(>mutex);
+   state = cfg->state;
+   mutex_unlock(>mutex);
+
+   switch (state) {
case STATE_NORMAL:
rcr = send_tmf(afu, scp, TMF_LUN_RESET);
if (unlikely(rcr))
@@ -1943,6 +1955,7 @@ static int cxlflash_eh_host_reset_handler(struct