Re: [PATCH v4 2/2] scsi: ufs: handle LINERESET with correct tm_cmd

2021-01-07 Thread Jaegeuk Kim
On 01/07, Can Guo wrote:
> On 2021-01-07 16:46, Jaegeuk Kim wrote:
> > On 01/07, Can Guo wrote:
> > > On 2021-01-07 16:07, Jaegeuk Kim wrote:
> > > > On 01/07, Can Guo wrote:
> > > > > On 2021-01-07 15:47, Jaegeuk Kim wrote:
> > > > > > From: Jaegeuk Kim 
> > > > > >
> > > > > > This fixes a warning caused by wrong reserve tag usage in
> > > > > > __ufshcd_issue_tm_cmd.
> > > > > >
> > > > > > WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 
> > > > > > blk_get_request+0x68/0x70
> > > > > > WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82
> > > > > > blk_mq_get_tag+0x438/0x46c
> > > > > >
> > > > > > And, in ufshcd_err_handler(), we can avoid to send tm_cmd before
> > > > > > aborting
> > > > > > outstanding commands by waiting a bit for IO completion like this.
> > > > > >
> > > > > > __ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out
> > > > > >
> > > > > > Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to
> > > > > > allocate and free TMFs")
> > > > > > Fixes: 2355b66ed20c ("scsi: ufs: Handle LINERESET indication in err
> > > > > > handler")
> > > > >
> > > > > Hi Jaegeuk,
> > > > >
> > > > > Sorry, what is wrong with commit 2355b66ed20c? Clearing pending I/O
> > > > > reqs is a general procedure for handling all non-fatal errors.
> > > >
> > > > Without waiting IOs, I hit the below timeout all the time from
> > > > LINERESET, which
> > > > causes UFS stuck permanently, as mentioned in the description.
> > > >
> > > > "__ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out"
> > > 
> > > In that case, ufshcd_try_to_abort_task(), the caller of
> > > __ufshcd_issue_tm_cmd(),
> > > should return -ETIMEOUT, then err_handler would jump to do a full
> > > reset,
> > > then bail.
> > > I am not sure what gets UFS stuck permanently. Could you please
> > > share the
> > > callstack
> > > if possible? I really want to know what is happening. Thanks.
> > 
> > I can't share all the log tho, it entered full reset. While printing out
> > whole registers, the device was hard reset. Thanks,
> 
> Hi Jaegeuk,
> 
> Entering full reset is expected in this case, which is why I am saying
> line-reset handling logic should not be penalized. I think we need to
> find out what caused the hard reset but not just adding a delay before
> clearing pending reqs, because let's say 3 sec expires and you hit the
> same tm req timeout (maybe with a lower possibility), you may still end
> up same at the hard reset. You don't need to share all the log, just the
> last call stacks before hard reset. Is it a QCOM's platform used in your
> case? Can you check the log/dump if NoC error happened?

Hi Can,

I figured out it is caused by verbose kernel logs printed in terminal.
I posted v5, so could you please review it?

Thanks,

> 
> Thanks.
> Can Guo.
> 
> > 
> > > 
> > > Regards,
> > > Can Guo.
> > > 
> > > >
> > > > >
> > > > > Thanks,
> > > > > Can Guo.
> > > > >
> > > > > > Signed-off-by: Jaegeuk Kim 
> > > > > > ---
> > > > > >  drivers/scsi/ufs/ufshcd.c | 35 +++
> > > > > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> > > > > > index e6e7bdf99cd7..340dd5e515dd 100644
> > > > > > --- a/drivers/scsi/ufs/ufshcd.c
> > > > > > +++ b/drivers/scsi/ufs/ufshcd.c
> > > > > > @@ -44,6 +44,9 @@
> > > > > >  /* Query request timeout */
> > > > > >  #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */
> > > > > >
> > > > > > +/* LINERESET TIME OUT */
> > > > > > +#define LINERESET_IO_TIMEOUT_MS(3) /* 30 
> > > > > > sec */
> > > > > > +
> > > > > >  /* Task management command timeout */
> > > > > >  #define TM_CMD_TIMEOUT 100 /* msecs */
> > > > > >
> > > > > > @@ -5826,6 +5829,7 @@ static void ufshcd_err_handler(struct 
> > > > > > work_struct
> > > > > > *work)
> > > > > > int err = 0, pmc_err;
> > > > > > int tag;
> > > > > > bool needs_reset = false, needs_restore = false;
> > > > > > +   ktime_t start;
> > > > > >
> > > > > > hba = container_of(work, struct ufs_hba, eh_work);
> > > > > >
> > > > > > @@ -5911,6 +5915,22 @@ static void ufshcd_err_handler(struct 
> > > > > > work_struct
> > > > > > *work)
> > > > > > }
> > > > > >
> > > > > > hba->silence_err_logs = true;
> > > > > > +
> > > > > > +   /* Wait for IO completion for non-fatal errors to avoid 
> > > > > > aborting IOs
> > > > > > */
> > > > > > +   start = ktime_get();
> > > > > > +   while (hba->outstanding_reqs) {
> > > > > > +   ufshcd_complete_requests(hba);
> > > > > > +   spin_unlock_irqrestore(hba->host->host_lock, flags);
> > > > > > +   schedule();
> > > > > > +   spin_lock_irqsave(hba->host->host_lock, flags);
> > > > > > +   if (ktime_to_ms(ktime_sub(ktime_get(), start)) >
> > > > > > +   
> > > > > > LINERESET_IO_TIMEOUT_MS) {
> > > > > > +   dev_err(hba->dev, "%s: timeout, 
> > > 

Re: [PATCH v4 2/2] scsi: ufs: handle LINERESET with correct tm_cmd

2021-01-07 Thread Can Guo

On 2021-01-07 16:58, Can Guo wrote:

On 2021-01-07 16:46, Jaegeuk Kim wrote:

On 01/07, Can Guo wrote:

On 2021-01-07 16:07, Jaegeuk Kim wrote:
> On 01/07, Can Guo wrote:
> > On 2021-01-07 15:47, Jaegeuk Kim wrote:
> > > From: Jaegeuk Kim 
> > >
> > > This fixes a warning caused by wrong reserve tag usage in
> > > __ufshcd_issue_tm_cmd.
> > >
> > > WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 blk_get_request+0x68/0x70
> > > WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82
> > > blk_mq_get_tag+0x438/0x46c
> > >
> > > And, in ufshcd_err_handler(), we can avoid to send tm_cmd before
> > > aborting
> > > outstanding commands by waiting a bit for IO completion like this.
> > >
> > > __ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out
> > >
> > > Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to
> > > allocate and free TMFs")
> > > Fixes: 2355b66ed20c ("scsi: ufs: Handle LINERESET indication in err
> > > handler")
> >
> > Hi Jaegeuk,
> >
> > Sorry, what is wrong with commit 2355b66ed20c? Clearing pending I/O
> > reqs is a general procedure for handling all non-fatal errors.
>
> Without waiting IOs, I hit the below timeout all the time from
> LINERESET, which
> causes UFS stuck permanently, as mentioned in the description.
>
> "__ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out"

In that case, ufshcd_try_to_abort_task(), the caller of
__ufshcd_issue_tm_cmd(),
should return -ETIMEOUT, then err_handler would jump to do a full 
reset,

then bail.
I am not sure what gets UFS stuck permanently. Could you please share 
the

callstack
if possible? I really want to know what is happening. Thanks.


I can't share all the log tho, it entered full reset. While printing 
out

whole registers, the device was hard reset. Thanks,


Hi Jaegeuk,

Entering full reset is expected in this case, which is why I am saying
line-reset handling logic should not be penalized. I think we need to
find out what caused the hard reset but not just adding a delay before
clearing pending reqs, because let's say 3 sec expires and you hit the
same tm req timeout (maybe with a lower possibility), you may still end
up same at the hard reset. You don't need to share all the log, just 
the
last call stacks before hard reset. Is it a QCOM's platform used in 
your

case? Can you check the log/dump if NoC error happened?

Thanks.
Can Guo.



Hi Jaegeuk,

If it is QCOM's platform, what you described looks like a known issue
which we have already fixed in downstream. Please try attached patch.
If not, please ignore it.

Thanks,
Can Guo.





Regards,
Can Guo.

>
> >
> > Thanks,
> > Can Guo.
> >
> > > Signed-off-by: Jaegeuk Kim 
> > > ---
> > >  drivers/scsi/ufs/ufshcd.c | 35 +++
> > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> > > index e6e7bdf99cd7..340dd5e515dd 100644
> > > --- a/drivers/scsi/ufs/ufshcd.c
> > > +++ b/drivers/scsi/ufs/ufshcd.c
> > > @@ -44,6 +44,9 @@
> > >  /* Query request timeout */
> > >  #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */
> > >
> > > +/* LINERESET TIME OUT */
> > > +#define LINERESET_IO_TIMEOUT_MS (3) /* 30 sec */
> > > +
> > >  /* Task management command timeout */
> > >  #define TM_CMD_TIMEOUT  100 /* msecs */
> > >
> > > @@ -5826,6 +5829,7 @@ static void ufshcd_err_handler(struct work_struct
> > > *work)
> > >  int err = 0, pmc_err;
> > >  int tag;
> > >  bool needs_reset = false, needs_restore = false;
> > > +ktime_t start;
> > >
> > >  hba = container_of(work, struct ufs_hba, eh_work);
> > >
> > > @@ -5911,6 +5915,22 @@ static void ufshcd_err_handler(struct work_struct
> > > *work)
> > >  }
> > >
> > >  hba->silence_err_logs = true;
> > > +
> > > +/* Wait for IO completion for non-fatal errors to avoid aborting 
IOs
> > > */
> > > +start = ktime_get();
> > > +while (hba->outstanding_reqs) {
> > > +ufshcd_complete_requests(hba);
> > > +spin_unlock_irqrestore(hba->host->host_lock, flags);
> > > +schedule();
> > > +spin_lock_irqsave(hba->host->host_lock, flags);
> > > +if (ktime_to_ms(ktime_sub(ktime_get(), start)) >
> > > +LINERESET_IO_TIMEOUT_MS) 
{
> > > +dev_err(hba->dev, "%s: timeout, 
outstanding=0x%lx\n",
> > > +__func__, hba->outstanding_reqs);
> > > +break;
> > > +}
> > > +}
> > > +
> > >  /* release lock as clear command might sleep */
> > >  spin_unlock_irqrestore(hba->host->host_lock, flags);
> > >  /* Clear pending transfer requests */
> > > @@ -6302,9 +6322,13 @@ static irqreturn_t ufshcd_intr(int irq, void
> > > *__hba)
> > >  intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
> 

Re: [PATCH v4 2/2] scsi: ufs: handle LINERESET with correct tm_cmd

2021-01-07 Thread Can Guo

On 2021-01-07 16:46, Jaegeuk Kim wrote:

On 01/07, Can Guo wrote:

On 2021-01-07 16:07, Jaegeuk Kim wrote:
> On 01/07, Can Guo wrote:
> > On 2021-01-07 15:47, Jaegeuk Kim wrote:
> > > From: Jaegeuk Kim 
> > >
> > > This fixes a warning caused by wrong reserve tag usage in
> > > __ufshcd_issue_tm_cmd.
> > >
> > > WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 blk_get_request+0x68/0x70
> > > WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82
> > > blk_mq_get_tag+0x438/0x46c
> > >
> > > And, in ufshcd_err_handler(), we can avoid to send tm_cmd before
> > > aborting
> > > outstanding commands by waiting a bit for IO completion like this.
> > >
> > > __ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out
> > >
> > > Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to
> > > allocate and free TMFs")
> > > Fixes: 2355b66ed20c ("scsi: ufs: Handle LINERESET indication in err
> > > handler")
> >
> > Hi Jaegeuk,
> >
> > Sorry, what is wrong with commit 2355b66ed20c? Clearing pending I/O
> > reqs is a general procedure for handling all non-fatal errors.
>
> Without waiting IOs, I hit the below timeout all the time from
> LINERESET, which
> causes UFS stuck permanently, as mentioned in the description.
>
> "__ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out"

In that case, ufshcd_try_to_abort_task(), the caller of
__ufshcd_issue_tm_cmd(),
should return -ETIMEOUT, then err_handler would jump to do a full 
reset,

then bail.
I am not sure what gets UFS stuck permanently. Could you please share 
the

callstack
if possible? I really want to know what is happening. Thanks.


I can't share all the log tho, it entered full reset. While printing 
out

whole registers, the device was hard reset. Thanks,


Hi Jaegeuk,

Entering full reset is expected in this case, which is why I am saying
line-reset handling logic should not be penalized. I think we need to
find out what caused the hard reset but not just adding a delay before
clearing pending reqs, because let's say 3 sec expires and you hit the
same tm req timeout (maybe with a lower possibility), you may still end
up same at the hard reset. You don't need to share all the log, just the
last call stacks before hard reset. Is it a QCOM's platform used in your
case? Can you check the log/dump if NoC error happened?

Thanks.
Can Guo.





Regards,
Can Guo.

>
> >
> > Thanks,
> > Can Guo.
> >
> > > Signed-off-by: Jaegeuk Kim 
> > > ---
> > >  drivers/scsi/ufs/ufshcd.c | 35 +++
> > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> > > index e6e7bdf99cd7..340dd5e515dd 100644
> > > --- a/drivers/scsi/ufs/ufshcd.c
> > > +++ b/drivers/scsi/ufs/ufshcd.c
> > > @@ -44,6 +44,9 @@
> > >  /* Query request timeout */
> > >  #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */
> > >
> > > +/* LINERESET TIME OUT */
> > > +#define LINERESET_IO_TIMEOUT_MS (3) /* 30 sec */
> > > +
> > >  /* Task management command timeout */
> > >  #define TM_CMD_TIMEOUT  100 /* msecs */
> > >
> > > @@ -5826,6 +5829,7 @@ static void ufshcd_err_handler(struct work_struct
> > > *work)
> > >  int err = 0, pmc_err;
> > >  int tag;
> > >  bool needs_reset = false, needs_restore = false;
> > > +ktime_t start;
> > >
> > >  hba = container_of(work, struct ufs_hba, eh_work);
> > >
> > > @@ -5911,6 +5915,22 @@ static void ufshcd_err_handler(struct work_struct
> > > *work)
> > >  }
> > >
> > >  hba->silence_err_logs = true;
> > > +
> > > +/* Wait for IO completion for non-fatal errors to avoid aborting 
IOs
> > > */
> > > +start = ktime_get();
> > > +while (hba->outstanding_reqs) {
> > > +ufshcd_complete_requests(hba);
> > > +spin_unlock_irqrestore(hba->host->host_lock, flags);
> > > +schedule();
> > > +spin_lock_irqsave(hba->host->host_lock, flags);
> > > +if (ktime_to_ms(ktime_sub(ktime_get(), start)) >
> > > +LINERESET_IO_TIMEOUT_MS) 
{
> > > +dev_err(hba->dev, "%s: timeout, 
outstanding=0x%lx\n",
> > > +__func__, hba->outstanding_reqs);
> > > +break;
> > > +}
> > > +}
> > > +
> > >  /* release lock as clear command might sleep */
> > >  spin_unlock_irqrestore(hba->host->host_lock, flags);
> > >  /* Clear pending transfer requests */
> > > @@ -6302,9 +6322,13 @@ static irqreturn_t ufshcd_intr(int irq, void
> > > *__hba)
> > >  intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
> > >  }
> > >
> > > -if (enabled_intr_status && retval == IRQ_NONE) {
> > > -dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x\n",
> > > -__func__, intr_status);
> > > + 

Re: [PATCH v4 2/2] scsi: ufs: handle LINERESET with correct tm_cmd

2021-01-07 Thread Jaegeuk Kim
On 01/07, Can Guo wrote:
> On 2021-01-07 16:07, Jaegeuk Kim wrote:
> > On 01/07, Can Guo wrote:
> > > On 2021-01-07 15:47, Jaegeuk Kim wrote:
> > > > From: Jaegeuk Kim 
> > > >
> > > > This fixes a warning caused by wrong reserve tag usage in
> > > > __ufshcd_issue_tm_cmd.
> > > >
> > > > WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 blk_get_request+0x68/0x70
> > > > WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82
> > > > blk_mq_get_tag+0x438/0x46c
> > > >
> > > > And, in ufshcd_err_handler(), we can avoid to send tm_cmd before
> > > > aborting
> > > > outstanding commands by waiting a bit for IO completion like this.
> > > >
> > > > __ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out
> > > >
> > > > Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to
> > > > allocate and free TMFs")
> > > > Fixes: 2355b66ed20c ("scsi: ufs: Handle LINERESET indication in err
> > > > handler")
> > > 
> > > Hi Jaegeuk,
> > > 
> > > Sorry, what is wrong with commit 2355b66ed20c? Clearing pending I/O
> > > reqs is a general procedure for handling all non-fatal errors.
> > 
> > Without waiting IOs, I hit the below timeout all the time from
> > LINERESET, which
> > causes UFS stuck permanently, as mentioned in the description.
> > 
> > "__ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out"
> 
> In that case, ufshcd_try_to_abort_task(), the caller of
> __ufshcd_issue_tm_cmd(),
> should return -ETIMEOUT, then err_handler would jump to do a full reset,
> then bail.
> I am not sure what gets UFS stuck permanently. Could you please share the
> callstack
> if possible? I really want to know what is happening. Thanks.

I can't share all the log tho, it entered full reset. While printing out
whole registers, the device was hard reset. Thanks,

> 
> Regards,
> Can Guo.
> 
> > 
> > > 
> > > Thanks,
> > > Can Guo.
> > > 
> > > > Signed-off-by: Jaegeuk Kim 
> > > > ---
> > > >  drivers/scsi/ufs/ufshcd.c | 35 +++
> > > >  1 file changed, 31 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> > > > index e6e7bdf99cd7..340dd5e515dd 100644
> > > > --- a/drivers/scsi/ufs/ufshcd.c
> > > > +++ b/drivers/scsi/ufs/ufshcd.c
> > > > @@ -44,6 +44,9 @@
> > > >  /* Query request timeout */
> > > >  #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */
> > > >
> > > > +/* LINERESET TIME OUT */
> > > > +#define LINERESET_IO_TIMEOUT_MS(3) /* 30 
> > > > sec */
> > > > +
> > > >  /* Task management command timeout */
> > > >  #define TM_CMD_TIMEOUT 100 /* msecs */
> > > >
> > > > @@ -5826,6 +5829,7 @@ static void ufshcd_err_handler(struct work_struct
> > > > *work)
> > > > int err = 0, pmc_err;
> > > > int tag;
> > > > bool needs_reset = false, needs_restore = false;
> > > > +   ktime_t start;
> > > >
> > > > hba = container_of(work, struct ufs_hba, eh_work);
> > > >
> > > > @@ -5911,6 +5915,22 @@ static void ufshcd_err_handler(struct work_struct
> > > > *work)
> > > > }
> > > >
> > > > hba->silence_err_logs = true;
> > > > +
> > > > +   /* Wait for IO completion for non-fatal errors to avoid 
> > > > aborting IOs
> > > > */
> > > > +   start = ktime_get();
> > > > +   while (hba->outstanding_reqs) {
> > > > +   ufshcd_complete_requests(hba);
> > > > +   spin_unlock_irqrestore(hba->host->host_lock, flags);
> > > > +   schedule();
> > > > +   spin_lock_irqsave(hba->host->host_lock, flags);
> > > > +   if (ktime_to_ms(ktime_sub(ktime_get(), start)) >
> > > > +   
> > > > LINERESET_IO_TIMEOUT_MS) {
> > > > +   dev_err(hba->dev, "%s: timeout, 
> > > > outstanding=0x%lx\n",
> > > > +   __func__, 
> > > > hba->outstanding_reqs);
> > > > +   break;
> > > > +   }
> > > > +   }
> > > > +
> > > > /* release lock as clear command might sleep */
> > > > spin_unlock_irqrestore(hba->host->host_lock, flags);
> > > > /* Clear pending transfer requests */
> > > > @@ -6302,9 +6322,13 @@ static irqreturn_t ufshcd_intr(int irq, void
> > > > *__hba)
> > > > intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
> > > > }
> > > >
> > > > -   if (enabled_intr_status && retval == IRQ_NONE) {
> > > > -   dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x\n",
> > > > -   __func__, intr_status);
> > > > +   if (enabled_intr_status && retval == IRQ_NONE &&
> > > > +   !ufshcd_eh_in_progress(hba)) {
> > > > +   dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x 
> > > > (0x%08x,
> > > > 0x%08x)\n",
> > > > +   __func__,
> > > > +   intr_status,
> > > > +   

Re: [PATCH v4 2/2] scsi: ufs: handle LINERESET with correct tm_cmd

2021-01-07 Thread Can Guo

On 2021-01-07 16:07, Jaegeuk Kim wrote:

On 01/07, Can Guo wrote:

On 2021-01-07 15:47, Jaegeuk Kim wrote:
> From: Jaegeuk Kim 
>
> This fixes a warning caused by wrong reserve tag usage in
> __ufshcd_issue_tm_cmd.
>
> WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 blk_get_request+0x68/0x70
> WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82
> blk_mq_get_tag+0x438/0x46c
>
> And, in ufshcd_err_handler(), we can avoid to send tm_cmd before
> aborting
> outstanding commands by waiting a bit for IO completion like this.
>
> __ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out
>
> Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to
> allocate and free TMFs")
> Fixes: 2355b66ed20c ("scsi: ufs: Handle LINERESET indication in err
> handler")

Hi Jaegeuk,

Sorry, what is wrong with commit 2355b66ed20c? Clearing pending I/O
reqs is a general procedure for handling all non-fatal errors.


Without waiting IOs, I hit the below timeout all the time from 
LINERESET, which

causes UFS stuck permanently, as mentioned in the description.

"__ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out"


In that case, ufshcd_try_to_abort_task(), the caller of 
__ufshcd_issue_tm_cmd(),
should return -ETIMEOUT, then err_handler would jump to do a full reset, 
then bail.
I am not sure what gets UFS stuck permanently. Could you please share 
the callstack

if possible? I really want to know what is happening. Thanks.

Regards,
Can Guo.





Thanks,
Can Guo.

> Signed-off-by: Jaegeuk Kim 
> ---
>  drivers/scsi/ufs/ufshcd.c | 35 +++
>  1 file changed, 31 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index e6e7bdf99cd7..340dd5e515dd 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -44,6 +44,9 @@
>  /* Query request timeout */
>  #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */
>
> +/* LINERESET TIME OUT */
> +#define LINERESET_IO_TIMEOUT_MS   (3) /* 30 sec */
> +
>  /* Task management command timeout */
>  #define TM_CMD_TIMEOUT100 /* msecs */
>
> @@ -5826,6 +5829,7 @@ static void ufshcd_err_handler(struct work_struct
> *work)
>int err = 0, pmc_err;
>int tag;
>bool needs_reset = false, needs_restore = false;
> +  ktime_t start;
>
>hba = container_of(work, struct ufs_hba, eh_work);
>
> @@ -5911,6 +5915,22 @@ static void ufshcd_err_handler(struct work_struct
> *work)
>}
>
>hba->silence_err_logs = true;
> +
> +  /* Wait for IO completion for non-fatal errors to avoid aborting IOs
> */
> +  start = ktime_get();
> +  while (hba->outstanding_reqs) {
> +  ufshcd_complete_requests(hba);
> +  spin_unlock_irqrestore(hba->host->host_lock, flags);
> +  schedule();
> +  spin_lock_irqsave(hba->host->host_lock, flags);
> +  if (ktime_to_ms(ktime_sub(ktime_get(), start)) >
> +  LINERESET_IO_TIMEOUT_MS) {
> +  dev_err(hba->dev, "%s: timeout, outstanding=0x%lx\n",
> +  __func__, hba->outstanding_reqs);
> +  break;
> +  }
> +  }
> +
>/* release lock as clear command might sleep */
>spin_unlock_irqrestore(hba->host->host_lock, flags);
>/* Clear pending transfer requests */
> @@ -6302,9 +6322,13 @@ static irqreturn_t ufshcd_intr(int irq, void
> *__hba)
>intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
>}
>
> -  if (enabled_intr_status && retval == IRQ_NONE) {
> -  dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x\n",
> -  __func__, intr_status);
> +  if (enabled_intr_status && retval == IRQ_NONE &&
> +  !ufshcd_eh_in_progress(hba)) {
> +  dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x (0x%08x,
> 0x%08x)\n",
> +  __func__,
> +  intr_status,
> +  hba->ufs_stats.last_intr_status,
> +  enabled_intr_status);
>ufshcd_dump_regs(hba, 0, UFSHCI_REG_SPACE_SIZE, "host_regs: ");
>}
>
> @@ -6348,7 +6372,10 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba
> *hba,
> * Even though we use wait_event() which sleeps indefinitely,
> * the maximum wait time is bounded by %TM_CMD_TIMEOUT.
> */
> -  req = blk_get_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_RESERVED);
> +  req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
> +  if (IS_ERR(req))
> +  return PTR_ERR(req);
> +
>req->end_io_data = 
>free_slot = req->tag;
>WARN_ON_ONCE(free_slot < 0 || free_slot >= hba->nutmrs);


Re: [PATCH v4 2/2] scsi: ufs: handle LINERESET with correct tm_cmd

2021-01-07 Thread Jaegeuk Kim
On 01/07, Can Guo wrote:
> On 2021-01-07 15:47, Jaegeuk Kim wrote:
> > From: Jaegeuk Kim 
> > 
> > This fixes a warning caused by wrong reserve tag usage in
> > __ufshcd_issue_tm_cmd.
> > 
> > WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 blk_get_request+0x68/0x70
> > WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82
> > blk_mq_get_tag+0x438/0x46c
> > 
> > And, in ufshcd_err_handler(), we can avoid to send tm_cmd before
> > aborting
> > outstanding commands by waiting a bit for IO completion like this.
> > 
> > __ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out
> > 
> > Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to
> > allocate and free TMFs")
> > Fixes: 2355b66ed20c ("scsi: ufs: Handle LINERESET indication in err
> > handler")
> 
> Hi Jaegeuk,
> 
> Sorry, what is wrong with commit 2355b66ed20c? Clearing pending I/O
> reqs is a general procedure for handling all non-fatal errors.

Without waiting IOs, I hit the below timeout all the time from LINERESET, which
causes UFS stuck permanently, as mentioned in the description.

"__ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out"

> 
> Thanks,
> Can Guo.
> 
> > Signed-off-by: Jaegeuk Kim 
> > ---
> >  drivers/scsi/ufs/ufshcd.c | 35 +++
> >  1 file changed, 31 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> > index e6e7bdf99cd7..340dd5e515dd 100644
> > --- a/drivers/scsi/ufs/ufshcd.c
> > +++ b/drivers/scsi/ufs/ufshcd.c
> > @@ -44,6 +44,9 @@
> >  /* Query request timeout */
> >  #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */
> > 
> > +/* LINERESET TIME OUT */
> > +#define LINERESET_IO_TIMEOUT_MS(3) /* 30 sec */
> > +
> >  /* Task management command timeout */
> >  #define TM_CMD_TIMEOUT 100 /* msecs */
> > 
> > @@ -5826,6 +5829,7 @@ static void ufshcd_err_handler(struct work_struct
> > *work)
> > int err = 0, pmc_err;
> > int tag;
> > bool needs_reset = false, needs_restore = false;
> > +   ktime_t start;
> > 
> > hba = container_of(work, struct ufs_hba, eh_work);
> > 
> > @@ -5911,6 +5915,22 @@ static void ufshcd_err_handler(struct work_struct
> > *work)
> > }
> > 
> > hba->silence_err_logs = true;
> > +
> > +   /* Wait for IO completion for non-fatal errors to avoid aborting IOs
> > */
> > +   start = ktime_get();
> > +   while (hba->outstanding_reqs) {
> > +   ufshcd_complete_requests(hba);
> > +   spin_unlock_irqrestore(hba->host->host_lock, flags);
> > +   schedule();
> > +   spin_lock_irqsave(hba->host->host_lock, flags);
> > +   if (ktime_to_ms(ktime_sub(ktime_get(), start)) >
> > +   LINERESET_IO_TIMEOUT_MS) {
> > +   dev_err(hba->dev, "%s: timeout, outstanding=0x%lx\n",
> > +   __func__, hba->outstanding_reqs);
> > +   break;
> > +   }
> > +   }
> > +
> > /* release lock as clear command might sleep */
> > spin_unlock_irqrestore(hba->host->host_lock, flags);
> > /* Clear pending transfer requests */
> > @@ -6302,9 +6322,13 @@ static irqreturn_t ufshcd_intr(int irq, void
> > *__hba)
> > intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
> > }
> > 
> > -   if (enabled_intr_status && retval == IRQ_NONE) {
> > -   dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x\n",
> > -   __func__, intr_status);
> > +   if (enabled_intr_status && retval == IRQ_NONE &&
> > +   !ufshcd_eh_in_progress(hba)) {
> > +   dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x (0x%08x,
> > 0x%08x)\n",
> > +   __func__,
> > +   intr_status,
> > +   hba->ufs_stats.last_intr_status,
> > +   enabled_intr_status);
> > ufshcd_dump_regs(hba, 0, UFSHCI_REG_SPACE_SIZE, "host_regs: ");
> > }
> > 
> > @@ -6348,7 +6372,10 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba
> > *hba,
> >  * Even though we use wait_event() which sleeps indefinitely,
> >  * the maximum wait time is bounded by %TM_CMD_TIMEOUT.
> >  */
> > -   req = blk_get_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_RESERVED);
> > +   req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
> > +   if (IS_ERR(req))
> > +   return PTR_ERR(req);
> > +
> > req->end_io_data = 
> > free_slot = req->tag;
> > WARN_ON_ONCE(free_slot < 0 || free_slot >= hba->nutmrs);


Re: [PATCH v4 2/2] scsi: ufs: handle LINERESET with correct tm_cmd

2021-01-07 Thread Can Guo

On 2021-01-07 15:47, Jaegeuk Kim wrote:

From: Jaegeuk Kim 

This fixes a warning caused by wrong reserve tag usage in 
__ufshcd_issue_tm_cmd.


WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 
blk_get_request+0x68/0x70
WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82 
blk_mq_get_tag+0x438/0x46c


And, in ufshcd_err_handler(), we can avoid to send tm_cmd before 
aborting

outstanding commands by waiting a bit for IO completion like this.

__ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out

Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to
allocate and free TMFs")
Fixes: 2355b66ed20c ("scsi: ufs: Handle LINERESET indication in err 
handler")


Hi Jaegeuk,

Sorry, what is wrong with commit 2355b66ed20c? Clearing pending I/O
reqs is a general procedure for handling all non-fatal errors.

Thanks,
Can Guo.


Signed-off-by: Jaegeuk Kim 
---
 drivers/scsi/ufs/ufshcd.c | 35 +++
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index e6e7bdf99cd7..340dd5e515dd 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -44,6 +44,9 @@
 /* Query request timeout */
 #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */

+/* LINERESET TIME OUT */
+#define LINERESET_IO_TIMEOUT_MS(3) /* 30 sec */
+
 /* Task management command timeout */
 #define TM_CMD_TIMEOUT 100 /* msecs */

@@ -5826,6 +5829,7 @@ static void ufshcd_err_handler(struct work_struct 
*work)

int err = 0, pmc_err;
int tag;
bool needs_reset = false, needs_restore = false;
+   ktime_t start;

hba = container_of(work, struct ufs_hba, eh_work);

@@ -5911,6 +5915,22 @@ static void ufshcd_err_handler(struct 
work_struct *work)

}

hba->silence_err_logs = true;
+
+	/* Wait for IO completion for non-fatal errors to avoid aborting IOs 
*/

+   start = ktime_get();
+   while (hba->outstanding_reqs) {
+   ufshcd_complete_requests(hba);
+   spin_unlock_irqrestore(hba->host->host_lock, flags);
+   schedule();
+   spin_lock_irqsave(hba->host->host_lock, flags);
+   if (ktime_to_ms(ktime_sub(ktime_get(), start)) >
+   LINERESET_IO_TIMEOUT_MS) {
+   dev_err(hba->dev, "%s: timeout, outstanding=0x%lx\n",
+   __func__, hba->outstanding_reqs);
+   break;
+   }
+   }
+
/* release lock as clear command might sleep */
spin_unlock_irqrestore(hba->host->host_lock, flags);
/* Clear pending transfer requests */
@@ -6302,9 +6322,13 @@ static irqreturn_t ufshcd_intr(int irq, void 
*__hba)

intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
}

-   if (enabled_intr_status && retval == IRQ_NONE) {
-   dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x\n",
-   __func__, intr_status);
+   if (enabled_intr_status && retval == IRQ_NONE &&
+   !ufshcd_eh_in_progress(hba)) {
+		dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x (0x%08x, 
0x%08x)\n",

+   __func__,
+   intr_status,
+   hba->ufs_stats.last_intr_status,
+   enabled_intr_status);
ufshcd_dump_regs(hba, 0, UFSHCI_REG_SPACE_SIZE, "host_regs: ");
}

@@ -6348,7 +6372,10 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba 
*hba,

 * Even though we use wait_event() which sleeps indefinitely,
 * the maximum wait time is bounded by %TM_CMD_TIMEOUT.
 */
-   req = blk_get_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_RESERVED);
+   req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
+   if (IS_ERR(req))
+   return PTR_ERR(req);
+
req->end_io_data = 
free_slot = req->tag;
WARN_ON_ONCE(free_slot < 0 || free_slot >= hba->nutmrs);


[PATCH v4 2/2] scsi: ufs: handle LINERESET with correct tm_cmd

2021-01-06 Thread Jaegeuk Kim
From: Jaegeuk Kim 

This fixes a warning caused by wrong reserve tag usage in __ufshcd_issue_tm_cmd.

WARNING: CPU: 7 PID: 7 at block/blk-core.c:630 blk_get_request+0x68/0x70
WARNING: CPU: 4 PID: 157 at block/blk-mq-tag.c:82 blk_mq_get_tag+0x438/0x46c

And, in ufshcd_err_handler(), we can avoid to send tm_cmd before aborting
outstanding commands by waiting a bit for IO completion like this.

__ufshcd_issue_tm_cmd: task management cmd 0x80 timed-out

Fixes: 69a6c269c097 ("scsi: ufs: Use blk_{get,put}_request() to allocate and 
free TMFs")
Fixes: 2355b66ed20c ("scsi: ufs: Handle LINERESET indication in err handler")
Signed-off-by: Jaegeuk Kim 
---
 drivers/scsi/ufs/ufshcd.c | 35 +++
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index e6e7bdf99cd7..340dd5e515dd 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -44,6 +44,9 @@
 /* Query request timeout */
 #define QUERY_REQ_TIMEOUT 1500 /* 1.5 seconds */
 
+/* LINERESET TIME OUT */
+#define LINERESET_IO_TIMEOUT_MS(3) /* 30 sec */
+
 /* Task management command timeout */
 #define TM_CMD_TIMEOUT 100 /* msecs */
 
@@ -5826,6 +5829,7 @@ static void ufshcd_err_handler(struct work_struct *work)
int err = 0, pmc_err;
int tag;
bool needs_reset = false, needs_restore = false;
+   ktime_t start;
 
hba = container_of(work, struct ufs_hba, eh_work);
 
@@ -5911,6 +5915,22 @@ static void ufshcd_err_handler(struct work_struct *work)
}
 
hba->silence_err_logs = true;
+
+   /* Wait for IO completion for non-fatal errors to avoid aborting IOs */
+   start = ktime_get();
+   while (hba->outstanding_reqs) {
+   ufshcd_complete_requests(hba);
+   spin_unlock_irqrestore(hba->host->host_lock, flags);
+   schedule();
+   spin_lock_irqsave(hba->host->host_lock, flags);
+   if (ktime_to_ms(ktime_sub(ktime_get(), start)) >
+   LINERESET_IO_TIMEOUT_MS) {
+   dev_err(hba->dev, "%s: timeout, outstanding=0x%lx\n",
+   __func__, hba->outstanding_reqs);
+   break;
+   }
+   }
+
/* release lock as clear command might sleep */
spin_unlock_irqrestore(hba->host->host_lock, flags);
/* Clear pending transfer requests */
@@ -6302,9 +6322,13 @@ static irqreturn_t ufshcd_intr(int irq, void *__hba)
intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
}
 
-   if (enabled_intr_status && retval == IRQ_NONE) {
-   dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x\n",
-   __func__, intr_status);
+   if (enabled_intr_status && retval == IRQ_NONE &&
+   !ufshcd_eh_in_progress(hba)) {
+   dev_err(hba->dev, "%s: Unhandled interrupt 0x%08x (0x%08x, 
0x%08x)\n",
+   __func__,
+   intr_status,
+   hba->ufs_stats.last_intr_status,
+   enabled_intr_status);
ufshcd_dump_regs(hba, 0, UFSHCI_REG_SPACE_SIZE, "host_regs: ");
}
 
@@ -6348,7 +6372,10 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba,
 * Even though we use wait_event() which sleeps indefinitely,
 * the maximum wait time is bounded by %TM_CMD_TIMEOUT.
 */
-   req = blk_get_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_RESERVED);
+   req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
+   if (IS_ERR(req))
+   return PTR_ERR(req);
+
req->end_io_data = 
free_slot = req->tag;
WARN_ON_ONCE(free_slot < 0 || free_slot >= hba->nutmrs);
-- 
2.29.2.729.g45daf8777d-goog