Re: [PATCH v7 4/4] drm/msm: Extend gpu devcore dumps with pgtbl info

2024-08-26 Thread Akhil P Oommen
On Thu, Aug 22, 2024 at 04:15:24PM -0700, Rob Clark wrote:
> On Thu, Aug 22, 2024 at 1:34 PM Akhil P Oommen  
> wrote:
> >
> > On Tue, Aug 20, 2024 at 10:16:47AM -0700, Rob Clark wrote:
> > > From: Rob Clark
> > >
> > > In the case of iova fault triggered devcore dumps, include additional
> > > debug information based on what we think is the current page tables,
> > > including the TTBR0 value (which should match what we have in
> > > adreno_smmu_fault_info unless things have gone horribly wrong), and
> > > the pagetable entries traversed in the process of resolving the
> > > faulting iova.
> > >
> > > Signed-off-by: Rob Clark 
> > > ---
> > >  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 10 ++
> > >  drivers/gpu/drm/msm/msm_gpu.c   |  9 +
> > >  drivers/gpu/drm/msm/msm_gpu.h   |  8 
> > >  drivers/gpu/drm/msm/msm_iommu.c | 22 ++
> > >  drivers/gpu/drm/msm/msm_mmu.h   |  3 ++-
> > >  5 files changed, 51 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c 
> > > b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> > > index 1c6626747b98..3848b5a64351 100644
> > > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> > > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> > > @@ -864,6 +864,16 @@ void adreno_show(struct msm_gpu *gpu, struct 
> > > msm_gpu_state *state,
> > >   drm_printf(p, "  - dir=%s\n", info->flags & 
> > > IOMMU_FAULT_WRITE ? "WRITE" : "READ");
> > >   drm_printf(p, "  - type=%s\n", info->type);
> > >   drm_printf(p, "  - source=%s\n", info->block);
> > > +
> > > + /* Information extracted from what we think are the current
> > > +  * pgtables.  Hopefully the TTBR0 matches what we've 
> > > extracted
> > > +  * from the SMMU registers in smmu_info!
> > > +  */
> > > + drm_puts(p, "pgtable-fault-info:\n");
> > > + drm_printf(p, "  - ttbr0: %.16llx\n", 
> > > (u64)info->pgtbl_ttbr0);
> >
> > "0x" prefix? Otherwise, it is a bit confusing when the below one is
> > decimal.
> 
> mixed feelings, the extra 0x is annoying when pasting into calc which
> is a simple way to get binary decoding
> 
> OTOH none of this is machine decoded so I guess we could change it

On second thought, I think it is fine as this is an address. Probably,
it is helpful for the pte values below.

> 
> > > + drm_printf(p, "  - asid: %d\n", info->asid);
> > > + drm_printf(p, "  - ptes: %.16llx %.16llx %.16llx %.16llx\n",
> > > +info->ptes[0], info->ptes[1], info->ptes[2], 
> > > info->ptes[3]);
> >
> > Does crashdec decodes this?
> 
> No, it just passed thru for human eyeballs
> 
> crashdec _does_ have some logic to flag buffers that are "near" the
> faulting iova to help identify if the fault is an underflow/overflow
> (which has been, along with the pte trail, useful to debug some
> issues)

Alright.

Reviewed-by: Akhil P Oommen 

-Akhil.
> 
> BR,
> -R
> 
> > -Akhil.
> >
> > >   }
> > >
> > >   drm_printf(p, "rbbm-status: 0x%08x\n", state->rbbm_status);
> > > diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> > > index 3666b42b4ecd..bf2f8b2a7ccc 100644
> > > --- a/drivers/gpu/drm/msm/msm_gpu.c
> > > +++ b/drivers/gpu/drm/msm/msm_gpu.c
> > > @@ -281,6 +281,15 @@ static void msm_gpu_crashstate_capture(struct 
> > > msm_gpu *gpu,
> > >   if (submit) {
> > >   int i;
> > >
> > > + if (state->fault_info.ttbr0) {
> > > + struct msm_gpu_fault_info *info = 
> > > &state->fault_info;
> > > + struct msm_mmu *mmu = submit->aspace->mmu;
> > > +
> > > + msm_iommu_pagetable_params(mmu, &info->pgtbl_ttbr0,
> > > +&info->asid);
> > > + msm_iommu_pagetable_walk(mmu, info->iova, 
> > > info->ptes);
> > > + }
> > > +
> > >   state->bos = kcalloc(submit->nr_bos,
> > >   sizeof(struct msm_gpu_state_bo), GFP_KERNEL);
> > >
> > > diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> > > index 1f02bb9956be..82e838ba8c80 100644
> > > --- a/drivers/gpu/drm/msm/msm_gpu.h
> > > +++ b/drivers/gpu/drm/msm/msm_gpu.h
> > > @@ -101,6 +101,14 @@ struct msm_gpu_fault_info {
> > >   int flags;
> > >   const char *type;
> > >   const char *block;
> > > +
> > > + /* Information about what we think/expect is the current SMMU state,
> > > +  * for example expected_ttbr0 should match smmu_info.ttbr0 which
> > > +  * was read back from SMMU registers.
> > > +  */
> > > + phys_addr_t pgtbl_ttbr0;
> > > + u64 ptes[4];
> > > + int asid;
> > >  };
> > >
> > >  /**
> > > diff --git a/drivers/gpu/drm/msm/msm_iommu.c 
> > > b/drivers/gpu/drm/msm/msm_iommu.c
> > > index 2a94e82316f9..3e692818ba1f 100644
> > > --- a/drivers/gpu/d

Re: [PATCH v7 4/4] drm/msm: Extend gpu devcore dumps with pgtbl info

2024-08-22 Thread Rob Clark
On Thu, Aug 22, 2024 at 1:34 PM Akhil P Oommen  wrote:
>
> On Tue, Aug 20, 2024 at 10:16:47AM -0700, Rob Clark wrote:
> > From: Rob Clark
> >
> > In the case of iova fault triggered devcore dumps, include additional
> > debug information based on what we think is the current page tables,
> > including the TTBR0 value (which should match what we have in
> > adreno_smmu_fault_info unless things have gone horribly wrong), and
> > the pagetable entries traversed in the process of resolving the
> > faulting iova.
> >
> > Signed-off-by: Rob Clark 
> > ---
> >  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 10 ++
> >  drivers/gpu/drm/msm/msm_gpu.c   |  9 +
> >  drivers/gpu/drm/msm/msm_gpu.h   |  8 
> >  drivers/gpu/drm/msm/msm_iommu.c | 22 ++
> >  drivers/gpu/drm/msm/msm_mmu.h   |  3 ++-
> >  5 files changed, 51 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c 
> > b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> > index 1c6626747b98..3848b5a64351 100644
> > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> > @@ -864,6 +864,16 @@ void adreno_show(struct msm_gpu *gpu, struct 
> > msm_gpu_state *state,
> >   drm_printf(p, "  - dir=%s\n", info->flags & IOMMU_FAULT_WRITE 
> > ? "WRITE" : "READ");
> >   drm_printf(p, "  - type=%s\n", info->type);
> >   drm_printf(p, "  - source=%s\n", info->block);
> > +
> > + /* Information extracted from what we think are the current
> > +  * pgtables.  Hopefully the TTBR0 matches what we've extracted
> > +  * from the SMMU registers in smmu_info!
> > +  */
> > + drm_puts(p, "pgtable-fault-info:\n");
> > + drm_printf(p, "  - ttbr0: %.16llx\n", (u64)info->pgtbl_ttbr0);
>
> "0x" prefix? Otherwise, it is a bit confusing when the below one is
> decimal.

mixed feelings, the extra 0x is annoying when pasting into calc which
is a simple way to get binary decoding

OTOH none of this is machine decoded so I guess we could change it

> > + drm_printf(p, "  - asid: %d\n", info->asid);
> > + drm_printf(p, "  - ptes: %.16llx %.16llx %.16llx %.16llx\n",
> > +info->ptes[0], info->ptes[1], info->ptes[2], 
> > info->ptes[3]);
>
> Does crashdec decodes this?

No, it is just passed through for human eyeballs

crashdec _does_ have some logic to flag buffers that are "near" the
faulting iova to help identify if the fault is an underflow/overflow
(which has been, along with the pte trail, useful to debug some
issues)

BR,
-R

> -Akhil.
>
> >   }
> >
> >   drm_printf(p, "rbbm-status: 0x%08x\n", state->rbbm_status);
> > diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> > index 3666b42b4ecd..bf2f8b2a7ccc 100644
> > --- a/drivers/gpu/drm/msm/msm_gpu.c
> > +++ b/drivers/gpu/drm/msm/msm_gpu.c
> > @@ -281,6 +281,15 @@ static void msm_gpu_crashstate_capture(struct msm_gpu 
> > *gpu,
> >   if (submit) {
> >   int i;
> >
> > + if (state->fault_info.ttbr0) {
> > + struct msm_gpu_fault_info *info = &state->fault_info;
> > + struct msm_mmu *mmu = submit->aspace->mmu;
> > +
> > + msm_iommu_pagetable_params(mmu, &info->pgtbl_ttbr0,
> > +&info->asid);
> > + msm_iommu_pagetable_walk(mmu, info->iova, info->ptes);
> > + }
> > +
> >   state->bos = kcalloc(submit->nr_bos,
> >   sizeof(struct msm_gpu_state_bo), GFP_KERNEL);
> >
> > diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> > index 1f02bb9956be..82e838ba8c80 100644
> > --- a/drivers/gpu/drm/msm/msm_gpu.h
> > +++ b/drivers/gpu/drm/msm/msm_gpu.h
> > @@ -101,6 +101,14 @@ struct msm_gpu_fault_info {
> >   int flags;
> >   const char *type;
> >   const char *block;
> > +
> > + /* Information about what we think/expect is the current SMMU state,
> > +  * for example expected_ttbr0 should match smmu_info.ttbr0 which
> > +  * was read back from SMMU registers.
> > +  */
> > + phys_addr_t pgtbl_ttbr0;
> > + u64 ptes[4];
> > + int asid;
> >  };
> >
> >  /**
> > diff --git a/drivers/gpu/drm/msm/msm_iommu.c 
> > b/drivers/gpu/drm/msm/msm_iommu.c
> > index 2a94e82316f9..3e692818ba1f 100644
> > --- a/drivers/gpu/drm/msm/msm_iommu.c
> > +++ b/drivers/gpu/drm/msm/msm_iommu.c
> > @@ -195,6 +195,28 @@ struct iommu_domain_geometry 
> > *msm_iommu_get_geometry(struct msm_mmu *mmu)
> >   return &iommu->domain->geometry;
> >  }
> >
> > +int
> > +msm_iommu_pagetable_walk(struct msm_mmu *mmu, unsigned long iova, uint64_t 
> > ptes[4])
> > +{
> > + struct msm_iommu_pagetable *pagetable;
> > + struct arm_lpae_io_pgtable_walk_data wd = {};
> > +
> > + if (mmu->type != MSM_MMU_IOMMU

Re: [PATCH v7 4/4] drm/msm: Extend gpu devcore dumps with pgtbl info

2024-08-22 Thread Akhil P Oommen
On Tue, Aug 20, 2024 at 10:16:47AM -0700, Rob Clark wrote:
> From: Rob Clark
> 
> In the case of iova fault triggered devcore dumps, include additional
> debug information based on what we think is the current page tables,
> including the TTBR0 value (which should match what we have in
> adreno_smmu_fault_info unless things have gone horribly wrong), and
> the pagetable entries traversed in the process of resolving the
> faulting iova.
> 
> Signed-off-by: Rob Clark 
> ---
>  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 10 ++
>  drivers/gpu/drm/msm/msm_gpu.c   |  9 +
>  drivers/gpu/drm/msm/msm_gpu.h   |  8 
>  drivers/gpu/drm/msm/msm_iommu.c | 22 ++
>  drivers/gpu/drm/msm/msm_mmu.h   |  3 ++-
>  5 files changed, 51 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c 
> b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> index 1c6626747b98..3848b5a64351 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> @@ -864,6 +864,16 @@ void adreno_show(struct msm_gpu *gpu, struct 
> msm_gpu_state *state,
>   drm_printf(p, "  - dir=%s\n", info->flags & IOMMU_FAULT_WRITE ? 
> "WRITE" : "READ");
>   drm_printf(p, "  - type=%s\n", info->type);
>   drm_printf(p, "  - source=%s\n", info->block);
> +
> + /* Information extracted from what we think are the current
> +  * pgtables.  Hopefully the TTBR0 matches what we've extracted
> +  * from the SMMU registers in smmu_info!
> +  */
> + drm_puts(p, "pgtable-fault-info:\n");
> + drm_printf(p, "  - ttbr0: %.16llx\n", (u64)info->pgtbl_ttbr0);

"0x" prefix? Otherwise, it is a bit confusing when the below one is
decimal.

> + drm_printf(p, "  - asid: %d\n", info->asid);
> + drm_printf(p, "  - ptes: %.16llx %.16llx %.16llx %.16llx\n",
> +info->ptes[0], info->ptes[1], info->ptes[2], 
> info->ptes[3]);

Does crashdec decode this?

-Akhil.

>   }
>  
>   drm_printf(p, "rbbm-status: 0x%08x\n", state->rbbm_status);
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index 3666b42b4ecd..bf2f8b2a7ccc 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -281,6 +281,15 @@ static void msm_gpu_crashstate_capture(struct msm_gpu 
> *gpu,
>   if (submit) {
>   int i;
>  
> + if (state->fault_info.ttbr0) {
> + struct msm_gpu_fault_info *info = &state->fault_info;
> + struct msm_mmu *mmu = submit->aspace->mmu;
> +
> + msm_iommu_pagetable_params(mmu, &info->pgtbl_ttbr0,
> +&info->asid);
> + msm_iommu_pagetable_walk(mmu, info->iova, info->ptes);
> + }
> +
>   state->bos = kcalloc(submit->nr_bos,
>   sizeof(struct msm_gpu_state_bo), GFP_KERNEL);
>  
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 1f02bb9956be..82e838ba8c80 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -101,6 +101,14 @@ struct msm_gpu_fault_info {
>   int flags;
>   const char *type;
>   const char *block;
> +
> + /* Information about what we think/expect is the current SMMU state,
> +  * for example expected_ttbr0 should match smmu_info.ttbr0 which
> +  * was read back from SMMU registers.
> +  */
> + phys_addr_t pgtbl_ttbr0;
> + u64 ptes[4];
> + int asid;
>  };
>  
>  /**
> diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
> index 2a94e82316f9..3e692818ba1f 100644
> --- a/drivers/gpu/drm/msm/msm_iommu.c
> +++ b/drivers/gpu/drm/msm/msm_iommu.c
> @@ -195,6 +195,28 @@ struct iommu_domain_geometry 
> *msm_iommu_get_geometry(struct msm_mmu *mmu)
>   return &iommu->domain->geometry;
>  }
>  
> +int
> +msm_iommu_pagetable_walk(struct msm_mmu *mmu, unsigned long iova, uint64_t 
> ptes[4])
> +{
> + struct msm_iommu_pagetable *pagetable;
> + struct arm_lpae_io_pgtable_walk_data wd = {};
> +
> + if (mmu->type != MSM_MMU_IOMMU_PAGETABLE)
> + return -EINVAL;
> +
> + pagetable = to_pagetable(mmu);
> +
> + if (!pagetable->pgtbl_ops->pgtable_walk)
> + return -EINVAL;
> +
> + pagetable->pgtbl_ops->pgtable_walk(pagetable->pgtbl_ops, iova, &wd);
> +
> + for (int i = 0; i < ARRAY_SIZE(wd.ptes); i++)
> + ptes[i] = wd.ptes[i];
> +
> + return 0;
> +}
> +
>  static const struct msm_mmu_funcs pagetable_funcs = {
>   .map = msm_iommu_pagetable_map,
>   .unmap = msm_iommu_pagetable_unmap,
> diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
> index 88af4f490881..96e509bd96a6 100644
> --- a/drivers/gpu/drm/msm/msm_mmu.h
> +++ b/